@@ -23,6 +23,7 @@
 #include <linux/latencytop.h>
 #include <linux/sched.h>
 #include <linux/cpumask.h>
+#include <linux/cpuidle.h>
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+static int select_idle_sibling(struct task_struct *p, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1038,7 +1040,8 @@ struct numa_stats {
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-	int cpu, cpus = 0;
+	int smt, cpu, cpus = 0;
+	unsigned long capacity;
 
 	memset(ns, 0, sizeof(*ns));
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1065,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 	if (!cpus)
 		return;
 
-	ns->task_capacity =
-		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+	capacity = cpus / smt; /* cores */
+
+	ns->task_capacity = min_t(unsigned, capacity,
+		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
 }
 
@@ -1206,7 +1213,7 @@ static void task_numa_compare(struct task_numa_env *env,
 
 	if (!cur) {
 		/* Is there capacity at our destination? */
-		if (env->src_stats.has_free_capacity &&
+		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
 		    !env->dst_stats.has_free_capacity)
 			goto unlock;
 
@@ -1252,6 +1259,13 @@ balance:
 	if (load_too_imbalanced(src_load, dst_load, env))
 		goto unlock;
 
+	/*
+	 * One idle CPU per node is evaluated for a task numa move.
+	 * Call select_idle_sibling to maybe find a better one.
+	 */
+	if (!cur)
+		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+
 assign:
 	task_numa_assign(env, cur, imp);
 unlock:
@@ -1775,7 +1789,7 @@ void task_numa_free(struct task_struct *p)
 		list_del(&p->numa_entry);
 		grp->nr_tasks--;
 		spin_unlock_irqrestore(&grp->lock, flags);
-		rcu_assign_pointer(p->numa_group, NULL);
+		RCU_INIT_POINTER(p->numa_group, NULL);
 		put_numa_group(grp);
 	}
 
@@ -1804,10 +1818,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	if (!p->mm)
 		return;
 
-	/* Do not worry about placement if exiting */
-	if (p->state == TASK_DEAD)
-		return;
-
 	/* Allocate buffer to track faults on a per-node basis */
 	if (unlikely(!p->numa_faults_memory)) {
 		int size = sizeof(*p->numa_faults_memory) *
@@ -2211,8 +2221,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
 
 	/*
 	 * As y^PERIOD = 1/2, we can combine
-	 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
-	 * With a look-up table which covers k^n (n<PERIOD)
+	 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+	 * With a look-up table which covers y^n (n<PERIOD)
 	 *
 	 * To achieve constant time decay_load.
 	 */
@@ -2377,6 +2387,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
 	tg_contrib -= cfs_rq->tg_load_contrib;
 
+	if (!tg_contrib)
+		return;
+
 	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
 		atomic_long_add(tg_contrib, &tg->load_avg);
 		cfs_rq->tg_load_contrib += tg_contrib;
@@ -3892,14 +3905,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 			resched_curr(rq);
 			return;
 		}
-
-		/*
-		 * Don't schedule slices shorter than 10000ns, that just
-		 * doesn't make sense. Rely on vruntime for fairness.
-		 */
-		if (rq->curr != p)
-			delta = max_t(s64, 10000LL, delta);
-
 		hrtick_start(rq, delta);
 	}
 }
@@ -4087,7 +4092,7 @@ static unsigned long capacity_of(int cpu)
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+	unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
 	unsigned long load_avg = rq->cfs.runnable_load_avg;
 
 	if (nr_running)
@@ -4276,8 +4281,8 @@ static int wake_wide(struct task_struct *p)
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
 	s64 this_load, load;
+	s64 this_eff_load, prev_eff_load;
 	int idx, this_cpu, prev_cpu;
-	unsigned long tl_per_task;
 	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
@@ -4320,47 +4325,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	if (this_load > 0) {
-		s64 this_eff_load, prev_eff_load;
+	this_eff_load = 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-		this_eff_load = 100;
-		this_eff_load *= capacity_of(prev_cpu);
+	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
+
+	if (this_load > 0) {
 		this_eff_load *= this_load +
 			effective_load(tg, this_cpu, weight, weight);
 
-		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-		prev_eff_load *= capacity_of(this_cpu);
 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+	}
 
-		balanced = this_eff_load <= prev_eff_load;
-	} else
-		balanced = true;
-
-	/*
-	 * If the currently running task will sleep within
-	 * a reasonable amount of time then attract this newly
-	 * woken task:
-	 */
-	if (sync && balanced)
-		return 1;
+	balanced = this_eff_load <= prev_eff_load;
 
 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
-	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (balanced ||
-	    (this_load <= load &&
-	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
-		/*
-		 * This domain has SD_WAKE_AFFINE and
-		 * p is cache cold in this domain, and
-		 * there is no bad imbalance.
-		 */
-		schedstat_inc(sd, ttwu_move_affine);
-		schedstat_inc(p, se.statistics.nr_wakeups_affine);
+	if (!balanced)
+		return 0;
 
-		return 1;
-	}
-	return 0;
+	schedstat_inc(sd, ttwu_move_affine);
+	schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+	return 1;
 }
 
 /*
@@ -4428,20 +4416,46 @@ static int
 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
 	unsigned long load, min_load = ULONG_MAX;
-	int idlest = -1;
+	unsigned int min_exit_latency = UINT_MAX;
+	u64 latest_idle_timestamp = 0;
+	int least_loaded_cpu = this_cpu;
+	int shallowest_idle_cpu = -1;
 	int i;
 
 	/* Traverse only the allowed CPUs */
 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
-		load = weighted_cpuload(i);
-
-		if (load < min_load || (load == min_load && i == this_cpu)) {
-			min_load = load;
-			idlest = i;
+		if (idle_cpu(i)) {
+			struct rq *rq = cpu_rq(i);
+			struct cpuidle_state *idle = idle_get_state(rq);
+			if (idle && idle->exit_latency < min_exit_latency) {
+				/*
+				 * We give priority to a CPU whose idle state
+				 * has the smallest exit latency irrespective
+				 * of any idle timestamp.
+				 */
+				min_exit_latency = idle->exit_latency;
+				latest_idle_timestamp = rq->idle_stamp;
+				shallowest_idle_cpu = i;
+			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
+				   rq->idle_stamp > latest_idle_timestamp) {
+				/*
+				 * If equal or no active idle state, then
+				 * the most recently idled CPU might have
+				 * a warmer cache.
+				 */
+				latest_idle_timestamp = rq->idle_stamp;
+				shallowest_idle_cpu = i;
+			}
+		} else {
+			load = weighted_cpuload(i);
+			if (load < min_load || (load == min_load && i == this_cpu)) {
+				min_load = load;
+				least_loaded_cpu = i;
+			}
 		}
 	}
 
-	return idlest;
+	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
 /*
@@ -4513,11 +4527,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	if (p->nr_cpus_allowed == 1)
 		return prev_cpu;
 
-	if (sd_flag & SD_BALANCE_WAKE) {
-		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
-			want_affine = 1;
-		new_cpu = prev_cpu;
-	}
+	if (sd_flag & SD_BALANCE_WAKE)
+		want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 
 	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
@@ -4704,7 +4715,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as move_task(), in which we
+	 * This is possible from callers such as attach_tasks(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle). This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -5112,20 +5123,9 @@ struct lb_env {
 	unsigned int loop_max;
 
 	enum fbq_type fbq_type;
+	struct list_head tasks;
 };
 
-/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
-	deactivate_task(env->src_rq, p, 0);
-	set_task_cpu(p, env->dst_cpu);
-	activate_task(env->dst_rq, p, 0);
-	check_preempt_curr(env->dst_rq, p, 0);
-}
-
 /*
  * Is this task likely cache-hot:
  */
@@ -5133,6 +5133,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 {
 	s64 delta;
 
+	lockdep_assert_held(&env->src_rq->lock);
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
@@ -5252,6 +5254,9 @@ static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -5310,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if (!tsk_cache_hot)
 		tsk_cache_hot = migrate_degrades_locality(p, env);
 
-	if (migrate_improves_locality(p, env)) {
-#ifdef CONFIG_SCHEDSTATS
+	if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 		if (tsk_cache_hot) {
 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
 			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
-#endif
-		return 1;
-	}
-
-	if (!tsk_cache_hot ||
-	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-
-		if (tsk_cache_hot) {
-			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
-			schedstat_inc(p, se.statistics.nr_forced_migrations);
-		}
-
 		return 1;
 	}
 
@@ -5336,47 +5329,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+	lockdep_assert_held(&env->src_rq->lock);
+
+	deactivate_task(env->src_rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, env->dst_cpu);
+}
+
+/*
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
  *
- * Called with both runqueues locked.
+ * Returns a task if successful and NULL otherwise.
  */
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
+	lockdep_assert_held(&env->src_rq->lock);
+
 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
 		if (!can_migrate_task(p, env))
 			continue;
 
-		move_task(p, env);
+		detach_task(p, env);
+
 		/*
-		 * Right now, this is only the second place move_task()
-		 * is called, so we can safely collect move_task()
-		 * stats here rather than inside move_task().
+		 * Right now, this is only the second place where
+		 * lb_gained[env->idle] is updated (other is detach_tasks)
+		 * so we can safely collect stats here rather than
+		 * inside detach_tasks().
		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
-		return 1;
+		return p;
 	}
-	return 0;
+	return NULL;
 }
 
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
  *
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
 */
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
-	int pulled = 0;
+	int detached = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
 
 	if (env->imbalance <= 0)
 		return 0;
@@ -5407,14 +5416,16 @@ static int move_tasks(struct lb_env *env)
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		move_task(p, env);
-		pulled++;
+		detach_task(p, env);
+		list_add(&p->se.group_node, &env->tasks);
+
+		detached++;
 		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
+		 * kernels will stop after the first task is detached to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5445,58 @@ next:
 	}
 
 	/*
-	 * Right now, this is one of only two places move_task() is called,
-	 * so we can safely collect move_task() stats here rather than
-	 * inside move_task().
+	 * Right now, this is one of only two places we collect this stat
+	 * so we can safely collect detach_one_task() stats here rather
+	 * than inside detach_one_task().
	 */
-	schedstat_add(env->sd, lb_gained[env->idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], detached);
+
+	return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_held(&rq->lock);
+
+	BUG_ON(task_rq(p) != rq);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	activate_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+	raw_spin_lock(&rq->lock);
+	attach_task(rq, p);
+	raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+	struct list_head *tasks = &env->tasks;
+	struct task_struct *p;
+
+	raw_spin_lock(&env->dst_rq->lock);
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		list_del_init(&p->se.group_node);
 
-	return pulled;
+		attach_task(env->dst_rq, p);
+	}
+
+	raw_spin_unlock(&env->dst_rq->lock);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5559,6 +5615,13 @@ static unsigned long task_h_load(struct task_struct *p)
 #endif
 
 /********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+	group_other = 0,
+	group_imbalanced,
+	group_overloaded,
+};
+
 /*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
@@ -5572,7 +5635,7 @@ struct sg_lb_stats {
 	unsigned int group_capacity_factor;
 	unsigned int idle_cpus;
 	unsigned int group_weight;
-	int group_imb; /* Is there an imbalance in the group ? */
+	enum group_type group_type;
 	int group_has_free_capacity;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
@@ -5610,6 +5673,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 		.total_capacity = 0UL,
 		.busiest_stat = {
 			.avg_load = 0UL,
+			.sum_nr_running = 0,
+			.group_type = group_other,
 		},
 	};
 }
@@ -5652,19 +5717,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 	return default_scale_capacity(sd, cpu);
 }
 
-static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	unsigned long weight = sd->span_weight;
-	unsigned long smt_gain = sd->smt_gain;
+	if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+		return sd->smt_gain / sd->span_weight;
 
-	smt_gain /= weight;
-
-	return smt_gain;
+	return SCHED_CAPACITY_SCALE;
 }
 
-unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	return default_scale_smt_capacity(sd, cpu);
+	return default_scale_cpu_capacity(sd, cpu);
 }
 
 static unsigned long scale_rt_capacity(int cpu)
@@ -5703,18 +5766,15 @@ static unsigned long scale_rt_capacity(int cpu)
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	unsigned long weight = sd->span_weight;
 	unsigned long capacity = SCHED_CAPACITY_SCALE;
 	struct sched_group *sdg = sd->groups;
 
-	if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
-		if (sched_feat(ARCH_CAPACITY))
-			capacity *= arch_scale_smt_capacity(sd, cpu);
-		else
-			capacity *= default_scale_smt_capacity(sd, cpu);
+	if (sched_feat(ARCH_CAPACITY))
+		capacity *= arch_scale_cpu_capacity(sd, cpu);
+	else
+		capacity *= default_scale_cpu_capacity(sd, cpu);
 
-		capacity >>= SCHED_CAPACITY_SHIFT;
-	}
+	capacity >>= SCHED_CAPACITY_SHIFT;
 
 	sdg->sgc->capacity_orig = capacity;
 
@@ -5891,6 +5951,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
 	return capacity_factor;
 }
 
+static enum group_type
+group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+{
+	if (sgs->sum_nr_running > sgs->group_capacity_factor)
+		return group_overloaded;
+
+	if (sg_imbalanced(group))
+		return group_imbalanced;
+
+	return group_other;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -5920,7 +5992,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			load = source_load(i, load_idx);
 
 		sgs->group_load += load;
-		sgs->sum_nr_running += rq->nr_running;
+		sgs->sum_nr_running += rq->cfs.h_nr_running;
 
 		if (rq->nr_running > 1)
 			*overload = true;
@@ -5942,9 +6014,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
 	sgs->group_weight = group->group_weight;
-
-	sgs->group_imb = sg_imbalanced(group);
 	sgs->group_capacity_factor = sg_capacity_factor(env, group);
+	sgs->group_type = group_classify(group, sgs);
 
 	if (sgs->group_capacity_factor > sgs->sum_nr_running)
 		sgs->group_has_free_capacity = 1;
@@ -5968,13 +6039,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 				   struct sched_group *sg,
 				   struct sg_lb_stats *sgs)
 {
-	if (sgs->avg_load <= sds->busiest_stat.avg_load)
-		return false;
+	struct sg_lb_stats *busiest = &sds->busiest_stat;
 
-	if (sgs->sum_nr_running > sgs->group_capacity_factor)
+	if (sgs->group_type > busiest->group_type)
 		return true;
 
-	if (sgs->group_imb)
+	if (sgs->group_type < busiest->group_type)
+		return false;
+
+	if (sgs->avg_load <= busiest->avg_load)
+		return false;
+
+	/* This is the busiest node in its class. */
+	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
 
 	/*
@@ -5982,8 +6059,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	 * numbered CPUs in the group, therefore mark all groups
 	 * higher than ourself as busy.
 	 */
-	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
-	    env->dst_cpu < group_first_cpu(sg)) {
+	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
 		if (!sds->busiest)
 			return true;
 
@@ -6228,7 +6304,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	local = &sds->local_stat;
 	busiest = &sds->busiest_stat;
 
-	if (busiest->group_imb) {
+	if (busiest->group_type == group_imbalanced) {
 		/*
 		 * In the group_imb case we cannot rely on group-wide averages
 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6324,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		return fix_small_imbalance(env, sds);
 	}
 
-	if (!busiest->group_imb) {
-		/*
-		 * Don't want to pull so many tasks that a group would go idle.
-		 * Except of course for the group_imb case, since then we might
-		 * have to drop below capacity to reach cpu-load equilibrium.
-		 */
+	/*
+	 * If there aren't any idle cpus, avoid creating some.
+	 */
+	if (busiest->group_type == group_overloaded &&
+	    local->group_type == group_overloaded) {
 		load_above_capacity =
 			(busiest->sum_nr_running - busiest->group_capacity_factor);
 
@@ -6337,7 +6412,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
-	if (busiest->group_imb)
+	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6346,7 +6421,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 		goto force_balance;
 
 	/*
-	 * If the local group is more busy than the selected busiest group
+	 * If the local group is busier than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
 	if (local->avg_load >= busiest->avg_load)
@@ -6361,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 
 	if (env->idle == CPU_IDLE) {
 		/*
-		 * This cpu is idle. If the busiest group load doesn't
-		 * have more tasks than the number of available cpu's and
-		 * there is no imbalance between this and busiest group
-		 * wrt to idle cpu's, it is balanced.
+		 * This cpu is idle. If the busiest group is not overloaded
+		 * and there is no imbalance between this and busiest group
+		 * wrt idle cpus, it is balanced. The imbalance becomes
+		 * significant if the diff is greater than 1 otherwise we
+		 * might end up to just move the imbalance on another group
 		 */
-		if ((local->idle_cpus < busiest->idle_cpus) &&
-		    busiest->sum_nr_running <= busiest->group_weight)
+		if ((busiest->group_type != group_overloaded) &&
+		    (local->idle_cpus <= (busiest->idle_cpus + 1)))
 			goto out_balanced;
 	} else {
 		/*
@@ -6550,6 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.loop_break = sched_nr_migrate_break,
 		.cpus = cpus,
 		.fbq_type = all,
+		.tasks = LIST_HEAD_INIT(env.tasks),
 	};
 
 	/*
@@ -6599,23 +6676,30 @@ redo:
 	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-	local_irq_save(flags);
-	double_rq_lock(env.dst_rq, busiest);
+	raw_spin_lock_irqsave(&busiest->lock, flags);
 
 	/*
 	 * cur_ld_moved - load moved in current iteration
 	 * ld_moved - cumulative load moved across iterations
 	 */
-	cur_ld_moved = move_tasks(&env);
-	ld_moved += cur_ld_moved;
-	double_rq_unlock(env.dst_rq, busiest);
-	local_irq_restore(flags);
+	cur_ld_moved = detach_tasks(&env);
 
 	/*
-	 * some other cpu did the load balance for us.
+	 * We've detached some tasks from busiest_rq. Every
+	 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+	 * unlock busiest->lock, and we are able to be sure
+	 * that nobody can manipulate the tasks in parallel.
+	 * See task_rq_lock() family for the details.
	 */
-	if (cur_ld_moved && env.dst_cpu != smp_processor_id())
-		resched_cpu(env.dst_cpu);
+
+	raw_spin_unlock(&busiest->lock);
+
+	if (cur_ld_moved) {
+		attach_tasks(&env);
+		ld_moved += cur_ld_moved;
+	}
+
+	local_irq_restore(flags);
 
 	if (env.flags & LBF_NEED_BREAK) {
 		env.flags &= ~LBF_NEED_BREAK;
@@ -6665,10 +6749,8 @@ more_balance:
 		if (sd_parent) {
 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
 
-			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
 				*group_imbalance = 1;
-			} else if (*group_imbalance)
-				*group_imbalance = 0;
 		}
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
@@ -6679,7 +6761,7 @@ more_balance:
 				env.loop_break = sched_nr_migrate_break;
 				goto redo;
 			}
-			goto out_balanced;
+			goto out_all_pinned;
 		}
 	}
 
@@ -6744,7 +6826,7 @@ more_balance:
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
-		 * move_tasks).
+		 * detach_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;
@@ -6753,6 +6835,23 @@ more_balance:
 	goto out;
 
 out_balanced:
+	/*
+	 * We reach balance although we may have faced some affinity
+	 * constraints. Clear the imbalance flag if it was set.
+	 */
+	if (sd_parent) {
+		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+		if (*group_imbalance)
+			*group_imbalance = 0;
+	}
+
+out_all_pinned:
+	/*
+	 * We reach balance because all tasks are pinned at this level so
+	 * we can't migrate them. Let the imbalance flag set so parent level
+	 * can try to migrate them.
+	 */
 	schedstat_inc(sd, lb_balanced[idle]);
 
 	sd->nr_balance_failed = 0;
@@ -6914,6 +7013,7 @@ static int active_load_balance_cpu_stop(void *data)
 	int target_cpu = busiest_rq->push_cpu;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
+	struct task_struct *p = NULL;
 
 	raw_spin_lock_irq(&busiest_rq->lock);
 
@@ -6933,9 +7033,6 @@ static int active_load_balance_cpu_stop(void *data)
 	 */
 	BUG_ON(busiest_rq == target_rq);
 
-	/* move a task from busiest_rq to target_rq */
-	double_lock_balance(busiest_rq, target_rq);
-
 	/* Search for an sd spanning us and the target CPU. */
 	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
@@ -6956,16 +7053,22 @@ static int active_load_balance_cpu_stop(void *data)
 
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(&env))
+		p = detach_one_task(&env);
+		if (p)
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
 	}
 	rcu_read_unlock();
-	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
-	raw_spin_unlock_irq(&busiest_rq->lock);
+	raw_spin_unlock(&busiest_rq->lock);
+
+	if (p)
+		attach_one_task(target_rq, p);
+
+	local_irq_enable();
+
 	return 0;
 }
 
@@ -7465,7 +7568,7 @@ static void task_fork_fair(struct task_struct *p)
 static void
 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->se.on_rq)
+	if (!task_on_rq_queued(p))
 		return;
 
 	/*
@@ -7490,11 +7593,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	 * switched back to the fair class the enqueue_entity(.flags=0) will
 	 * do the right thing.
 	 *
-	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
-	 * have normalized the vruntime, if it's !on_rq, then only when
+	 * If it's queued, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it's !queued, then only when
 	 * the task is sleeping will it still have non-normalized vruntime.
 	 */
-	if (!p->on_rq && p->state != TASK_RUNNING) {
+	if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
 		/*
 		 * Fix up our vruntime so that the current sleep doesn't
 		 * cause 'unlimited' sleep bonus.
@@ -7521,15 +7624,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
-	struct sched_entity *se = &p->se;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *se = &p->se;
 	/*
 	 * Since the real-depth could have been changed (only FAIR
 	 * class maintain depth value), reset depth properly.
 	 */
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
-	if (!se->on_rq)
+	if (!task_on_rq_queued(p))
 		return;
 
 	/*
@@ -7575,7 +7678,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq;
@@ -7594,7 +7697,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * fair sleeper stuff for the first placement, but who cares.
 	 */
 	/*
-	 * When !on_rq, vruntime of the task has usually NOT been normalized.
+	 * When !queued, vruntime of the task has usually NOT been normalized.
 	 * But there are some cases where it has already been normalized:
 	 *
 	 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7708,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
-		on_rq = 1;
+	if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+		queued = 1;
 
-	if (!on_rq)
+	if (!queued)
 		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
-	if (!on_rq) {
+	if (!queued) {
 		cfs_rq = cfs_rq_of(se);
 		se->vruntime += cfs_rq->min_vruntime;
 #ifdef CONFIG_SMP