@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 	if (!cpus)
 		return;
 
-	ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
 	ns->task_capacity =
 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
 		env->best_cpu = env->dst_cpu;
 }
 
-static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
-				long src_load, long dst_load,
+static bool load_too_imbalanced(long src_load, long dst_load,
 				struct task_numa_env *env)
 {
 	long imb, old_imb;
+	long orig_src_load, orig_dst_load;
+	long src_capacity, dst_capacity;
+
+	/*
+	 * The load is corrected for the CPU capacity available on each node.
+	 *
+	 * src_load        dst_load
+	 * ------------ vs ---------
+	 * src_capacity    dst_capacity
+	 */
+	src_capacity = env->src_stats.compute_capacity;
+	dst_capacity = env->dst_stats.compute_capacity;
 
 	/* We care about the slope of the imbalance, not the direction. */
 	if (dst_load < src_load)
 		swap(dst_load, src_load);
 
 	/* Is the difference below the threshold? */
-	imb = dst_load * 100 - src_load * env->imbalance_pct;
+	imb = dst_load * src_capacity * 100 -
+	      src_load * dst_capacity * env->imbalance_pct;
 	if (imb <= 0)
 		return false;
 
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
 	 * The imbalance is above the allowed threshold.
 	 * Compare it with the old imbalance.
 	 */
+	orig_src_load = env->src_stats.load;
+	orig_dst_load = env->dst_stats.load;
+
 	if (orig_dst_load < orig_src_load)
 		swap(orig_dst_load, orig_src_load);
 
-	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+	old_imb = orig_dst_load * src_capacity * 100 -
+		  orig_src_load * dst_capacity * env->imbalance_pct;
 
 	/* Would this change make things worse? */
 	return (imb > old_imb);
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
-	long orig_src_load, src_load;
-	long orig_dst_load, dst_load;
+	long src_load, dst_load;
 	long load;
-	long imp = (groupimp > 0) ? groupimp : taskimp;
+	long imp = env->p->numa_group ? groupimp : taskimp;
+	long moveimp = imp;
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
 			 * itself (not part of a group), use the task weight
 			 * instead.
 			 */
-			if (env->p->numa_group)
-				imp = groupimp;
-			else
-				imp = taskimp;
-
 			if (cur->numa_group)
 				imp += group_weight(cur, env->src_nid) -
 				       group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
 		}
 	}
 
-	if (imp < env->best_imp)
+	if (imp <= env->best_imp && moveimp <= env->best_imp)
 		goto unlock;
 
 	if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
 	}
 
 	/* Balance doesn't matter much if we're running a task per cpu */
-	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+	if (imp > env->best_imp && src_rq->nr_running == 1 &&
+	    dst_rq->nr_running == 1)
 		goto assign;
 
 	/*
 	 * In the overloaded case, try and keep the load balanced.
 	 */
 balance:
-	orig_dst_load = env->dst_stats.load;
-	orig_src_load = env->src_stats.load;
-
-	/* XXX missing capacity terms */
 	load = task_h_load(env->p);
-	dst_load = orig_dst_load + load;
-	src_load = orig_src_load - load;
+	dst_load = env->dst_stats.load + load;
+	src_load = env->src_stats.load - load;
+
+	if (moveimp > imp && moveimp > env->best_imp) {
+		/*
+		 * If the improvement from just moving env->p direction is
+		 * better than swapping tasks around, check if a move is
+		 * possible. Store a slightly smaller score than moveimp,
+		 * so an actually idle CPU will win.
+		 */
+		if (!load_too_imbalanced(src_load, dst_load, env)) {
+			imp = moveimp - 1;
+			cur = NULL;
+			goto assign;
+		}
+	}
+
+	if (imp <= env->best_imp)
+		goto unlock;
 
 	if (cur) {
 		load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
 		src_load += load;
 	}
 
-	if (load_too_imbalanced(orig_src_load, orig_dst_load,
-				src_load, dst_load, env))
+	if (load_too_imbalanced(src_load, dst_load, env))
 		goto unlock;
 
 assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
 	groupimp = group_weight(p, env.dst_nid) - groupweight;
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 
-	/* If the preferred nid has free capacity, try to use it. */
-	if (env.dst_stats.has_free_capacity)
-		task_numa_find_cpu(&env, taskimp, groupimp);
+	/* Try to find a spot on the preferred nid. */
+	task_numa_find_cpu(&env, taskimp, groupimp);
 
 	/* No space available on the preferred nid. Look elsewhere. */
 	if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
 		}
 	}
 
-	/* No better CPU than the current one was found. */
-	if (env.best_cpu == -1)
-		return -EAGAIN;
-
 	/*
 	 * If the task is part of a workload that spans multiple NUMA nodes,
 	 * and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
 	 * A task that migrated to a second choice node will be better off
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
-	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
-		sched_setnuma(p, env.dst_nid);
+	if (p->numa_group) {
+		if (env.best_cpu == -1)
+			nid = env.src_nid;
+		else
+			nid = env.dst_nid;
+
+		if (node_isset(nid, p->numa_group->active_nodes))
+			sched_setnuma(p, env.dst_nid);
+	}
+
+	/* No better CPU than the current one was found. */
+	if (env.best_cpu == -1)
+		return -EAGAIN;
 
 	/*
 	 * Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 /*
  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
  * increments. The more local the fault statistics are, the higher the scan
- * period will be for the next scan window. If local/remote ratio is below
- * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
- * scan period will decrease
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
  */
 #define NUMA_PERIOD_SLOTS 10
-#define NUMA_PERIOD_THRESHOLD 3
+#define NUMA_PERIOD_THRESHOLD 7
 
 /*
  * Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
 
 	if (p->numa_group) {
 		update_numa_active_node_mask(p->numa_group);
-		/*
-		 * If the preferred task and group nids are different,
-		 * iterate over the nodes again to find the best place.
-		 */
-		if (max_nid != max_group_nid) {
-			unsigned long weight, max_weight = 0;
-
-			for_each_online_node(nid) {
-				weight = task_weight(p, nid) + group_weight(p, nid);
-				if (weight > max_weight) {
-					max_weight = weight;
-					max_nid = nid;
-				}
-			}
-		}
-
 		spin_unlock_irq(group_lock);
+		max_nid = max_group_nid;
 	}
 
-	/* Preferred node as the node with the most faults */
-	if (max_faults && max_nid != p->numa_preferred_nid) {
-		/* Update the preferred nid and migrate task if possible */
-		sched_setnuma(p, max_nid);
-		numa_migrate_preferred(p);
+	if (max_faults) {
+		/* Set the new preferred node */
+		if (max_nid != p->numa_preferred_nid)
+			sched_setnuma(p, max_nid);
+
+		if (task_node(p) != p->numa_preferred_nid)
+			numa_migrate_preferred(p);
 	}
 }
 
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 	if (delta_exec > ideal_runtime) {
-		resched_task(rq_of(cfs_rq)->curr);
+		resched_curr(rq_of(cfs_rq));
 		/*
 		 * The current task ran long enough, ensure it doesn't get
 		 * re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;
 
 	if (delta > ideal_runtime)
-		resched_task(rq_of(cfs_rq)->curr);
+		resched_curr(rq_of(cfs_rq));
 }
 
 static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_task(rq_of(cfs_rq)->curr);
+		resched_curr(rq_of(cfs_rq));
 		return;
 	}
 	/*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	 * hierarchy can be throttled
 	 */
 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-		resched_task(rq_of(cfs_rq)->curr);
+		resched_curr(rq_of(cfs_rq));
 }
 
 static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
-	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	/*
+	 * Add to the _head_ of the list, so that an already-started
+	 * distribute_cfs_runtime will not see us
+	 */
+	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 	if (!cfs_b->timer_active)
 		__start_cfs_bandwidth(cfs_b, false);
 	raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/* determine whether we need to wake up potentially idle cpu */
 	if (rq->curr == rq->idle && rq->cfs.nr_running)
-		resched_task(rq->curr);
+		resched_curr(rq);
 }
 
 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
 		u64 remaining, u64 expires)
 {
 	struct cfs_rq *cfs_rq;
-	u64 runtime = remaining;
+	u64 runtime;
+	u64 starting_runtime = remaining;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
 	}
 	rcu_read_unlock();
 
-	return remaining;
+	return starting_runtime - remaining;
 }
 
 /*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	/* account preceding periods in which throttling occurred */
 	cfs_b->nr_throttled += overrun;
 
-	/*
-	 * There are throttled entities so we must first use the new bandwidth
-	 * to unthrottle them before making it generally available. This
-	 * ensures that all existing debts will be paid before a new cfs_rq is
-	 * allowed to run.
-	 */
-	runtime = cfs_b->runtime;
 	runtime_expires = cfs_b->runtime_expires;
-	cfs_b->runtime = 0;
 
 	/*
-	 * This check is repeated as we are holding onto the new bandwidth
-	 * while we unthrottle. This can potentially race with an unthrottled
-	 * group trying to acquire new bandwidth from the global pool.
+	 * This check is repeated as we are holding onto the new bandwidth while
+	 * we unthrottle. This can potentially race with an unthrottled group
+	 * trying to acquire new bandwidth from the global pool. This can result
+	 * in us over-using our runtime if it is all used during this loop, but
+	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && runtime > 0) {
+	while (throttled && cfs_b->runtime > 0) {
+		runtime = cfs_b->runtime;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 		raw_spin_lock(&cfs_b->lock);
 
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+		cfs_b->runtime -= min(runtime, cfs_b->runtime);
 	}
 
-	/* return (any) remaining runtime */
-	cfs_b->runtime = runtime;
 	/*
 	 * While we are ensured activity in the period following an
 	 * unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 		return;
 	}
 
-	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
 		runtime = cfs_b->runtime;
-		cfs_b->runtime = 0;
-	}
+
 	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
 
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
-		cfs_b->runtime = runtime;
+		cfs_b->runtime -= min(runtime, cfs_b->runtime);
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+		raw_spin_lock(&cfs_b->lock);
+		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+		raw_spin_unlock(&cfs_b->lock);
+	}
+}
+
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		 * there's some valid quota amount
 		 */
 		cfs_rq->runtime_remaining = 1;
+		/*
+		 * Offline rq is schedulable till cpu is completely disabled
+		 * in take_cpu_down(), so we prevent new cfs throttling here.
+		 */
+		cfs_rq->runtime_enabled = 0;
+
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 
 	if (delta < 0) {
 		if (rq->curr == p)
-			resched_task(p);
+			resched_curr(rq);
 		return;
 	}
 
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 preempt:
-	resched_task(curr);
+	resched_curr(rq);
 	/*
 	 * Only set the backward buddy when the current task is still
 	 * on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
 /*
  * Is this task likely cache-hot:
 */
-static int
-task_hot(struct task_struct *p, u64 now)
+static int task_hot(struct task_struct *p, struct lb_env *env)
 {
 	s64 delta;
 
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
 	/*
 	 * Buddy candidates are cache hot:
 	 */
-	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
 	    (&p->se == cfs_rq_of(&p->se)->next ||
 	     &p->se == cfs_rq_of(&p->se)->last))
 		return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
 	if (sysctl_sched_migration_cost == 0)
 		return 0;
 
-	delta = now - p->se.exec_start;
+	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
 
 	return delta < (s64)sysctl_sched_migration_cost;
 }
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 2) task is cache cold, or
 	 * 3) too many balance attempts have failed.
 	 */
-	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
+	tsk_cache_hot = task_hot(p, env);
 	if (!tsk_cache_hot)
 		tsk_cache_hot = migrate_degrades_locality(p, env);
 
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
  * @sgs: variable to hold the statistics for this group.
+ * @overload: Indicate more than one runnable task for any CPU.
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
-			int local_group, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs,
+			bool *overload)
 {
 	unsigned long load;
 	int i;
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		sgs->group_load += load;
 		sgs->sum_nr_running += rq->nr_running;
+
+		if (rq->nr_running > 1)
+			*overload = true;
+
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
+	bool overload = false;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 			update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+						&overload);
 
 		if (local_group)
 			goto next_group;
@@ -6049,6 +6091,13 @@ next_group:
 
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+	if (!env->sd->parent) {
+		/* update overload indicator if we are at root domain */
+		if (env->dst_rq->rd->overload != overload)
+			env->dst_rq->rd->overload = overload;
+	}
+
 }
 
 /**
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 
-	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+	    !this_rq->rd->overload) {
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
+
+	update_runtime_enabled(rq);
 }
 
 static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
-		resched_task(rq->curr);
+		resched_curr(rq);
 	}
 
 	se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 	 */
 	if (rq->curr == p) {
 		if (p->prio > oldprio)
-			resched_task(rq->curr);
+			resched_curr(rq);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 	 * if we can still preempt the current task.
 	 */
 	if (rq->curr == p)
-		resched_task(rq->curr);
+		resched_curr(rq);
 	else
 		check_preempt_curr(rq, p, 0);
 }
|