|
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
|
|
|
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
|
|
|
#endif
|
|
|
|
|
|
+/*
|
|
|
+ * The margin used when comparing utilization with CPU capacity:
|
|
|
+ * util * 1024 < capacity * margin
|
|
|
+ */
|
|
|
+unsigned int capacity_margin = 1280; /* ~20% */
|
|
|
+
|
|
|
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
|
|
|
{
|
|
|
lw->weight += inc;
|
|
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
|
|
|
|
|
|
static inline struct task_struct *task_of(struct sched_entity *se)
|
|
|
{
|
|
|
-#ifdef CONFIG_SCHED_DEBUG
|
|
|
- WARN_ON_ONCE(!entity_is_task(se));
|
|
|
-#endif
|
|
|
+ SCHED_WARN_ON(!entity_is_task(se));
|
|
|
return container_of(se, struct task_struct, se);
|
|
|
}
|
|
|
|
|
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
|
|
|
|
|
|
static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
|
+
|
|
|
u64 vruntime = cfs_rq->min_vruntime;
|
|
|
|
|
|
- if (cfs_rq->curr)
|
|
|
- vruntime = cfs_rq->curr->vruntime;
|
|
|
+ if (curr) {
|
|
|
+ if (curr->on_rq)
|
|
|
+ vruntime = curr->vruntime;
|
|
|
+ else
|
|
|
+ curr = NULL;
|
|
|
+ }
|
|
|
|
|
|
if (cfs_rq->rb_leftmost) {
|
|
|
struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
|
|
|
struct sched_entity,
|
|
|
run_node);
|
|
|
|
|
|
- if (!cfs_rq->curr)
|
|
|
+ if (!curr)
|
|
|
vruntime = se->vruntime;
|
|
|
else
|
|
|
vruntime = min_vruntime(vruntime, se->vruntime);
|
|
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
-static int select_idle_sibling(struct task_struct *p, int cpu);
|
|
|
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
|
|
|
static unsigned long task_h_load(struct task_struct *p);
|
|
|
|
|
|
/*
|
|
@@ -726,7 +736,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
|
|
|
struct sched_avg *sa = &se->avg;
|
|
|
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
|
|
|
u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
- int tg_update;
|
|
|
|
|
|
if (cap > 0) {
|
|
|
if (cfs_rq->avg.util_avg != 0) {
|
|
@@ -759,10 +768,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
|
+ update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
|
attach_entity_load_avg(cfs_rq, se);
|
|
|
- if (tg_update)
|
|
|
- update_tg_load_avg(cfs_rq, false);
|
|
|
+ update_tg_load_avg(cfs_rq, false);
|
|
|
}
|
|
|
|
|
|
#else /* !CONFIG_SMP */
|
|
@@ -799,7 +807,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
|
|
|
max(delta_exec, curr->statistics.exec_max));
|
|
|
|
|
|
curr->sum_exec_runtime += delta_exec;
|
|
|
- schedstat_add(cfs_rq, exec_clock, delta_exec);
|
|
|
+ schedstat_add(cfs_rq->exec_clock, delta_exec);
|
|
|
|
|
|
curr->vruntime += calc_delta_fair(delta_exec, curr);
|
|
|
update_min_vruntime(cfs_rq);
|
|
@@ -820,26 +828,34 @@ static void update_curr_fair(struct rq *rq)
|
|
|
update_curr(cfs_rq_of(&rq->curr->se));
|
|
|
}
|
|
|
|
|
|
-#ifdef CONFIG_SCHEDSTATS
|
|
|
static inline void
|
|
|
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
- u64 wait_start = rq_clock(rq_of(cfs_rq));
|
|
|
+ u64 wait_start, prev_wait_start;
|
|
|
+
|
|
|
+ if (!schedstat_enabled())
|
|
|
+ return;
|
|
|
+
|
|
|
+ wait_start = rq_clock(rq_of(cfs_rq));
|
|
|
+ prev_wait_start = schedstat_val(se->statistics.wait_start);
|
|
|
|
|
|
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
|
|
|
- likely(wait_start > se->statistics.wait_start))
|
|
|
- wait_start -= se->statistics.wait_start;
|
|
|
+ likely(wait_start > prev_wait_start))
|
|
|
+ wait_start -= prev_wait_start;
|
|
|
|
|
|
- se->statistics.wait_start = wait_start;
|
|
|
+ schedstat_set(se->statistics.wait_start, wait_start);
|
|
|
}
|
|
|
|
|
|
-static void
|
|
|
+static inline void
|
|
|
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
struct task_struct *p;
|
|
|
u64 delta;
|
|
|
|
|
|
- delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
|
|
|
+ if (!schedstat_enabled())
|
|
|
+ return;
|
|
|
+
|
|
|
+ delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
|
|
|
|
|
|
if (entity_is_task(se)) {
|
|
|
p = task_of(se);
|
|
@@ -849,35 +865,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
* time stamp can be adjusted to accumulate wait time
|
|
|
* prior to migration.
|
|
|
*/
|
|
|
- se->statistics.wait_start = delta;
|
|
|
+ schedstat_set(se->statistics.wait_start, delta);
|
|
|
return;
|
|
|
}
|
|
|
trace_sched_stat_wait(p, delta);
|
|
|
}
|
|
|
|
|
|
- se->statistics.wait_max = max(se->statistics.wait_max, delta);
|
|
|
- se->statistics.wait_count++;
|
|
|
- se->statistics.wait_sum += delta;
|
|
|
- se->statistics.wait_start = 0;
|
|
|
+ schedstat_set(se->statistics.wait_max,
|
|
|
+ max(schedstat_val(se->statistics.wait_max), delta));
|
|
|
+ schedstat_inc(se->statistics.wait_count);
|
|
|
+ schedstat_add(se->statistics.wait_sum, delta);
|
|
|
+ schedstat_set(se->statistics.wait_start, 0);
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ struct task_struct *tsk = NULL;
|
|
|
+ u64 sleep_start, block_start;
|
|
|
+
|
|
|
+ if (!schedstat_enabled())
|
|
|
+ return;
|
|
|
+
|
|
|
+ sleep_start = schedstat_val(se->statistics.sleep_start);
|
|
|
+ block_start = schedstat_val(se->statistics.block_start);
|
|
|
+
|
|
|
+ if (entity_is_task(se))
|
|
|
+ tsk = task_of(se);
|
|
|
+
|
|
|
+ if (sleep_start) {
|
|
|
+ u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
|
|
|
+
|
|
|
+ if ((s64)delta < 0)
|
|
|
+ delta = 0;
|
|
|
+
|
|
|
+ if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
|
|
|
+ schedstat_set(se->statistics.sleep_max, delta);
|
|
|
+
|
|
|
+ schedstat_set(se->statistics.sleep_start, 0);
|
|
|
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
|
|
|
+
|
|
|
+ if (tsk) {
|
|
|
+ account_scheduler_latency(tsk, delta >> 10, 1);
|
|
|
+ trace_sched_stat_sleep(tsk, delta);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (block_start) {
|
|
|
+ u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
|
|
|
+
|
|
|
+ if ((s64)delta < 0)
|
|
|
+ delta = 0;
|
|
|
+
|
|
|
+ if (unlikely(delta > schedstat_val(se->statistics.block_max)))
|
|
|
+ schedstat_set(se->statistics.block_max, delta);
|
|
|
+
|
|
|
+ schedstat_set(se->statistics.block_start, 0);
|
|
|
+ schedstat_add(se->statistics.sum_sleep_runtime, delta);
|
|
|
+
|
|
|
+ if (tsk) {
|
|
|
+ if (tsk->in_iowait) {
|
|
|
+ schedstat_add(se->statistics.iowait_sum, delta);
|
|
|
+ schedstat_inc(se->statistics.iowait_count);
|
|
|
+ trace_sched_stat_iowait(tsk, delta);
|
|
|
+ }
|
|
|
+
|
|
|
+ trace_sched_stat_blocked(tsk, delta);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Blocking time is in units of nanosecs, so shift by
|
|
|
+ * 20 to get a milliseconds-range estimation of the
|
|
|
+ * amount of time that the task spent sleeping:
|
|
|
+ */
|
|
|
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
|
|
|
+ profile_hits(SLEEP_PROFILING,
|
|
|
+ (void *)get_wchan(tsk),
|
|
|
+ delta >> 20);
|
|
|
+ }
|
|
|
+ account_scheduler_latency(tsk, delta >> 10, 0);
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
* Task is being enqueued - update stats:
|
|
|
*/
|
|
|
static inline void
|
|
|
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
{
|
|
|
+ if (!schedstat_enabled())
|
|
|
+ return;
|
|
|
+
|
|
|
/*
|
|
|
* Are we enqueueing a waiting task? (for current tasks
|
|
|
* a dequeue/enqueue event is a NOP)
|
|
|
*/
|
|
|
if (se != cfs_rq->curr)
|
|
|
update_stats_wait_start(cfs_rq, se);
|
|
|
+
|
|
|
+ if (flags & ENQUEUE_WAKEUP)
|
|
|
+ update_stats_enqueue_sleeper(cfs_rq, se);
|
|
|
}
|
|
|
|
|
|
static inline void
|
|
|
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
{
|
|
|
+
|
|
|
+ if (!schedstat_enabled())
|
|
|
+ return;
|
|
|
+
|
|
|
/*
|
|
|
* Mark the end of the wait period if dequeueing a
|
|
|
* waiting task:
|
|
@@ -885,40 +980,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
if (se != cfs_rq->curr)
|
|
|
update_stats_wait_end(cfs_rq, se);
|
|
|
|
|
|
- if (flags & DEQUEUE_SLEEP) {
|
|
|
- if (entity_is_task(se)) {
|
|
|
- struct task_struct *tsk = task_of(se);
|
|
|
+ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
|
|
|
+ struct task_struct *tsk = task_of(se);
|
|
|
|
|
|
- if (tsk->state & TASK_INTERRUPTIBLE)
|
|
|
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
|
|
|
- if (tsk->state & TASK_UNINTERRUPTIBLE)
|
|
|
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
|
|
|
- }
|
|
|
+ if (tsk->state & TASK_INTERRUPTIBLE)
|
|
|
+ schedstat_set(se->statistics.sleep_start,
|
|
|
+ rq_clock(rq_of(cfs_rq)));
|
|
|
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
|
|
|
+ schedstat_set(se->statistics.block_start,
|
|
|
+ rq_clock(rq_of(cfs_rq)));
|
|
|
}
|
|
|
-
|
|
|
-}
|
|
|
-#else
|
|
|
-static inline void
|
|
|
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
-{
|
|
|
-}
|
|
|
-
|
|
|
-static inline void
|
|
|
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
-{
|
|
|
-}
|
|
|
-
|
|
|
-static inline void
|
|
|
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
-{
|
|
|
}
|
|
|
|
|
|
-static inline void
|
|
|
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
-{
|
|
|
-}
|
|
|
-#endif
|
|
|
-
|
|
|
/*
|
|
|
* We are picking a new current task - update its stats:
|
|
|
*/
|
|
@@ -1513,8 +1586,16 @@ balance:
|
|
|
* One idle CPU per node is evaluated for a task numa move.
|
|
|
* Call select_idle_sibling to maybe find a better one.
|
|
|
*/
|
|
|
- if (!cur)
|
|
|
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
|
|
|
+ if (!cur) {
|
|
|
+ /*
|
|
|
+ * select_idle_siblings() uses an per-cpu cpumask that
|
|
|
+ * can be used from IRQ context.
|
|
|
+ */
|
|
|
+ local_irq_disable();
|
|
|
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
|
|
|
+ env->dst_cpu);
|
|
|
+ local_irq_enable();
|
|
|
+ }
|
|
|
|
|
|
assign:
|
|
|
task_numa_assign(env, cur, imp);
|
|
@@ -2292,7 +2373,7 @@ void task_numa_work(struct callback_head *work)
|
|
|
unsigned long nr_pte_updates = 0;
|
|
|
long pages, virtpages;
|
|
|
|
|
|
- WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
|
|
|
+ SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
|
|
|
|
|
|
work->next = work; /* protect against double add */
|
|
|
/*
|
|
@@ -2803,9 +2884,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
-/*
|
|
|
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
|
|
|
- * and effective_load (which is not done because it is too costly).
|
|
|
+/**
|
|
|
+ * update_tg_load_avg - update the tg's load avg
|
|
|
+ * @cfs_rq: the cfs_rq whose avg changed
|
|
|
+ * @force: update regardless of how small the difference
|
|
|
+ *
|
|
|
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
|
|
|
+ * However, because tg->load_avg is a global value there are performance
|
|
|
+ * considerations.
|
|
|
+ *
|
|
|
+ * In order to avoid having to look at the other cfs_rq's, we use a
|
|
|
+ * differential update where we store the last value we propagated. This in
|
|
|
+ * turn allows skipping updates if the differential is 'small'.
|
|
|
+ *
|
|
|
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
|
|
|
+ * done) and effective_load() (which is not done because it is too costly).
|
|
|
*/
|
|
|
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
|
|
|
{
|
|
@@ -2925,10 +3018,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
|
|
|
*
|
|
|
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
|
|
|
*
|
|
|
- * Returns true if the load decayed or we removed utilization. It is expected
|
|
|
- * that one calls update_tg_load_avg() on this condition, but after you've
|
|
|
- * modified the cfs_rq avg (attach/detach), such that we propagate the new
|
|
|
- * avg up.
|
|
|
+ * Returns true if the load decayed or we removed load.
|
|
|
+ *
|
|
|
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
|
|
|
+ * call update_tg_load_avg() when this function returns true.
|
|
|
*/
|
|
|
static inline int
|
|
|
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
|
|
@@ -3174,68 +3267,6 @@ static inline int idle_balance(struct rq *rq)
|
|
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
|
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
-{
|
|
|
-#ifdef CONFIG_SCHEDSTATS
|
|
|
- struct task_struct *tsk = NULL;
|
|
|
-
|
|
|
- if (entity_is_task(se))
|
|
|
- tsk = task_of(se);
|
|
|
-
|
|
|
- if (se->statistics.sleep_start) {
|
|
|
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
|
|
|
-
|
|
|
- if ((s64)delta < 0)
|
|
|
- delta = 0;
|
|
|
-
|
|
|
- if (unlikely(delta > se->statistics.sleep_max))
|
|
|
- se->statistics.sleep_max = delta;
|
|
|
-
|
|
|
- se->statistics.sleep_start = 0;
|
|
|
- se->statistics.sum_sleep_runtime += delta;
|
|
|
-
|
|
|
- if (tsk) {
|
|
|
- account_scheduler_latency(tsk, delta >> 10, 1);
|
|
|
- trace_sched_stat_sleep(tsk, delta);
|
|
|
- }
|
|
|
- }
|
|
|
- if (se->statistics.block_start) {
|
|
|
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
|
|
|
-
|
|
|
- if ((s64)delta < 0)
|
|
|
- delta = 0;
|
|
|
-
|
|
|
- if (unlikely(delta > se->statistics.block_max))
|
|
|
- se->statistics.block_max = delta;
|
|
|
-
|
|
|
- se->statistics.block_start = 0;
|
|
|
- se->statistics.sum_sleep_runtime += delta;
|
|
|
-
|
|
|
- if (tsk) {
|
|
|
- if (tsk->in_iowait) {
|
|
|
- se->statistics.iowait_sum += delta;
|
|
|
- se->statistics.iowait_count++;
|
|
|
- trace_sched_stat_iowait(tsk, delta);
|
|
|
- }
|
|
|
-
|
|
|
- trace_sched_stat_blocked(tsk, delta);
|
|
|
-
|
|
|
- /*
|
|
|
- * Blocking time is in units of nanosecs, so shift by
|
|
|
- * 20 to get a milliseconds-range estimation of the
|
|
|
- * amount of time that the task spent sleeping:
|
|
|
- */
|
|
|
- if (unlikely(prof_on == SLEEP_PROFILING)) {
|
|
|
- profile_hits(SLEEP_PROFILING,
|
|
|
- (void *)get_wchan(tsk),
|
|
|
- delta >> 20);
|
|
|
- }
|
|
|
- account_scheduler_latency(tsk, delta >> 10, 0);
|
|
|
- }
|
|
|
- }
|
|
|
-#endif
|
|
|
-}
|
|
|
-
|
|
|
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
@@ -3245,7 +3276,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
d = -d;
|
|
|
|
|
|
if (d > 3*sysctl_sched_latency)
|
|
|
- schedstat_inc(cfs_rq, nr_spread_over);
|
|
|
+ schedstat_inc(cfs_rq->nr_spread_over);
|
|
|
#endif
|
|
|
}
|
|
|
|
|
@@ -3362,17 +3393,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
account_entity_enqueue(cfs_rq, se);
|
|
|
update_cfs_shares(cfs_rq);
|
|
|
|
|
|
- if (flags & ENQUEUE_WAKEUP) {
|
|
|
+ if (flags & ENQUEUE_WAKEUP)
|
|
|
place_entity(cfs_rq, se, 0);
|
|
|
- if (schedstat_enabled())
|
|
|
- enqueue_sleeper(cfs_rq, se);
|
|
|
- }
|
|
|
|
|
|
check_schedstat_required();
|
|
|
- if (schedstat_enabled()) {
|
|
|
- update_stats_enqueue(cfs_rq, se);
|
|
|
- check_spread(cfs_rq, se);
|
|
|
- }
|
|
|
+ update_stats_enqueue(cfs_rq, se, flags);
|
|
|
+ check_spread(cfs_rq, se);
|
|
|
if (!curr)
|
|
|
__enqueue_entity(cfs_rq, se);
|
|
|
se->on_rq = 1;
|
|
@@ -3439,8 +3465,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
update_curr(cfs_rq);
|
|
|
dequeue_entity_load_avg(cfs_rq, se);
|
|
|
|
|
|
- if (schedstat_enabled())
|
|
|
- update_stats_dequeue(cfs_rq, se, flags);
|
|
|
+ update_stats_dequeue(cfs_rq, se, flags);
|
|
|
|
|
|
clear_buddies(cfs_rq, se);
|
|
|
|
|
@@ -3450,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
account_entity_dequeue(cfs_rq, se);
|
|
|
|
|
|
/*
|
|
|
- * Normalize the entity after updating the min_vruntime because the
|
|
|
- * update can refer to the ->curr item and we need to reflect this
|
|
|
- * movement in our normalized position.
|
|
|
+ * Normalize after update_curr(); which will also have moved
|
|
|
+ * min_vruntime if @se is the one holding it back. But before doing
|
|
|
+ * update_min_vruntime() again, which will discount @se's position and
|
|
|
+ * can move min_vruntime forward still more.
|
|
|
*/
|
|
|
if (!(flags & DEQUEUE_SLEEP))
|
|
|
se->vruntime -= cfs_rq->min_vruntime;
|
|
@@ -3460,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
/* return excess runtime on last dequeue */
|
|
|
return_cfs_rq_runtime(cfs_rq);
|
|
|
|
|
|
- update_min_vruntime(cfs_rq);
|
|
|
update_cfs_shares(cfs_rq);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Now advance min_vruntime if @se was the entity holding it back,
|
|
|
+ * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
|
|
|
+ * put back on, and if we advance min_vruntime, we'll be placed back
|
|
|
+ * further than we started -- ie. we'll be penalized.
|
|
|
+ */
|
|
|
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
|
|
|
+ update_min_vruntime(cfs_rq);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -3514,25 +3548,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
* a CPU. So account for the time it spent waiting on the
|
|
|
* runqueue.
|
|
|
*/
|
|
|
- if (schedstat_enabled())
|
|
|
- update_stats_wait_end(cfs_rq, se);
|
|
|
+ update_stats_wait_end(cfs_rq, se);
|
|
|
__dequeue_entity(cfs_rq, se);
|
|
|
update_load_avg(se, 1);
|
|
|
}
|
|
|
|
|
|
update_stats_curr_start(cfs_rq, se);
|
|
|
cfs_rq->curr = se;
|
|
|
-#ifdef CONFIG_SCHEDSTATS
|
|
|
+
|
|
|
/*
|
|
|
* Track our maximum slice length, if the CPU's load is at
|
|
|
* least twice that of our own weight (i.e. dont track it
|
|
|
* when there are only lesser-weight tasks around):
|
|
|
*/
|
|
|
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
|
|
|
- se->statistics.slice_max = max(se->statistics.slice_max,
|
|
|
- se->sum_exec_runtime - se->prev_sum_exec_runtime);
|
|
|
+ schedstat_set(se->statistics.slice_max,
|
|
|
+ max((u64)schedstat_val(se->statistics.slice_max),
|
|
|
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
|
|
|
}
|
|
|
-#endif
|
|
|
+
|
|
|
se->prev_sum_exec_runtime = se->sum_exec_runtime;
|
|
|
}
|
|
|
|
|
@@ -3611,13 +3645,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
|
|
/* throttle cfs_rqs exceeding runtime */
|
|
|
check_cfs_rq_runtime(cfs_rq);
|
|
|
|
|
|
- if (schedstat_enabled()) {
|
|
|
- check_spread(cfs_rq, prev);
|
|
|
- if (prev->on_rq)
|
|
|
- update_stats_wait_start(cfs_rq, prev);
|
|
|
- }
|
|
|
+ check_spread(cfs_rq, prev);
|
|
|
|
|
|
if (prev->on_rq) {
|
|
|
+ update_stats_wait_start(cfs_rq, prev);
|
|
|
/* Put 'current' back into the tree. */
|
|
|
__enqueue_entity(cfs_rq, prev);
|
|
|
/* in !on_rq case, update occurred at dequeue */
|
|
@@ -4447,9 +4478,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
|
|
|
struct sched_entity *se = &p->se;
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
|
|
|
- WARN_ON(task_rq(p) != rq);
|
|
|
+ SCHED_WARN_ON(task_rq(p) != rq);
|
|
|
|
|
|
- if (cfs_rq->nr_running > 1) {
|
|
|
+ if (rq->cfs.h_nr_running > 1) {
|
|
|
u64 slice = sched_slice(cfs_rq, se);
|
|
|
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
|
|
|
s64 delta = slice - ran;
|
|
@@ -4604,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
+
|
|
|
+/* Working cpumask for: load_balance, load_balance_newidle. */
|
|
|
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
|
|
|
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
|
|
|
+
|
|
|
#ifdef CONFIG_NO_HZ_COMMON
|
|
|
/*
|
|
|
* per rq 'load' arrray crap; XXX kill this.
|
|
@@ -5005,9 +5041,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
|
|
* wl = S * s'_i; see (2)
|
|
|
*/
|
|
|
if (W > 0 && w < W)
|
|
|
- wl = (w * (long)tg->shares) / W;
|
|
|
+ wl = (w * (long)scale_load_down(tg->shares)) / W;
|
|
|
else
|
|
|
- wl = tg->shares;
|
|
|
+ wl = scale_load_down(tg->shares);
|
|
|
|
|
|
/*
|
|
|
* Per the above, wl is the new se->load.weight value; since
|
|
@@ -5090,18 +5126,18 @@ static int wake_wide(struct task_struct *p)
|
|
|
return 1;
|
|
|
}
|
|
|
|
|
|
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
|
|
|
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
|
|
|
+ int prev_cpu, int sync)
|
|
|
{
|
|
|
s64 this_load, load;
|
|
|
s64 this_eff_load, prev_eff_load;
|
|
|
- int idx, this_cpu, prev_cpu;
|
|
|
+ int idx, this_cpu;
|
|
|
struct task_group *tg;
|
|
|
unsigned long weight;
|
|
|
int balanced;
|
|
|
|
|
|
idx = sd->wake_idx;
|
|
|
this_cpu = smp_processor_id();
|
|
|
- prev_cpu = task_cpu(p);
|
|
|
load = source_load(prev_cpu, idx);
|
|
|
this_load = target_load(this_cpu, idx);
|
|
|
|
|
@@ -5145,13 +5181,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
|
|
|
|
|
|
balanced = this_eff_load <= prev_eff_load;
|
|
|
|
|
|
- schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
|
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
|
|
|
|
|
|
if (!balanced)
|
|
|
return 0;
|
|
|
|
|
|
- schedstat_inc(sd, ttwu_move_affine);
|
|
|
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
|
|
|
+ schedstat_inc(sd->ttwu_move_affine);
|
|
|
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
|
|
|
|
|
|
return 1;
|
|
|
}
|
|
@@ -5227,6 +5263,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
|
int shallowest_idle_cpu = -1;
|
|
|
int i;
|
|
|
|
|
|
+ /* Check if we have any choice: */
|
|
|
+ if (group->group_weight == 1)
|
|
|
+ return cpumask_first(sched_group_cpus(group));
|
|
|
+
|
|
|
/* Traverse only the allowed CPUs */
|
|
|
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
|
|
|
if (idle_cpu(i)) {
|
|
@@ -5264,64 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * Try and locate an idle CPU in the sched_domain.
|
|
|
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
|
|
|
+ * (@start), and wraps around.
|
|
|
+ *
|
|
|
+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
|
|
|
+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
|
|
|
+ * through the LLC domain.
|
|
|
+ *
|
|
|
+ * Especially tbench is found sensitive to this.
|
|
|
+ */
|
|
|
+
|
|
|
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
|
|
|
+{
|
|
|
+ int next;
|
|
|
+
|
|
|
+again:
|
|
|
+ next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
|
|
|
+
|
|
|
+ if (*wrapped) {
|
|
|
+ if (next >= start)
|
|
|
+ return nr_cpumask_bits;
|
|
|
+ } else {
|
|
|
+ if (next >= nr_cpumask_bits) {
|
|
|
+ *wrapped = 1;
|
|
|
+ n = -1;
|
|
|
+ goto again;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return next;
|
|
|
+}
|
|
|
+
|
|
|
+#define for_each_cpu_wrap(cpu, mask, start, wrap) \
|
|
|
+ for ((wrap) = 0, (cpu) = (start)-1; \
|
|
|
+ (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
|
|
|
+ (cpu) < nr_cpumask_bits; )
|
|
|
+
|
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
|
+
|
|
|
+static inline void set_idle_cores(int cpu, int val)
|
|
|
+{
|
|
|
+ struct sched_domain_shared *sds;
|
|
|
+
|
|
|
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
|
|
|
+ if (sds)
|
|
|
+ WRITE_ONCE(sds->has_idle_cores, val);
|
|
|
+}
|
|
|
+
|
|
|
+static inline bool test_idle_cores(int cpu, bool def)
|
|
|
+{
|
|
|
+ struct sched_domain_shared *sds;
|
|
|
+
|
|
|
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
|
|
|
+ if (sds)
|
|
|
+ return READ_ONCE(sds->has_idle_cores);
|
|
|
+
|
|
|
+ return def;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Scans the local SMT mask to see if the entire core is idle, and records this
|
|
|
+ * information in sd_llc_shared->has_idle_cores.
|
|
|
+ *
|
|
|
+ * Since SMT siblings share all cache levels, inspecting this limited remote
|
|
|
+ * state should be fairly cheap.
|
|
|
*/
|
|
|
-static int select_idle_sibling(struct task_struct *p, int target)
|
|
|
+void __update_idle_core(struct rq *rq)
|
|
|
+{
|
|
|
+ int core = cpu_of(rq);
|
|
|
+ int cpu;
|
|
|
+
|
|
|
+ rcu_read_lock();
|
|
|
+ if (test_idle_cores(core, true))
|
|
|
+ goto unlock;
|
|
|
+
|
|
|
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
|
|
|
+ if (cpu == core)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ if (!idle_cpu(cpu))
|
|
|
+ goto unlock;
|
|
|
+ }
|
|
|
+
|
|
|
+ set_idle_cores(core, 1);
|
|
|
+unlock:
|
|
|
+ rcu_read_unlock();
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
|
|
|
+ * there are no idle cores left in the system; tracked through
|
|
|
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
|
|
|
+ */
|
|
|
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
|
|
|
+{
|
|
|
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
|
|
|
+ int core, cpu, wrap;
|
|
|
+
|
|
|
+ if (!static_branch_likely(&sched_smt_present))
|
|
|
+ return -1;
|
|
|
+
|
|
|
+ if (!test_idle_cores(target, false))
|
|
|
+ return -1;
|
|
|
+
|
|
|
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
|
|
|
+
|
|
|
+ for_each_cpu_wrap(core, cpus, target, wrap) {
|
|
|
+ bool idle = true;
|
|
|
+
|
|
|
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
|
|
|
+ cpumask_clear_cpu(cpu, cpus);
|
|
|
+ if (!idle_cpu(cpu))
|
|
|
+ idle = false;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (idle)
|
|
|
+ return core;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Failed to find an idle core; stop looking for one.
|
|
|
+ */
|
|
|
+ set_idle_cores(target, 0);
|
|
|
+
|
|
|
+ return -1;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Scan the local SMT mask for idle CPUs.
|
|
|
+ */
|
|
|
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
|
|
|
+{
|
|
|
+ int cpu;
|
|
|
+
|
|
|
+ if (!static_branch_likely(&sched_smt_present))
|
|
|
+ return -1;
|
|
|
+
|
|
|
+ for_each_cpu(cpu, cpu_smt_mask(target)) {
|
|
|
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
|
|
|
+ continue;
|
|
|
+ if (idle_cpu(cpu))
|
|
|
+ return cpu;
|
|
|
+ }
|
|
|
+
|
|
|
+ return -1;
|
|
|
+}
|
|
|
+
|
|
|
+#else /* CONFIG_SCHED_SMT */
|
|
|
+
|
|
|
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
|
|
|
+{
|
|
|
+ return -1;
|
|
|
+}
|
|
|
+
|
|
|
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
|
|
|
+{
|
|
|
+ return -1;
|
|
|
+}
|
|
|
+
|
|
|
+#endif /* CONFIG_SCHED_SMT */
|
|
|
+
|
|
|
+/*
|
|
|
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
|
|
|
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
|
|
|
+ * average idle time for this rq (as found in rq->avg_idle).
|
|
|
+ */
|
|
|
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
|
|
|
+{
|
|
|
+ struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
|
|
|
+ u64 avg_idle = this_rq()->avg_idle;
|
|
|
+ u64 avg_cost = this_sd->avg_scan_cost;
|
|
|
+ u64 time, cost;
|
|
|
+ s64 delta;
|
|
|
+ int cpu, wrap;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Due to large variance we need a large fuzz factor; hackbench in
|
|
|
+ * particularly is sensitive here.
|
|
|
+ */
|
|
|
+ if ((avg_idle / 512) < avg_cost)
|
|
|
+ return -1;
|
|
|
+
|
|
|
+ time = local_clock();
|
|
|
+
|
|
|
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
|
|
|
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
|
|
|
+ continue;
|
|
|
+ if (idle_cpu(cpu))
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ time = local_clock() - time;
|
|
|
+ cost = this_sd->avg_scan_cost;
|
|
|
+ delta = (s64)(time - cost) / 8;
|
|
|
+ this_sd->avg_scan_cost += delta;
|
|
|
+
|
|
|
+ return cpu;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Try and locate an idle core/thread in the LLC cache domain.
|
|
|
+ */
|
|
|
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
|
|
{
|
|
|
struct sched_domain *sd;
|
|
|
- struct sched_group *sg;
|
|
|
- int i = task_cpu(p);
|
|
|
+ int i;
|
|
|
|
|
|
if (idle_cpu(target))
|
|
|
return target;
|
|
|
|
|
|
/*
|
|
|
- * If the prevous cpu is cache affine and idle, don't be stupid.
|
|
|
+ * If the previous cpu is cache affine and idle, don't be stupid.
|
|
|
*/
|
|
|
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
|
|
|
- return i;
|
|
|
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
|
|
|
+ return prev;
|
|
|
|
|
|
- /*
|
|
|
- * Otherwise, iterate the domains and find an eligible idle cpu.
|
|
|
- *
|
|
|
- * A completely idle sched group at higher domains is more
|
|
|
- * desirable than an idle group at a lower level, because lower
|
|
|
- * domains have smaller groups and usually share hardware
|
|
|
- * resources which causes tasks to contend on them, e.g. x86
|
|
|
- * hyperthread siblings in the lowest domain (SMT) can contend
|
|
|
- * on the shared cpu pipeline.
|
|
|
- *
|
|
|
- * However, while we prefer idle groups at higher domains
|
|
|
- * finding an idle cpu at the lowest domain is still better than
|
|
|
- * returning 'target', which we've already established, isn't
|
|
|
- * idle.
|
|
|
- */
|
|
|
sd = rcu_dereference(per_cpu(sd_llc, target));
|
|
|
- for_each_lower_domain(sd) {
|
|
|
- sg = sd->groups;
|
|
|
- do {
|
|
|
- if (!cpumask_intersects(sched_group_cpus(sg),
|
|
|
- tsk_cpus_allowed(p)))
|
|
|
- goto next;
|
|
|
-
|
|
|
- /* Ensure the entire group is idle */
|
|
|
- for_each_cpu(i, sched_group_cpus(sg)) {
|
|
|
- if (i == target || !idle_cpu(i))
|
|
|
- goto next;
|
|
|
- }
|
|
|
+ if (!sd)
|
|
|
+ return target;
|
|
|
+
|
|
|
+ i = select_idle_core(p, sd, target);
|
|
|
+ if ((unsigned)i < nr_cpumask_bits)
|
|
|
+ return i;
|
|
|
+
|
|
|
+ i = select_idle_cpu(p, sd, target);
|
|
|
+ if ((unsigned)i < nr_cpumask_bits)
|
|
|
+ return i;
|
|
|
+
|
|
|
+ i = select_idle_smt(p, sd, target);
|
|
|
+ if ((unsigned)i < nr_cpumask_bits)
|
|
|
+ return i;
|
|
|
|
|
|
- /*
|
|
|
- * It doesn't matter which cpu we pick, the
|
|
|
- * whole group is idle.
|
|
|
- */
|
|
|
- target = cpumask_first_and(sched_group_cpus(sg),
|
|
|
- tsk_cpus_allowed(p));
|
|
|
- goto done;
|
|
|
-next:
|
|
|
- sg = sg->next;
|
|
|
- } while (sg != sd->groups);
|
|
|
- }
|
|
|
-done:
|
|
|
return target;
|
|
|
}
|
|
|
|
|
@@ -5359,6 +5572,32 @@ static int cpu_util(int cpu)
|
|
|
return (util >= capacity) ? capacity : util;
|
|
|
}
|
|
|
|
|
|
+static inline int task_util(struct task_struct *p)
|
|
|
+{
|
|
|
+ return p->se.avg.util_avg;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
|
|
|
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
|
|
|
+ *
|
|
|
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
|
|
|
+ * BALANCE_WAKE sort things out.
|
|
|
+ */
|
|
|
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
|
|
|
+{
|
|
|
+ long min_cap, max_cap;
|
|
|
+
|
|
|
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
|
|
|
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
|
|
|
+
|
|
|
+ /* Minimum capacity is close to max, no need to abort wake_affine */
|
|
|
+ if (max_cap - min_cap < max_cap >> 3)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ return min_cap * 1024 < task_util(p) * capacity_margin;
|
|
|
+}
|
|
|
+
|
|
|
/*
|
|
|
* select_task_rq_fair: Select target runqueue for the waking task in domains
|
|
|
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
|
|
@@ -5382,7 +5621,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|
|
|
|
|
if (sd_flag & SD_BALANCE_WAKE) {
|
|
|
record_wakee(p);
|
|
|
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
|
|
|
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
|
|
|
+ && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
|
|
|
}
|
|
|
|
|
|
rcu_read_lock();
|
|
@@ -5408,13 +5648,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|
|
|
|
|
if (affine_sd) {
|
|
|
sd = NULL; /* Prefer wake_affine over balance flags */
|
|
|
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
|
|
|
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
|
|
|
new_cpu = cpu;
|
|
|
}
|
|
|
|
|
|
if (!sd) {
|
|
|
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
|
|
|
- new_cpu = select_idle_sibling(p, new_cpu);
|
|
|
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
|
|
|
|
|
} else while (sd) {
|
|
|
struct sched_group *group;
|
|
@@ -5938,7 +6178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
|
|
|
*
|
|
|
* The adjacency matrix of the resulting graph is given by:
|
|
|
*
|
|
|
- * log_2 n
|
|
|
+ * log_2 n
|
|
|
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
|
|
|
* k = 0
|
|
|
*
|
|
@@ -5984,7 +6224,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
|
|
|
*
|
|
|
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
|
|
|
* rewrite all of this once again.]
|
|
|
- */
|
|
|
+ */
|
|
|
|
|
|
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
|
|
|
|
|
@@ -6132,7 +6372,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
|
|
|
int cpu;
|
|
|
|
|
|
- schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
|
|
|
+ schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
|
|
|
|
|
|
env->flags |= LBF_SOME_PINNED;
|
|
|
|
|
@@ -6163,7 +6403,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
env->flags &= ~LBF_ALL_PINNED;
|
|
|
|
|
|
if (task_running(env->src_rq, p)) {
|
|
|
- schedstat_inc(p, se.statistics.nr_failed_migrations_running);
|
|
|
+ schedstat_inc(p->se.statistics.nr_failed_migrations_running);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
@@ -6180,13 +6420,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
if (tsk_cache_hot <= 0 ||
|
|
|
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
|
|
|
if (tsk_cache_hot == 1) {
|
|
|
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
|
|
|
- schedstat_inc(p, se.statistics.nr_forced_migrations);
|
|
|
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
|
|
|
+ schedstat_inc(p->se.statistics.nr_forced_migrations);
|
|
|
}
|
|
|
return 1;
|
|
|
}
|
|
|
|
|
|
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
|
|
|
+ schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
@@ -6226,7 +6466,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
|
|
|
* so we can safely collect stats here rather than
|
|
|
* inside detach_tasks().
|
|
|
*/
|
|
|
- schedstat_inc(env->sd, lb_gained[env->idle]);
|
|
|
+ schedstat_inc(env->sd->lb_gained[env->idle]);
|
|
|
return p;
|
|
|
}
|
|
|
return NULL;
|
|
@@ -6318,7 +6558,7 @@ next:
|
|
|
* so we can safely collect detach_one_task() stats here rather
|
|
|
* than inside detach_one_task().
|
|
|
*/
|
|
|
- schedstat_add(env->sd, lb_gained[env->idle], detached);
|
|
|
+ schedstat_add(env->sd->lb_gained[env->idle], detached);
|
|
|
|
|
|
return detached;
|
|
|
}
|
|
@@ -6646,7 +6886,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
|
|
|
/*
|
|
|
* !SD_OVERLAP domains can assume that child groups
|
|
|
* span the current group.
|
|
|
- */
|
|
|
+ */
|
|
|
|
|
|
group = child->groups;
|
|
|
do {
|
|
@@ -7146,7 +7386,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
|
|
|
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
|
|
|
if (load_above_capacity > busiest->group_capacity) {
|
|
|
load_above_capacity -= busiest->group_capacity;
|
|
|
- load_above_capacity *= NICE_0_LOAD;
|
|
|
+ load_above_capacity *= scale_load_down(NICE_0_LOAD);
|
|
|
load_above_capacity /= busiest->group_capacity;
|
|
|
} else
|
|
|
load_above_capacity = ~0UL;
|
|
@@ -7353,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
|
|
|
*/
|
|
|
#define MAX_PINNED_INTERVAL 512
|
|
|
|
|
|
-/* Working cpumask for load_balance and load_balance_newidle. */
|
|
|
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
|
|
|
-
|
|
|
static int need_active_balance(struct lb_env *env)
|
|
|
{
|
|
|
struct sched_domain *sd = env->sd;
|
|
@@ -7459,7 +7696,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
|
|
|
|
|
|
cpumask_copy(cpus, cpu_active_mask);
|
|
|
|
|
|
- schedstat_inc(sd, lb_count[idle]);
|
|
|
+ schedstat_inc(sd->lb_count[idle]);
|
|
|
|
|
|
redo:
|
|
|
if (!should_we_balance(&env)) {
|
|
@@ -7469,19 +7706,19 @@ redo:
|
|
|
|
|
|
group = find_busiest_group(&env);
|
|
|
if (!group) {
|
|
|
- schedstat_inc(sd, lb_nobusyg[idle]);
|
|
|
+ schedstat_inc(sd->lb_nobusyg[idle]);
|
|
|
goto out_balanced;
|
|
|
}
|
|
|
|
|
|
busiest = find_busiest_queue(&env, group);
|
|
|
if (!busiest) {
|
|
|
- schedstat_inc(sd, lb_nobusyq[idle]);
|
|
|
+ schedstat_inc(sd->lb_nobusyq[idle]);
|
|
|
goto out_balanced;
|
|
|
}
|
|
|
|
|
|
BUG_ON(busiest == env.dst_rq);
|
|
|
|
|
|
- schedstat_add(sd, lb_imbalance[idle], env.imbalance);
|
|
|
+ schedstat_add(sd->lb_imbalance[idle], env.imbalance);
|
|
|
|
|
|
env.src_cpu = busiest->cpu;
|
|
|
env.src_rq = busiest;
|
|
@@ -7588,7 +7825,7 @@ more_balance:
|
|
|
}
|
|
|
|
|
|
if (!ld_moved) {
|
|
|
- schedstat_inc(sd, lb_failed[idle]);
|
|
|
+ schedstat_inc(sd->lb_failed[idle]);
|
|
|
/*
|
|
|
* Increment the failure counter only on periodic balance.
|
|
|
* We do not want newidle balance, which can be very
|
|
@@ -7671,7 +7908,7 @@ out_all_pinned:
|
|
|
* we can't migrate them. Let the imbalance flag set so parent level
|
|
|
* can try to migrate them.
|
|
|
*/
|
|
|
- schedstat_inc(sd, lb_balanced[idle]);
|
|
|
+ schedstat_inc(sd->lb_balanced[idle]);
|
|
|
|
|
|
sd->nr_balance_failed = 0;
|
|
|
|
|
@@ -7703,11 +7940,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
|
|
|
}
|
|
|
|
|
|
static inline void
|
|
|
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
|
|
|
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
|
|
|
{
|
|
|
unsigned long interval, next;
|
|
|
|
|
|
- interval = get_sd_balance_interval(sd, cpu_busy);
|
|
|
+ /* used by idle balance, so cpu_busy = 0 */
|
|
|
+ interval = get_sd_balance_interval(sd, 0);
|
|
|
next = sd->last_balance + interval;
|
|
|
|
|
|
if (time_after(*next_balance, next))
|
|
@@ -7737,7 +7975,7 @@ static int idle_balance(struct rq *this_rq)
|
|
|
rcu_read_lock();
|
|
|
sd = rcu_dereference_check_sched_domain(this_rq->sd);
|
|
|
if (sd)
|
|
|
- update_next_balance(sd, 0, &next_balance);
|
|
|
+ update_next_balance(sd, &next_balance);
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
goto out;
|
|
@@ -7755,7 +7993,7 @@ static int idle_balance(struct rq *this_rq)
|
|
|
continue;
|
|
|
|
|
|
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
|
|
|
- update_next_balance(sd, 0, &next_balance);
|
|
|
+ update_next_balance(sd, &next_balance);
|
|
|
break;
|
|
|
}
|
|
|
|
|
@@ -7773,7 +8011,7 @@ static int idle_balance(struct rq *this_rq)
|
|
|
curr_cost += domain_cost;
|
|
|
}
|
|
|
|
|
|
- update_next_balance(sd, 0, &next_balance);
|
|
|
+ update_next_balance(sd, &next_balance);
|
|
|
|
|
|
/*
|
|
|
* Stop searching for tasks to pull if there are
|
|
@@ -7863,15 +8101,15 @@ static int active_load_balance_cpu_stop(void *data)
|
|
|
.idle = CPU_IDLE,
|
|
|
};
|
|
|
|
|
|
- schedstat_inc(sd, alb_count);
|
|
|
+ schedstat_inc(sd->alb_count);
|
|
|
|
|
|
p = detach_one_task(&env);
|
|
|
if (p) {
|
|
|
- schedstat_inc(sd, alb_pushed);
|
|
|
+ schedstat_inc(sd->alb_pushed);
|
|
|
/* Active balancing done, reset the failure counter. */
|
|
|
sd->nr_balance_failed = 0;
|
|
|
} else {
|
|
|
- schedstat_inc(sd, alb_failed);
|
|
|
+ schedstat_inc(sd->alb_failed);
|
|
|
}
|
|
|
}
|
|
|
rcu_read_unlock();
|
|
@@ -7963,13 +8201,13 @@ static inline void set_cpu_sd_state_busy(void)
|
|
|
int cpu = smp_processor_id();
|
|
|
|
|
|
rcu_read_lock();
|
|
|
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
|
|
|
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
|
|
|
|
|
|
if (!sd || !sd->nohz_idle)
|
|
|
goto unlock;
|
|
|
sd->nohz_idle = 0;
|
|
|
|
|
|
- atomic_inc(&sd->groups->sgc->nr_busy_cpus);
|
|
|
+ atomic_inc(&sd->shared->nr_busy_cpus);
|
|
|
unlock:
|
|
|
rcu_read_unlock();
|
|
|
}
|
|
@@ -7980,13 +8218,13 @@ void set_cpu_sd_state_idle(void)
|
|
|
int cpu = smp_processor_id();
|
|
|
|
|
|
rcu_read_lock();
|
|
|
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
|
|
|
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
|
|
|
|
|
|
if (!sd || sd->nohz_idle)
|
|
|
goto unlock;
|
|
|
sd->nohz_idle = 1;
|
|
|
|
|
|
- atomic_dec(&sd->groups->sgc->nr_busy_cpus);
|
|
|
+ atomic_dec(&sd->shared->nr_busy_cpus);
|
|
|
unlock:
|
|
|
rcu_read_unlock();
|
|
|
}
|
|
@@ -8213,8 +8451,8 @@ end:
|
|
|
static inline bool nohz_kick_needed(struct rq *rq)
|
|
|
{
|
|
|
unsigned long now = jiffies;
|
|
|
+ struct sched_domain_shared *sds;
|
|
|
struct sched_domain *sd;
|
|
|
- struct sched_group_capacity *sgc;
|
|
|
int nr_busy, cpu = rq->cpu;
|
|
|
bool kick = false;
|
|
|
|
|
@@ -8242,11 +8480,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
|
|
|
return true;
|
|
|
|
|
|
rcu_read_lock();
|
|
|
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
|
|
|
- if (sd) {
|
|
|
- sgc = sd->groups->sgc;
|
|
|
- nr_busy = atomic_read(&sgc->nr_busy_cpus);
|
|
|
-
|
|
|
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
|
|
|
+ if (sds) {
|
|
|
+ /*
|
|
|
+ * XXX: write a coherent comment on why we do this.
|
|
|
+ * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
|
|
|
+ */
|
|
|
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
|
|
|
if (nr_busy > 1) {
|
|
|
kick = true;
|
|
|
goto unlock;
|
|
@@ -8440,7 +8680,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
|
|
|
struct sched_entity *se = &p->se;
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
- int tg_update;
|
|
|
|
|
|
if (!vruntime_normalized(p)) {
|
|
|
/*
|
|
@@ -8452,10 +8691,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
|
|
|
}
|
|
|
|
|
|
/* Catch up with the cfs_rq and remove our load when we leave */
|
|
|
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
|
+ update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
|
detach_entity_load_avg(cfs_rq, se);
|
|
|
- if (tg_update)
|
|
|
- update_tg_load_avg(cfs_rq, false);
|
|
|
+ update_tg_load_avg(cfs_rq, false);
|
|
|
}
|
|
|
|
|
|
static void attach_task_cfs_rq(struct task_struct *p)
|
|
@@ -8463,7 +8701,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
|
|
|
struct sched_entity *se = &p->se;
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
- int tg_update;
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
/*
|
|
@@ -8474,10 +8711,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
|
|
|
#endif
|
|
|
|
|
|
/* Synchronize task with its cfs_rq */
|
|
|
- tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
|
+ update_cfs_rq_load_avg(now, cfs_rq, false);
|
|
|
attach_entity_load_avg(cfs_rq, se);
|
|
|
- if (tg_update)
|
|
|
- update_tg_load_avg(cfs_rq, false);
|
|
|
+ update_tg_load_avg(cfs_rq, false);
|
|
|
|
|
|
if (!vruntime_normalized(p))
|
|
|
se->vruntime += cfs_rq->min_vruntime;
|