@@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq)
 	update_curr(cfs_rq_of(&rq->curr->se));
 }
 
+#ifdef CONFIG_SCHEDSTATS
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+	    likely(wait_start > se->statistics.wait_start))
+		wait_start -= se->statistics.wait_start;
+
+	se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct task_struct *p;
+	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+	if (entity_is_task(se)) {
+		p = task_of(se);
+		if (task_on_rq_migrating(p)) {
+			/*
+			 * Preserve migrating task's wait time so wait_start
+			 * time stamp can be adjusted to accumulate wait time
+			 * prior to migration.
+			 */
+			se->statistics.wait_start = delta;
+			return;
+		}
+		trace_sched_stat_wait(p, delta);
+	}
+
+	se->statistics.wait_max = max(se->statistics.wait_max, delta);
+	se->statistics.wait_count++;
+	se->statistics.wait_sum += delta;
+	se->statistics.wait_start = 0;
+}
+#else
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
 }
 
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
 /*
  * Task is being enqueued - update stats:
  */
@@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		update_stats_wait_start(cfs_rq, se);
 }
 
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
-	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
-	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
-	if (entity_is_task(se)) {
-		trace_sched_stat_wait(task_of(se),
-			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-	}
-#endif
-	schedstat_set(se->statistics.wait_start, 0);
-}
-
 static inline void
 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
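
The two hunks above move the wait-time bookkeeping under CONFIG_SCHEDSTATS and teach it to carry wait time across a migration: update_stats_wait_end() on the source rq stores the accumulated delta in se->statistics.wait_start instead of clearing it, and update_stats_wait_start() on the destination rq rebases that delta against the new rq clock. A minimal userspace sketch of the arithmetic (invented clock values, not the kernel code itself):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t src_enqueue = 1000;	/* src rq clock when the task started waiting */
	uint64_t src_dequeue = 1400;	/* src rq clock when it is dequeued to migrate */
	uint64_t dst_enqueue = 9000;	/* unrelated clock of the destination rq */
	uint64_t dst_picked  = 9100;	/* destination rq clock when it finally runs */

	/* wait_end on a migrating task: keep the delta rather than a time stamp */
	uint64_t wait_start = src_dequeue - src_enqueue;		/* 400 */

	/* wait_start on the new rq: rebase the stamp by that saved delta */
	wait_start = dst_enqueue - wait_start;				/* 8600 */

	/* the next wait_end sees the wait accumulated on both CPUs */
	printf("total wait: %llu\n",
	       (unsigned long long)(dst_picked - wait_start));		/* 500 */
	return 0;
}
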
@@ -2155,6 +2182,7 @@ void task_numa_work(struct callback_head *work)
 	unsigned long migrate, next_scan, now = jiffies;
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
+	u64 runtime = p->se.sum_exec_runtime;
 	struct vm_area_struct *vma;
 	unsigned long start, end;
 	unsigned long nr_pte_updates = 0;
@@ -2277,6 +2305,17 @@ out:
 	else
 		reset_ptenuma_scan(p);
 	up_read(&mm->mmap_sem);
+
+	/*
+	 * Make sure tasks use at least 32x as much time to run other code
+	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
+	 * Usually update_task_scan_period slows down scanning enough; on an
+	 * overloaded system we need to limit overhead on a per task basis.
+	 */
+	if (unlikely(p->se.sum_exec_runtime != runtime)) {
+		u64 diff = p->se.sum_exec_runtime - runtime;
+		p->node_stamp += 32 * diff;
+	}
 }
 
 /*
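
The 32x charge added above bounds the scanning overhead arithmetically: if task_numa_work() consumed diff ns of the task's runtime, pushing p->node_stamp forward by 32 * diff means at least that much other execution must be accounted before the next scan becomes due, so scanning stays below 1/33, roughly the 3% the comment mentions. A quick standalone check with a made-up scan time:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t diff = 2000000;	/* pretend 2ms were spent in task_numa_work() */
	uint64_t charged = 32 * diff;	/* amount added to p->node_stamp by the hunk above */

	/* worst case: the next scan fires as soon as node_stamp catches up */
	printf("max scan overhead: %.2f%%\n",
	       100.0 * (double)diff / (double)(diff + charged));	/* ~3.03% */
	return 0;
}
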
@@ -2670,12 +2709,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 {
 	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 
+	/*
+	 * No need to update load_avg for root_task_group as it is not used.
+	 */
+	if (cfs_rq->tg == &root_task_group)
+		return;
+
 	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
 	}
 }
 
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+		      struct cfs_rq *prev, struct cfs_rq *next)
+{
+	if (!sched_feat(ATTACH_AGE_LOAD))
+		return;
+
+	/*
+	 * We are supposed to update the task to "current" time, so that it is
+	 * up to date and ready to go to the new CPU/cfs_rq. But we have
+	 * difficulty getting what the current time is, so simply throw away
+	 * the out-of-date time. This results in the wakee task being less
+	 * decayed, but giving the wakee more load sounds not bad.
+	 */
+	if (se->avg.last_update_time && prev) {
+		u64 p_last_update_time;
+		u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+		u64 p_last_update_time_copy;
+		u64 n_last_update_time_copy;
+
+		do {
+			p_last_update_time_copy = prev->load_last_update_time_copy;
+			n_last_update_time_copy = next->load_last_update_time_copy;
+
+			smp_rmb();
+
+			p_last_update_time = prev->avg.last_update_time;
+			n_last_update_time = next->avg.last_update_time;
+
+		} while (p_last_update_time != p_last_update_time_copy ||
+			 n_last_update_time != n_last_update_time_copy);
+#else
+		p_last_update_time = prev->avg.last_update_time;
+		n_last_update_time = next->avg.last_update_time;
+#endif
+		__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+				  &se->avg, 0, 0, NULL);
+		se->avg.last_update_time = n_last_update_time;
+	}
+}
 #else /* CONFIG_FAIR_GROUP_SCHED */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2809,48 +2900,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
 
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 last_update_time;
-
 #ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
 	u64 last_update_time_copy;
+	u64 last_update_time;
 
 	do {
 		last_update_time_copy = cfs_rq->load_last_update_time_copy;
 		smp_rmb();
 		last_update_time = cfs_rq->avg.last_update_time;
 	} while (last_update_time != last_update_time_copy);
-#else
-	last_update_time = cfs_rq->avg.last_update_time;
-#endif
 
-	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
-	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+	return last_update_time;
 }
-
-/*
- * Update the rq's load with the elapsed running time before entering
- * idle. if the last scheduled task is not a CFS task, idle_enter will
- * be the only way to update the runnable statistic.
- */
-void idle_enter_fair(struct rq *this_rq)
+#else
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 {
+	return cfs_rq->avg.last_update_time;
 }
+#endif
 
 /*
- * Update the rq's load with the elapsed idle time before a task is
- * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
- * be the only way to update the runnable statistic.
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
  */
-void idle_exit_fair(struct rq *this_rq)
+void remove_entity_load_avg(struct sched_entity *se)
 {
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 last_update_time;
+
+	/*
+	 * Newly created task or never used group entity should not be removed
+	 * from its (source) cfs_rq
+	 */
+	if (se->avg.last_update_time == 0)
+		return;
+
+	last_update_time = cfs_rq_last_update_time(cfs_rq);
+
+	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -4240,42 +4331,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  */
 
 /*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ * The exact cpuload calculated at every tick would be:
+ *
+ *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
  *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ * If a cpu misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when cpu may be busy, then we have:
+ *
+ *   load_n   = (1 - 1/2^i)^n * load_0
+ *   load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
  *
  * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ *   load' = (1 - 1/2^i)^n * load
+ *
+ * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
+ * This allows us to precompute the above in said factors, thereby allowing the
+ * reduction of an arbitrary n in O(log_2 n) steps. (See also
+ * fixed_power_int())
  *
  * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
  */
 #define DEGRADE_SHIFT		7
-static const unsigned char
-		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-					{0, 0, 0, 0, 0, 0, 0, 0},
-					{64, 32, 8, 0, 0, 0, 0, 0},
-					{96, 72, 40, 12, 1, 0, 0},
-					{112, 98, 75, 43, 15, 1, 0},
-					{120, 112, 98, 76, 45, 16, 2} };
+
+static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+	{   0,   0,  0,  0,  0,  0, 0, 0 },
+	{  64,  32,  8,  0,  0,  0, 0, 0 },
+	{  96,  72, 40, 12,  1,  0, 0, 0 },
+	{ 112,  98, 75, 43, 15,  1, 0, 0 },
+	{ 120, 112, 98, 76, 45, 16, 2, 0 }
+};
 
 /*
  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
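
The degrade_factor[] rows above hold (1 - 1/2^i)^(2^j) on a 128-point scale, one column per power-of-two tick count. That is what lets decay_load_missed() (unchanged by this diff; it also short-circuits via degrade_zero_ticks[] and uses a plain shift for index 1) apply n missed ticks in O(log2 n) multiply-and-shift steps by walking the set bits of n. A standalone sketch of that bit-walk using the same table values:

#include <stdio.h>

#define DEGRADE_SHIFT	7

/* same values as the new degrade_factor[][] above, on a 128 point scale */
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 },
};

/* apply load *= (1 - 1/2^idx) for each of n missed ticks, via the set bits of n */
static unsigned long decay(unsigned long load, unsigned long n, int idx)
{
	int j = 0;

	while (n) {
		if (n & 1)	/* fold in the precomputed factor for 2^j ticks */
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		n >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	/* idx 2: one tick decays 1024 to 768 (96/128), eight ticks to 96 (12/128) */
	printf("%lu %lu\n", decay(1024, 1, 2), decay(1024, 8, 2));
	return 0;
}
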
@@ -4306,14 +4392,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 	return load;
 }
 
-/*
+/**
+ * __update_cpu_load - update the rq->cpu_load[] statistics
+ * @this_rq: The rq to update statistics for
+ * @this_load: The current load
+ * @pending_updates: The number of missed updates
+ * @active: !0 for NOHZ_FULL
+ *
  * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
+ * scheduler tick (TICK_NSEC).
+ *
+ * This function computes a decaying average:
+ *
+ *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
+ *
+ * Because of NOHZ it might not get called on every tick which gives need for
+ * the @pending_updates argument.
+ *
+ *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
+ *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
+ *             = A * (A * load[i]_n-2 + B) + B
+ *             = A * (A * (A * load[i]_n-3 + B) + B) + B
+ *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
+ *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
+ *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
+ *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
+ *
+ * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
+ * any change in load would have resulted in the tick being turned back on.
+ *
+ * For regular NOHZ, this reduces to:
+ *
+ *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
+ *
+ * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
+ * term. See the @active parameter.
  */
 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-			      unsigned long pending_updates)
+			      unsigned long pending_updates, int active)
 {
+	unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
 	int i, scale;
 
 	this_rq->nr_load_updates++;
@@ -4325,8 +4443,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 
 		/* scale is effectively 1 << i now, and >> i divides by scale */
 
-		old_load = this_rq->cpu_load[i];
+		old_load = this_rq->cpu_load[i] - tickless_load;
 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
+		old_load += tickless_load;
 		new_load = this_load;
 		/*
 		 * Round up the averaging division if load is increasing. This
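
The subtract-then-add in the hunk above is the closed form derived in the new __update_cpu_load() comment: with the load pinned at the current value across the missed ticks (the NOHZ_FULL assumption), decaying load[i] - load and adding load back equals iterating the per-tick average n times. A small floating-point check of that identity (the kernel does the decay with the integer table instead):

#include <stdio.h>

/* one exact tick for index i: load' = (1 - 1/2^i) * load + (1/2^i) * cur */
static double tick(double load, double cur, int i)
{
	double f = 1.0 / (double)(1 << i);
	return (1.0 - f) * load + f * cur;
}

int main(void)
{
	int i = 2, n = 7;
	double load0 = 900.0, cur = 300.0;	/* made-up loads */
	double iter = load0, decayed = 1.0;

	for (int k = 0; k < n; k++) {
		iter = tick(iter, cur, i);	/* n missed ticks, load stuck at cur */
		decayed *= 1.0 - 1.0 / (double)(1 << i);
	}

	/* closed form used above: decay (load[i] - load), then add load back */
	printf("iterative=%.6f closed=%.6f\n", iter, decayed * (load0 - cur) + cur);
	return 0;
}
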
@@ -4381,16 +4500,17 @@ static void update_idle_cpu_load(struct rq *this_rq)
 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
 	this_rq->last_load_update_tick = curr_jiffies;
 
-	__update_cpu_load(this_rq, load, pending_updates);
+	__update_cpu_load(this_rq, load, pending_updates, 0);
 }
 
 /*
  * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
  */
-void update_cpu_load_nohz(void)
+void update_cpu_load_nohz(int active)
 {
 	struct rq *this_rq = this_rq();
 	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
 	unsigned long pending_updates;
 
 	if (curr_jiffies == this_rq->last_load_update_tick)
@@ -4401,10 +4521,11 @@ void update_cpu_load_nohz(void)
 	if (pending_updates) {
 		this_rq->last_load_update_tick = curr_jiffies;
 		/*
-		 * We were idle, this means load 0, the current load might be
-		 * !0 due to remote wakeups and the sort.
+		 * In the regular NOHZ case, we were idle, this means load 0.
+		 * In the NOHZ_FULL case, we were non-idle, we should consider
+		 * its weighted load.
 		 */
-		__update_cpu_load(this_rq, 0, pending_updates);
+		__update_cpu_load(this_rq, load, pending_updates, active);
 	}
 	raw_spin_unlock(&this_rq->lock);
 }
@@ -4420,7 +4541,7 @@ void update_cpu_load_active(struct rq *this_rq)
 	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, load, 1);
+	__update_cpu_load(this_rq, load, 1, 1);
 }
 
 /*
@@ -5007,8 +5128,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. However, the caller only guarantees p->pi_lock is held; no
- * other assumptions, including the state of rq->lock, should be made.
+ * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
  */
 static void migrate_task_rq_fair(struct task_struct *p)
 {
@@ -5721,8 +5841,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	lockdep_assert_held(&env->src_rq->lock);
 
-	deactivate_task(env->src_rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
+	deactivate_task(env->src_rq, p, 0);
 	set_task_cpu(p, env->dst_cpu);
 }
 
@@ -5855,8 +5975,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	BUG_ON(task_rq(p) != rq);
-	p->on_rq = TASK_ON_RQ_QUEUED;
 	activate_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	check_preempt_curr(rq, p, 0);
 }
 
@@ -6302,7 +6422,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			bool *overload)
 {
 	unsigned long load;
-	int i;
+	int i, nr_running;
 
 	memset(sgs, 0, sizeof(*sgs));
 
@@ -6319,7 +6439,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->group_util += cpu_util(i);
 		sgs->sum_nr_running += rq->cfs.h_nr_running;
 
-		if (rq->nr_running > 1)
+		nr_running = rq->nr_running;
+		if (nr_running > 1)
 			*overload = true;
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -6327,7 +6448,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
 		sgs->sum_weighted_load += weighted_cpuload(i);
-		if (idle_cpu(i))
+		/*
+		 * No need to call idle_cpu() if nr_running is not 0
+		 */
+		if (!nr_running && idle_cpu(i))
 			sgs->idle_cpus++;
 	}
 
@@ -7248,8 +7372,6 @@ static int idle_balance(struct rq *this_rq)
 	int pulled_task = 0;
 	u64 curr_cost = 0;
 
-	idle_enter_fair(this_rq);
-
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
 	 * measure the duration of idle_balance() as idle time.
@@ -7330,10 +7452,8 @@ out:
 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
 		pulled_task = -1;
 
-	if (pulled_task) {
-		idle_exit_fair(this_rq);
+	if (pulled_task)
 		this_rq->idle_stamp = 0;
-	}
 
 	return pulled_task;
 }