@@ -33,6 +33,7 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/sched/isolation.h>
 
 #include <trace/events/sched.h>
 
@@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
 {
 	struct sched_avg *sa = &se->avg;
 
-	sa->last_update_time = 0;
-	/*
-	 * sched_avg's period_contrib should be strictly less then 1024, so
-	 * we give it 1023 to make sure it is almost a period (1024us), and
-	 * will definitely be update (after enqueue).
-	 */
-	sa->period_contrib = 1023;
+	memset(sa, 0, sizeof(*sa));
+
 	/*
 	 * Tasks are intialized with full load to be seen as heavy tasks until
 	 * they get a chance to stabilize to their real load level.
@@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 * nothing has been attached to the task group yet.
 	 */
 	if (entity_is_task(se))
-		sa->load_avg = scale_load_down(se->load.weight);
-	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-	/*
-	 * At this point, util_avg won't be used in select_task_rq_fair anyway
-	 */
-	sa->util_avg = 0;
-	sa->util_sum = 0;
+		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+
+	se->runnable_weight = se->load.weight;
+
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
 		} else {
 			sa->util_avg = cap;
 		}
-		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 	}
 
 	if (entity_is_task(se)) {
@@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 		delta = runtime - p->last_sum_exec_runtime;
 		*period = now - p->last_task_numa_placement;
 	} else {
-		delta = p->se.avg.load_sum / p->se.load.weight;
+		delta = p->se.avg.load_sum;
 		*period = LOAD_AVG_MAX;
 	}
 
@@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->nr_running--;
 }
 
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(_val) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+								\
+	res = var + val;					\
+								\
+	if (val < 0 && res > var)				\
+		res = 0;					\
+								\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
+#ifdef CONFIG_SMP
|
|
|
+/*
|
|
|
+ * XXX we want to get rid of these helpers and use the full load resolution.
|
|
|
+ */
|
|
|
+static inline long se_weight(struct sched_entity *se)
|
|
|
+{
|
|
|
+ return scale_load_down(se->load.weight);
|
|
|
+}
|
|
|
+
|
|
|
+static inline long se_runnable(struct sched_entity *se)
|
|
|
+{
|
|
|
+ return scale_load_down(se->runnable_weight);
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ cfs_rq->runnable_weight += se->runnable_weight;
|
|
|
+
|
|
|
+ cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
|
|
|
+ cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ cfs_rq->runnable_weight -= se->runnable_weight;
|
|
|
+
|
|
|
+ sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
|
|
|
+ sub_positive(&cfs_rq->avg.runnable_load_sum,
|
|
|
+ se_runnable(se) * se->avg.runnable_load_sum);
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ cfs_rq->avg.load_avg += se->avg.load_avg;
|
|
|
+ cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
|
|
|
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
|
|
|
+}
|
|
|
+#else
|
|
|
+static inline void
|
|
|
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
|
+static inline void
|
|
|
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
|
+static inline void
|
|
|
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
|
+static inline void
|
|
|
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
|
+#endif
|
|
|
+
|
|
|
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
|
+ unsigned long weight, unsigned long runnable)
|
|
|
+{
|
|
|
+ if (se->on_rq) {
|
|
|
+ /* commit outstanding execution time */
|
|
|
+ if (cfs_rq->curr == se)
|
|
|
+ update_curr(cfs_rq);
|
|
|
+ account_entity_dequeue(cfs_rq, se);
|
|
|
+ dequeue_runnable_load_avg(cfs_rq, se);
|
|
|
+ }
|
|
|
+ dequeue_load_avg(cfs_rq, se);
|
|
|
+
|
|
|
+ se->runnable_weight = runnable;
|
|
|
+ update_load_set(&se->load, weight);
|
|
|
+
|
|
|
+#ifdef CONFIG_SMP
|
|
|
+ do {
|
|
|
+ u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
|
|
|
+
|
|
|
+ se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
|
|
|
+ se->avg.runnable_load_avg =
|
|
|
+ div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
|
|
|
+ } while (0);
|
|
|
+#endif
|
|
|
+
|
|
|
+ enqueue_load_avg(cfs_rq, se);
|
|
|
+ if (se->on_rq) {
|
|
|
+ account_entity_enqueue(cfs_rq, se);
|
|
|
+ enqueue_runnable_load_avg(cfs_rq, se);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+void reweight_task(struct task_struct *p, int prio)
|
|
|
+{
|
|
|
+ struct sched_entity *se = &p->se;
|
|
|
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
+ struct load_weight *load = &se->load;
|
|
|
+ unsigned long weight = scale_load(sched_prio_to_weight[prio]);
|
|
|
+
|
|
|
+ reweight_entity(cfs_rq, se, weight, weight);
|
|
|
+ load->inv_weight = sched_prio_to_wmult[prio];
|
|
|
+}
|
|
|
+
|
|
|
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------               (1)
+ *                          \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ *   grq->load.weight -> grq->avg.load_avg                         (2)
+ *
+ * which yields the following:
+ *
+ *                     tg->weight * grq->avg.load_avg
+ *   ge->load.weight = ------------------------------              (3)
+ *                              tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = ----------------------------- = tg->weight  (4)
+ *                         grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ *   ge->load.weight =
+ *
+ *              tg->weight * grq->load.weight
+ *     ---------------------------------------------------         (5)
+ *     tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------               (6)
+ *                             tg_load_avg'
+ *
+ * Where:
+ *
+ *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ *                  max(grq->load.weight, grq->avg.load_avg)
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ *   \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
+static long calc_group_shares(struct cfs_rq *cfs_rq)
 {
-	long tg_weight, load, shares;
+	long tg_weight, tg_shares, load, shares;
+	struct task_group *tg = cfs_rq->tg;
 
-	/*
-	 * This really should be: cfs_rq->avg.load_avg, but instead we use
-	 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
-	 * the shares for small weight interactive tasks.
-	 */
-	load = scale_load_down(cfs_rq->load.weight);
+	tg_shares = READ_ONCE(tg->shares);
+
+	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
 	tg_weight = atomic_long_read(&tg->load_avg);
 
@@ -2712,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	tg_weight -= cfs_rq->tg_load_avg_contrib;
 	tg_weight += load;
 
-	shares = (tg->shares * load);
+	shares = (tg_shares * load);
 	if (tg_weight)
 		shares /= tg_weight;
 
@@ -2728,63 +2928,86 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
 	 * instead of 0.
 	 */
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	if (shares > tg->shares)
-		shares = tg->shares;
-
-	return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
-	return tg->shares;
+	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
-# endif /* CONFIG_SMP */
 
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-			    unsigned long weight)
+/*
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * an load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
+ *
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
+ *
+ *                                             grq->avg.runnable_load_avg
+ *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ *                                               grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ *   ge->runnable_weight = ge->load.weight *
+ *
+ *              max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ *              ----------------------------------------------------- (8)
+ *                        max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
{
|
|
|
- if (se->on_rq) {
|
|
|
- /* commit outstanding execution time */
|
|
|
- if (cfs_rq->curr == se)
|
|
|
- update_curr(cfs_rq);
|
|
|
- account_entity_dequeue(cfs_rq, se);
|
|
|
- }
|
|
|
+ long runnable, load_avg;
|
|
|
|
|
|
- update_load_set(&se->load, weight);
|
|
|
+ load_avg = max(cfs_rq->avg.load_avg,
|
|
|
+ scale_load_down(cfs_rq->load.weight));
|
|
|
|
|
|
- if (se->on_rq)
|
|
|
- account_entity_enqueue(cfs_rq, se);
|
|
|
+ runnable = max(cfs_rq->avg.runnable_load_avg,
|
|
|
+ scale_load_down(cfs_rq->runnable_weight));
|
|
|
+
|
|
|
+ runnable *= shares;
|
|
|
+ if (load_avg)
|
|
|
+ runnable /= load_avg;
|
|
|
+
|
|
|
+ return clamp_t(long, runnable, MIN_SHARES, shares);
|
|
|
}
|
|
|
+# endif /* CONFIG_SMP */
|
|
|
|
|
|
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
|
|
|
|
|
|
-static void update_cfs_shares(struct sched_entity *se)
|
|
|
+/*
|
|
|
+ * Recomputes the group entity based on the current state of its group
|
|
|
+ * runqueue.
|
|
|
+ */
|
|
|
+static void update_cfs_group(struct sched_entity *se)
|
|
|
{
|
|
|
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
|
|
|
- struct task_group *tg;
|
|
|
- long shares;
|
|
|
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
|
+ long shares, runnable;
|
|
|
|
|
|
- if (!cfs_rq)
|
|
|
+ if (!gcfs_rq)
|
|
|
return;
|
|
|
|
|
|
- if (throttled_hierarchy(cfs_rq))
|
|
|
+ if (throttled_hierarchy(gcfs_rq))
|
|
|
return;
|
|
|
|
|
|
- tg = cfs_rq->tg;
|
|
|
-
|
|
|
#ifndef CONFIG_SMP
|
|
|
- if (likely(se->load.weight == tg->shares))
|
|
|
+ runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
|
|
|
+
|
|
|
+ if (likely(se->load.weight == shares))
|
|
|
return;
|
|
|
+#else
|
|
|
+ shares = calc_group_shares(gcfs_rq);
|
|
|
+ runnable = calc_group_runnable(gcfs_rq, shares);
|
|
|
#endif
|
|
|
- shares = calc_cfs_shares(cfs_rq, tg);
|
|
|
|
|
|
- reweight_entity(cfs_rq_of(se), se, shares);
|
|
|
+ reweight_entity(cfs_rq_of(se), se, shares, runnable);
|
|
|
}
|
|
|
|
|
|
#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
-static inline void update_cfs_shares(struct sched_entity *se)
|
|
|
+static inline void update_cfs_group(struct sched_entity *se)
|
|
|
{
|
|
|
}
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
@@ -2893,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
|
|
|
*/
|
|
|
static __always_inline u32
|
|
|
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
|
|
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
|
|
|
+ unsigned long load, unsigned long runnable, int running)
|
|
|
{
|
|
|
unsigned long scale_freq, scale_cpu;
|
|
|
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
|
|
@@ -2910,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
|
|
*/
|
|
|
if (periods) {
|
|
|
sa->load_sum = decay_load(sa->load_sum, periods);
|
|
|
- if (cfs_rq) {
|
|
|
- cfs_rq->runnable_load_sum =
|
|
|
- decay_load(cfs_rq->runnable_load_sum, periods);
|
|
|
- }
|
|
|
+ sa->runnable_load_sum =
|
|
|
+ decay_load(sa->runnable_load_sum, periods);
|
|
|
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
|
|
|
|
|
|
/*
|
|
@@ -2926,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
|
|
sa->period_contrib = delta;
|
|
|
|
|
|
contrib = cap_scale(contrib, scale_freq);
|
|
|
- if (weight) {
|
|
|
- sa->load_sum += weight * contrib;
|
|
|
- if (cfs_rq)
|
|
|
- cfs_rq->runnable_load_sum += weight * contrib;
|
|
|
- }
|
|
|
+ if (load)
|
|
|
+ sa->load_sum += load * contrib;
|
|
|
+ if (runnable)
|
|
|
+ sa->runnable_load_sum += runnable * contrib;
|
|
|
if (running)
|
|
|
sa->util_sum += contrib * scale_cpu;
|
|
|
|
|
@@ -2966,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
|
|
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
|
|
|
*/
|
|
|
static __always_inline int
|
|
|
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
|
|
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
|
|
|
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
|
|
|
+ unsigned long load, unsigned long runnable, int running)
|
|
|
{
|
|
|
u64 delta;
|
|
|
|
|
@@ -3000,8 +3220,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
|
|
* this happens during idle_balance() which calls
|
|
|
* update_blocked_averages()
|
|
|
*/
|
|
|
- if (!weight)
|
|
|
- running = 0;
|
|
|
+ if (!load)
|
|
|
+ runnable = running = 0;
|
|
|
|
|
|
/*
|
|
|
* Now we know we crossed measurement unit boundaries. The *_avg
|
|
@@ -3010,63 +3230,96 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
|
|
* Step 1: accumulate *_sum since last_update_time. If we haven't
|
|
|
* crossed period boundaries, finish.
|
|
|
*/
|
|
|
- if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
|
|
|
+ if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
|
|
|
return 0;
|
|
|
|
|
|
+ return 1;
|
|
|
+}
|
|
|
+
|
|
|
+static __always_inline void
|
|
|
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
|
|
|
+{
|
|
|
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
|
|
|
+
|
|
|
/*
|
|
|
* Step 2: update *_avg.
|
|
|
*/
|
|
|
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
|
|
|
- if (cfs_rq) {
|
|
|
- cfs_rq->runnable_load_avg =
|
|
|
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
|
|
|
- }
|
|
|
- sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
|
|
|
-
|
|
|
- return 1;
|
|
|
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
|
|
|
+ sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
|
|
|
+ sa->util_avg = sa->util_sum / divider;
|
|
|
}
|
|
|
|
|
|
+/*
+ * sched_entity:
+ *
+ *   task:
+ *     se_runnable() == se_weight()
+ *
+ *   group: [ see update_cfs_group() ]
+ *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
+ *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
+ *   load_sum := runnable_sum
+ *   load_avg = se_weight(se) * runnable_avg
+ *
+ *   runnable_load_sum := runnable_sum
+ *   runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ *   XXX collapse load_sum and runnable_load_sum
+ *
+ * cfs_rq:
+ *
+ *   load_sum = \Sum se_weight(se) * se->avg.load_sum
+ *   load_avg = \Sum se->avg.load_avg
+ *
+ *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ *   runnable_load_avg = \Sum se->avg.runnable_load_avg
+ */
+
static int
|
|
|
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
|
|
|
{
|
|
|
- return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
|
|
|
+ if (entity_is_task(se))
|
|
|
+ se->runnable_weight = se->load.weight;
|
|
|
+
|
|
|
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
|
|
|
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
- return ___update_load_avg(now, cpu, &se->avg,
|
|
|
- se->on_rq * scale_load_down(se->load.weight),
|
|
|
- cfs_rq->curr == se, NULL);
|
|
|
+ if (entity_is_task(se))
|
|
|
+ se->runnable_weight = se->load.weight;
|
|
|
+
|
|
|
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
|
|
|
+ cfs_rq->curr == se)) {
|
|
|
+
|
|
|
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
- return ___update_load_avg(now, cpu, &cfs_rq->avg,
|
|
|
- scale_load_down(cfs_rq->load.weight),
|
|
|
- cfs_rq->curr != NULL, cfs_rq);
|
|
|
-}
|
|
|
+ if (___update_load_sum(now, cpu, &cfs_rq->avg,
|
|
|
+ scale_load_down(cfs_rq->load.weight),
|
|
|
+ scale_load_down(cfs_rq->runnable_weight),
|
|
|
+ cfs_rq->curr != NULL)) {
|
|
|
|
|
|
-/*
|
|
|
- * Signed add and clamp on underflow.
|
|
|
- *
|
|
|
- * Explicitly do a load-store to ensure the intermediate value never hits
|
|
|
- * memory. This allows lockless observations without ever seeing the negative
|
|
|
- * values.
|
|
|
- */
|
|
|
-#define add_positive(_ptr, _val) do { \
|
|
|
- typeof(_ptr) ptr = (_ptr); \
|
|
|
- typeof(_val) val = (_val); \
|
|
|
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
|
|
|
- \
|
|
|
- res = var + val; \
|
|
|
- \
|
|
|
- if (val < 0 && res > var) \
|
|
|
- res = 0; \
|
|
|
- \
|
|
|
- WRITE_ONCE(*ptr, res); \
|
|
|
-} while (0)
|
|
|
+ ___update_load_avg(&cfs_rq->avg, 1, 1);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
/**
|
|
@@ -3149,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se,
|
|
|
se->avg.last_update_time = n_last_update_time;
|
|
|
}
|
|
|
|
|
|
-/* Take into account change of utilization of a child task group */
|
|
|
+
|
|
|
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ *   ge->avg == grq->avg                                           (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg     (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ *   grq->avg.load_avg = grq->load.weight * grq->avg.running_avg   (3)
+ *
+ * And per (1) we have:
+ *
+ *   ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ *                      ge->load.weight * grq->avg.load_avg
+ *   ge->avg.load_avg = -----------------------------------        (4)
+ *                               grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ *   grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ *   d(runnable_avg) += se->avg.load_avg / se->load.weight
+ *                                                                 (5)
+ *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
static inline void
|
|
|
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
|
|
|
{
|
|
|
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
|
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
|
|
|
|
|
|
/* Nothing to update */
|
|
@@ -3169,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
|
|
|
}
|
|
|
|
|
|
-/* Take into account change of load of a child task group */
|
|
|
static inline void
|
|
|
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
|
|
|
{
|
|
|
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
|
- long delta, load = gcfs_rq->avg.load_avg;
|
|
|
+ long runnable_sum = gcfs_rq->prop_runnable_sum;
|
|
|
+ long runnable_load_avg, load_avg;
|
|
|
+ s64 runnable_load_sum, load_sum;
|
|
|
|
|
|
- /*
|
|
|
- * If the load of group cfs_rq is null, the load of the
|
|
|
- * sched_entity will also be null so we can skip the formula
|
|
|
- */
|
|
|
- if (load) {
|
|
|
- long tg_load;
|
|
|
+ if (!runnable_sum)
|
|
|
+ return;
|
|
|
|
|
|
- /* Get tg's load and ensure tg_load > 0 */
|
|
|
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
|
|
|
+ gcfs_rq->prop_runnable_sum = 0;
|
|
|
|
|
|
- /* Ensure tg_load >= load and updated with current load*/
|
|
|
- tg_load -= gcfs_rq->tg_load_avg_contrib;
|
|
|
- tg_load += load;
|
|
|
+ load_sum = (s64)se_weight(se) * runnable_sum;
|
|
|
+ load_avg = div_s64(load_sum, LOAD_AVG_MAX);
|
|
|
|
|
|
- /*
|
|
|
- * We need to compute a correction term in the case that the
|
|
|
- * task group is consuming more CPU than a task of equal
|
|
|
- * weight. A task with a weight equals to tg->shares will have
|
|
|
- * a load less or equal to scale_load_down(tg->shares).
|
|
|
- * Similarly, the sched_entities that represent the task group
|
|
|
- * at parent level, can't have a load higher than
|
|
|
- * scale_load_down(tg->shares). And the Sum of sched_entities'
|
|
|
- * load must be <= scale_load_down(tg->shares).
|
|
|
- */
|
|
|
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
|
|
|
- /* scale gcfs_rq's load into tg's shares*/
|
|
|
- load *= scale_load_down(gcfs_rq->tg->shares);
|
|
|
- load /= tg_load;
|
|
|
- }
|
|
|
- }
|
|
|
+ add_positive(&se->avg.load_sum, runnable_sum);
|
|
|
+ add_positive(&se->avg.load_avg, load_avg);
|
|
|
|
|
|
- delta = load - se->avg.load_avg;
|
|
|
+ add_positive(&cfs_rq->avg.load_avg, load_avg);
|
|
|
+ add_positive(&cfs_rq->avg.load_sum, load_sum);
|
|
|
|
|
|
- /* Nothing to update */
|
|
|
- if (!delta)
|
|
|
- return;
|
|
|
-
|
|
|
- /* Set new sched_entity's load */
|
|
|
- se->avg.load_avg = load;
|
|
|
- se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
|
|
|
+ runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
|
|
|
+ runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
|
|
|
|
|
|
- /* Update parent cfs_rq load */
|
|
|
- add_positive(&cfs_rq->avg.load_avg, delta);
|
|
|
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
|
|
|
+ add_positive(&se->avg.runnable_load_sum, runnable_sum);
|
|
|
+ add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
|
|
|
|
|
|
- /*
|
|
|
- * If the sched_entity is already enqueued, we also have to update the
|
|
|
- * runnable load avg.
|
|
|
- */
|
|
|
if (se->on_rq) {
|
|
|
- /* Update parent cfs_rq runnable_load_avg */
|
|
|
- add_positive(&cfs_rq->runnable_load_avg, delta);
|
|
|
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
|
|
|
+ add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
|
|
|
+ add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
|
|
|
-{
|
|
|
- cfs_rq->propagate_avg = 1;
|
|
|
-}
|
|
|
-
|
|
|
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
|
|
|
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
|
|
|
{
|
|
|
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
|
|
|
-
|
|
|
- if (!cfs_rq->propagate_avg)
|
|
|
- return 0;
|
|
|
-
|
|
|
- cfs_rq->propagate_avg = 0;
|
|
|
- return 1;
|
|
|
+ cfs_rq->propagate = 1;
|
|
|
+ cfs_rq->prop_runnable_sum += runnable_sum;
|
|
|
}
|
|
|
|
|
|
/* Update task and its cfs_rq load average */
|
|
|
static inline int propagate_entity_load_avg(struct sched_entity *se)
|
|
|
{
|
|
|
- struct cfs_rq *cfs_rq;
|
|
|
+ struct cfs_rq *cfs_rq, *gcfs_rq;
|
|
|
|
|
|
if (entity_is_task(se))
|
|
|
return 0;
|
|
|
|
|
|
- if (!test_and_clear_tg_cfs_propagate(se))
|
|
|
+ gcfs_rq = group_cfs_rq(se);
|
|
|
+ if (!gcfs_rq->propagate)
|
|
|
return 0;
|
|
|
|
|
|
+ gcfs_rq->propagate = 0;
|
|
|
+
|
|
|
cfs_rq = cfs_rq_of(se);
|
|
|
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
|
|
|
|
|
|
- update_tg_cfs_util(cfs_rq, se);
|
|
|
- update_tg_cfs_load(cfs_rq, se);
|
|
|
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
|
|
|
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
|
|
|
|
|
|
return 1;
|
|
|
}
|
|
@@ -3288,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
|
|
|
* If there is a pending propagation, we have to update the load and
|
|
|
* the utilization of the sched_entity:
|
|
|
*/
|
|
|
- if (gcfs_rq->propagate_avg)
|
|
|
+ if (gcfs_rq->propagate)
|
|
|
return false;
|
|
|
|
|
|
/*
|
|
@@ -3308,27 +3590,10 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
|
|
|
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
|
|
|
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
|
|
-/*
|
|
|
- * Unsigned subtract and clamp on underflow.
|
|
|
- *
|
|
|
- * Explicitly do a load-store to ensure the intermediate value never hits
|
|
|
- * memory. This allows lockless observations without ever seeing the negative
|
|
|
- * values.
|
|
|
- */
|
|
|
-#define sub_positive(_ptr, _val) do { \
|
|
|
- typeof(_ptr) ptr = (_ptr); \
|
|
|
- typeof(*ptr) val = (_val); \
|
|
|
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
|
|
|
- res = var - val; \
|
|
|
- if (res > var) \
|
|
|
- res = 0; \
|
|
|
- WRITE_ONCE(*ptr, res); \
|
|
|
-} while (0)
|
|
|
-
|
|
|
/**
|
|
|
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
|
|
|
* @now: current time, as per cfs_rq_clock_task()
|
|
@@ -3348,65 +3613,45 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
|
|
|
static inline int
|
|
|
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
|
|
|
struct sched_avg *sa = &cfs_rq->avg;
|
|
|
- int decayed, removed_load = 0, removed_util = 0;
|
|
|
+ int decayed = 0;
|
|
|
|
|
|
- if (atomic_long_read(&cfs_rq->removed_load_avg)) {
|
|
|
- s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
|
|
|
+ if (cfs_rq->removed.nr) {
|
|
|
+ unsigned long r;
|
|
|
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
|
|
|
+
|
|
|
+ raw_spin_lock(&cfs_rq->removed.lock);
|
|
|
+ swap(cfs_rq->removed.util_avg, removed_util);
|
|
|
+ swap(cfs_rq->removed.load_avg, removed_load);
|
|
|
+ swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
|
|
|
+ cfs_rq->removed.nr = 0;
|
|
|
+ raw_spin_unlock(&cfs_rq->removed.lock);
|
|
|
+
|
|
|
+ r = removed_load;
|
|
|
sub_positive(&sa->load_avg, r);
|
|
|
- sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
|
|
|
- removed_load = 1;
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
- }
|
|
|
+ sub_positive(&sa->load_sum, r * divider);
|
|
|
|
|
|
- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
|
|
|
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
|
|
|
+ r = removed_util;
|
|
|
sub_positive(&sa->util_avg, r);
|
|
|
- sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
|
|
|
- removed_util = 1;
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
+ sub_positive(&sa->util_sum, r * divider);
|
|
|
+
|
|
|
+ add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
|
|
|
+
|
|
|
+ decayed = 1;
|
|
|
}
|
|
|
|
|
|
- decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
|
|
|
+ decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
|
|
|
|
|
|
#ifndef CONFIG_64BIT
|
|
|
smp_wmb();
|
|
|
cfs_rq->load_last_update_time_copy = sa->last_update_time;
|
|
|
#endif
|
|
|
|
|
|
- if (decayed || removed_util)
|
|
|
+ if (decayed)
|
|
|
cfs_rq_util_change(cfs_rq);
|
|
|
|
|
|
- return decayed || removed_load;
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
- * Optional action to be done while updating the load average
|
|
|
- */
|
|
|
-#define UPDATE_TG 0x1
|
|
|
-#define SKIP_AGE_LOAD 0x2
|
|
|
-
|
|
|
-/* Update task and its cfs_rq load average */
|
|
|
-static inline void update_load_avg(struct sched_entity *se, int flags)
|
|
|
-{
|
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
- u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
- struct rq *rq = rq_of(cfs_rq);
|
|
|
- int cpu = cpu_of(rq);
|
|
|
- int decayed;
|
|
|
-
|
|
|
- /*
|
|
|
- * Track task load average for carrying it to new CPU after migrated, and
|
|
|
- * track group sched_entity load average for task_h_load calc in migration
|
|
|
- */
|
|
|
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
|
|
|
- __update_load_avg_se(now, cpu, cfs_rq, se);
|
|
|
-
|
|
|
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
|
|
|
- decayed |= propagate_entity_load_avg(se);
|
|
|
-
|
|
|
- if (decayed && (flags & UPDATE_TG))
|
|
|
- update_tg_load_avg(cfs_rq, 0);
|
|
|
+ return decayed;
|
|
|
}
|
|
|
|
|
|
/**
|
|
@@ -3419,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
|
|
|
*/
|
|
|
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * When we attach the @se to the @cfs_rq, we must align the decay
|
|
|
+ * window because without that, really weird and wonderful things can
|
|
|
+ * happen.
|
|
|
+ *
|
|
|
+ * XXX illustrate
|
|
|
+ */
|
|
|
se->avg.last_update_time = cfs_rq->avg.last_update_time;
|
|
|
- cfs_rq->avg.load_avg += se->avg.load_avg;
|
|
|
- cfs_rq->avg.load_sum += se->avg.load_sum;
|
|
|
+ se->avg.period_contrib = cfs_rq->avg.period_contrib;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
|
|
|
+ * period_contrib. This isn't strictly correct, but since we're
|
|
|
+ * entirely outside of the PELT hierarchy, nobody cares if we truncate
|
|
|
+ * _sum a little.
|
|
|
+ */
|
|
|
+ se->avg.util_sum = se->avg.util_avg * divider;
|
|
|
+
|
|
|
+ se->avg.load_sum = divider;
|
|
|
+ if (se_weight(se)) {
|
|
|
+ se->avg.load_sum =
|
|
|
+ div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
|
|
|
+ }
|
|
|
+
|
|
|
+ se->avg.runnable_load_sum = se->avg.load_sum;
|
|
|
+
|
|
|
+ enqueue_load_avg(cfs_rq, se);
|
|
|
cfs_rq->avg.util_avg += se->avg.util_avg;
|
|
|
cfs_rq->avg.util_sum += se->avg.util_sum;
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
+
|
|
|
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
|
|
|
|
|
|
cfs_rq_util_change(cfs_rq);
|
|
|
}
|
|
@@ -3439,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
|
|
|
*/
|
|
|
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
-
|
|
|
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
|
|
|
- sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
|
|
|
+ dequeue_load_avg(cfs_rq, se);
|
|
|
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
|
|
|
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
+
|
|
|
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
|
|
|
|
|
|
cfs_rq_util_change(cfs_rq);
|
|
|
}
|
|
|
|
|
|
-/* Add the load generated by se into cfs_rq's load average */
|
|
|
-static inline void
|
|
|
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+/*
|
|
|
+ * Optional action to be done while updating the load average
|
|
|
+ */
|
|
|
+#define UPDATE_TG 0x1
|
|
|
+#define SKIP_AGE_LOAD 0x2
|
|
|
+#define DO_ATTACH 0x4
|
|
|
+
|
|
|
+/* Update task and its cfs_rq load average */
|
|
|
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
{
|
|
|
- struct sched_avg *sa = &se->avg;
|
|
|
+ u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
+ struct rq *rq = rq_of(cfs_rq);
|
|
|
+ int cpu = cpu_of(rq);
|
|
|
+ int decayed;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Track task load average for carrying it to new CPU after migrated, and
|
|
|
+ * track group sched_entity load average for task_h_load calc in migration
|
|
|
+ */
|
|
|
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
|
|
|
+ __update_load_avg_se(now, cpu, cfs_rq, se);
|
|
|
|
|
|
- cfs_rq->runnable_load_avg += sa->load_avg;
|
|
|
- cfs_rq->runnable_load_sum += sa->load_sum;
|
|
|
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
|
|
|
+ decayed |= propagate_entity_load_avg(se);
|
|
|
+
|
|
|
+ if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
|
|
|
|
|
|
- if (!sa->last_update_time) {
|
|
|
attach_entity_load_avg(cfs_rq, se);
|
|
|
update_tg_load_avg(cfs_rq, 0);
|
|
|
- }
|
|
|
-}
|
|
|
|
|
|
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
|
|
|
-static inline void
|
|
|
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
-{
|
|
|
- cfs_rq->runnable_load_avg =
|
|
|
- max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
|
|
|
- cfs_rq->runnable_load_sum =
|
|
|
- max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
|
|
|
+ } else if (decayed && (flags & UPDATE_TG))
|
|
|
+ update_tg_load_avg(cfs_rq, 0);
|
|
|
}
|
|
|
|
|
|
#ifndef CONFIG_64BIT
|
|
@@ -3515,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se)
|
|
|
void remove_entity_load_avg(struct sched_entity *se)
|
|
|
{
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
+ unsigned long flags;
|
|
|
|
|
|
/*
|
|
|
* tasks cannot exit without having gone through wake_up_new_task() ->
|
|
@@ -3527,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se)
|
|
|
*/
|
|
|
|
|
|
sync_entity_load_avg(se);
|
|
|
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
|
|
|
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
|
|
|
+
|
|
|
+ raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
|
|
|
+ ++cfs_rq->removed.nr;
|
|
|
+ cfs_rq->removed.util_avg += se->avg.util_avg;
|
|
|
+ cfs_rq->removed.load_avg += se->avg.load_avg;
|
|
|
+ cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
|
|
|
+ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
|
|
|
}
|
|
|
|
|
|
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
- return cfs_rq->runnable_load_avg;
|
|
|
+ return cfs_rq->avg.runnable_load_avg;
|
|
|
}
|
|
|
|
|
|
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
|
|
@@ -3553,16 +3839,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
|
|
|
|
|
#define UPDATE_TG 0x0
|
|
|
#define SKIP_AGE_LOAD 0x0
|
|
|
+#define DO_ATTACH 0x0
|
|
|
|
|
|
-static inline void update_load_avg(struct sched_entity *se, int not_used1)
|
|
|
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
|
|
|
{
|
|
|
- cfs_rq_util_change(cfs_rq_of(se));
|
|
|
+ cfs_rq_util_change(cfs_rq);
|
|
|
}
|
|
|
|
|
|
-static inline void
|
|
|
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
|
|
|
-static inline void
|
|
|
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
|
|
|
static inline void remove_entity_load_avg(struct sched_entity *se) {}
|
|
|
|
|
|
static inline void
|
|
@@ -3707,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
* its group cfs_rq
|
|
|
* - Add its new weight to cfs_rq->load.weight
|
|
|
*/
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- enqueue_entity_load_avg(cfs_rq, se);
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
|
|
|
+ update_cfs_group(se);
|
|
|
+ enqueue_runnable_load_avg(cfs_rq, se);
|
|
|
account_entity_enqueue(cfs_rq, se);
|
|
|
|
|
|
if (flags & ENQUEUE_WAKEUP)
|
|
@@ -3791,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
* - For group entity, update its weight to reflect the new share
|
|
|
* of its group cfs_rq.
|
|
|
*/
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- dequeue_entity_load_avg(cfs_rq, se);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
+ dequeue_runnable_load_avg(cfs_rq, se);
|
|
|
|
|
|
update_stats_dequeue(cfs_rq, se, flags);
|
|
|
|
|
@@ -3815,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
/* return excess runtime on last dequeue */
|
|
|
return_cfs_rq_runtime(cfs_rq);
|
|
|
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_cfs_group(se);
|
|
|
|
|
|
/*
|
|
|
* Now advance min_vruntime if @se was the entity holding it back,
|
|
@@ -3879,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
*/
|
|
|
update_stats_wait_end(cfs_rq, se);
|
|
|
__dequeue_entity(cfs_rq, se);
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
}
|
|
|
|
|
|
update_stats_curr_start(cfs_rq, se);
|
|
@@ -3981,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
|
|
/* Put 'current' back into the tree. */
|
|
|
__enqueue_entity(cfs_rq, prev);
|
|
|
/* in !on_rq case, update occurred at dequeue */
|
|
|
- update_load_avg(prev, 0);
|
|
|
+ update_load_avg(cfs_rq, prev, 0);
|
|
|
}
|
|
|
cfs_rq->curr = NULL;
|
|
|
}
|
|
@@ -3997,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
|
|
|
/*
|
|
|
* Ensure that runnable average is periodically updated.
|
|
|
*/
|
|
|
- update_load_avg(curr, UPDATE_TG);
|
|
|
- update_cfs_shares(curr);
|
|
|
+ update_load_avg(cfs_rq, curr, UPDATE_TG);
|
|
|
+ update_cfs_group(curr);
|
|
|
|
|
|
#ifdef CONFIG_SCHED_HRTICK
|
|
|
/*
|
|
@@ -4915,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
break;
|
|
|
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
+ update_cfs_group(se);
|
|
|
}
|
|
|
|
|
|
if (!se)
|
|
@@ -4974,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
break;
|
|
|
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
+ update_cfs_group(se);
|
|
|
}
|
|
|
|
|
|
if (!se)
|
|
@@ -5449,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
|
|
|
/*
|
|
|
* find_idlest_group finds and returns the least busy CPU group within the
|
|
|
* domain.
|
|
|
+ *
|
|
|
+ * Assumes p is allowed on at least one CPU in sd.
|
|
|
*/
|
|
|
static struct sched_group *
|
|
|
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
|
@@ -5456,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
|
|
{
|
|
|
struct sched_group *idlest = NULL, *group = sd->groups;
|
|
|
struct sched_group *most_spare_sg = NULL;
|
|
|
- unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
|
|
|
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
|
|
|
+ unsigned long min_runnable_load = ULONG_MAX;
|
|
|
+ unsigned long this_runnable_load = ULONG_MAX;
|
|
|
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
|
|
|
unsigned long most_spare = 0, this_spare = 0;
|
|
|
int load_idx = sd->forkexec_idx;
|
|
|
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
|
|
@@ -5578,10 +5864,10 @@ skip_spare:
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
|
|
|
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
|
|
|
*/
|
|
|
static int
|
|
|
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
|
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
|
{
|
|
|
unsigned long load, min_load = ULONG_MAX;
|
|
|
unsigned int min_exit_latency = UINT_MAX;
|
|
@@ -5630,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
|
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
|
|
|
}
|
|
|
|
|
|
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
|
|
|
+ int cpu, int prev_cpu, int sd_flag)
|
|
|
+{
|
|
|
+ int new_cpu = cpu;
|
|
|
+
|
|
|
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
|
|
|
+ return prev_cpu;
|
|
|
+
|
|
|
+ while (sd) {
|
|
|
+ struct sched_group *group;
|
|
|
+ struct sched_domain *tmp;
|
|
|
+ int weight;
|
|
|
+
|
|
|
+ if (!(sd->flags & sd_flag)) {
|
|
|
+ sd = sd->child;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ group = find_idlest_group(sd, p, cpu, sd_flag);
|
|
|
+ if (!group) {
|
|
|
+ sd = sd->child;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
|
|
|
+ if (new_cpu == cpu) {
|
|
|
+ /* Now try balancing at a lower domain level of cpu */
|
|
|
+ sd = sd->child;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Now try balancing at a lower domain level of new_cpu */
|
|
|
+ cpu = new_cpu;
|
|
|
+ weight = sd->span_weight;
|
|
|
+ sd = NULL;
|
|
|
+ for_each_domain(cpu, tmp) {
|
|
|
+ if (weight <= tmp->span_weight)
|
|
|
+ break;
|
|
|
+ if (tmp->flags & sd_flag)
|
|
|
+ sd = tmp;
|
|
|
+ }
|
|
|
+ /* while loop will break here if sd == NULL */
|
|
|
+ }
|
|
|
+
|
|
|
+ return new_cpu;
|
|
|
+}
|
|
|
+
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
|
|
static inline void set_idle_cores(int cpu, int val)
|
|
@@ -5982,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|
|
new_cpu = cpu;
|
|
|
}
|
|
|
|
|
|
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
|
|
|
+ /*
|
|
|
+ * We're going to need the task's util for capacity_spare_wake
|
|
|
+ * in find_idlest_group. Sync it up to prev_cpu's
|
|
|
+ * last_update_time.
|
|
|
+ */
|
|
|
+ sync_entity_load_avg(&p->se);
|
|
|
+ }
|
|
|
+
|
|
|
if (!sd) {
|
|
|
- pick_cpu:
|
|
|
+pick_cpu:
|
|
|
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
|
|
|
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
|
|
|
|
|
- } else while (sd) {
|
|
|
- struct sched_group *group;
|
|
|
- int weight;
|
|
|
-
|
|
|
- if (!(sd->flags & sd_flag)) {
|
|
|
- sd = sd->child;
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- group = find_idlest_group(sd, p, cpu, sd_flag);
|
|
|
- if (!group) {
|
|
|
- sd = sd->child;
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- new_cpu = find_idlest_cpu(group, p, cpu);
|
|
|
- if (new_cpu == -1 || new_cpu == cpu) {
|
|
|
- /* Now try balancing at a lower domain level of cpu */
|
|
|
- sd = sd->child;
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- /* Now try balancing at a lower domain level of new_cpu */
|
|
|
- cpu = new_cpu;
|
|
|
- weight = sd->span_weight;
|
|
|
- sd = NULL;
|
|
|
- for_each_domain(cpu, tmp) {
|
|
|
- if (weight <= tmp->span_weight)
|
|
|
- break;
|
|
|
- if (tmp->flags & sd_flag)
|
|
|
- sd = tmp;
|
|
|
- }
|
|
|
- /* while loop will break here if sd == NULL */
|
|
|
+ } else {
|
|
|
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
|
|
}
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
return new_cpu;
|
|
|
}
|
|
|
|
|
|
+static void detach_entity_cfs_rq(struct sched_entity *se);
|
|
|
+
|
|
|
/*
|
|
|
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
|
|
|
* cfs_rq_of(p) references at time of call are still valid and identify the
|
|
@@ -6059,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
|
|
|
se->vruntime -= min_vruntime;
|
|
|
}
|
|
|
|
|
|
- /*
|
|
|
- * We are supposed to update the task to "current" time, then its up to date
|
|
|
- * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
|
|
|
- * what current time is, so simply throw away the out-of-date time. This
|
|
|
- * will result in the wakee task is less decayed, but giving the wakee more
|
|
|
- * load sounds not bad.
|
|
|
- */
|
|
|
- remove_entity_load_avg(&p->se);
|
|
|
+ if (p->on_rq == TASK_ON_RQ_MIGRATING) {
|
|
|
+ /*
|
|
|
+ * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
|
|
|
+ * rq->lock and can modify state directly.
|
|
|
+ */
|
|
|
+ lockdep_assert_held(&task_rq(p)->lock);
|
|
|
+ detach_entity_cfs_rq(&p->se);
|
|
|
+
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * We are supposed to update the task to "current" time, then
|
|
|
+ * its up to date and ready to go to new CPU/cfs_rq. But we
|
|
|
+ * have difficulty in getting what current time is, so simply
|
|
|
+ * throw away the out-of-date time. This will result in the
|
|
|
+ * wakee task is less decayed, but giving the wakee more load
|
|
|
+ * sounds not bad.
|
|
|
+ */
|
|
|
+ remove_entity_load_avg(&p->se);
|
|
|
+ }
|
|
|
|
|
|
/* Tell new CPU we are migrated */
|
|
|
p->se.avg.last_update_time = 0;
|
|
@@ -6334,10 +6658,7 @@ again:
|
|
|
set_next_entity(cfs_rq, se);
|
|
|
}
|
|
|
|
|
|
- if (hrtick_enabled(rq))
|
|
|
- hrtick_start_fair(rq, p);
|
|
|
-
|
|
|
- return p;
|
|
|
+ goto done;
|
|
|
simple:
|
|
|
#endif
|
|
|
|
|
@@ -6351,6 +6672,16 @@ simple:
|
|
|
|
|
|
p = task_of(se);
|
|
|
|
|
|
+done: __maybe_unused
|
|
|
+#ifdef CONFIG_SMP
|
|
|
+ /*
|
|
|
+ * Move the next running task to the front of
|
|
|
+ * the list, so our cfs_tasks list becomes MRU
|
|
|
+ * one.
|
|
|
+ */
|
|
|
+ list_move(&p->se.group_node, &rq->cfs_tasks);
|
|
|
+#endif
|
|
|
+
|
|
|
if (hrtick_enabled(rq))
|
|
|
hrtick_start_fair(rq, p);
|
|
|
|
|
@@ -6786,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
|
|
|
*/
|
|
|
static struct task_struct *detach_one_task(struct lb_env *env)
|
|
|
{
|
|
|
- struct task_struct *p, *n;
|
|
|
+ struct task_struct *p;
|
|
|
|
|
|
lockdep_assert_held(&env->src_rq->lock);
|
|
|
|
|
|
- list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
|
|
|
+ list_for_each_entry_reverse(p,
|
|
|
+ &env->src_rq->cfs_tasks, se.group_node) {
|
|
|
if (!can_migrate_task(p, env))
|
|
|
continue;
|
|
|
|
|
@@ -6836,7 +7168,7 @@ static int detach_tasks(struct lb_env *env)
|
|
|
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
|
|
|
break;
|
|
|
|
|
|
- p = list_first_entry(tasks, struct task_struct, se.group_node);
|
|
|
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
|
|
|
|
|
|
env->loop++;
|
|
|
/* We've more or less seen every task there is, call it quits */
|
|
@@ -6886,7 +7218,7 @@ static int detach_tasks(struct lb_env *env)
|
|
|
|
|
|
continue;
|
|
|
next:
|
|
|
- list_move_tail(&p->se.group_node, tasks);
|
|
|
+ list_move(&p->se.group_node, tasks);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -6962,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
|
|
|
if (cfs_rq->avg.util_sum)
|
|
|
return false;
|
|
|
|
|
|
- if (cfs_rq->runnable_load_sum)
|
|
|
+ if (cfs_rq->avg.runnable_load_sum)
|
|
|
return false;
|
|
|
|
|
|
return true;
|
|
@@ -6994,7 +7326,7 @@ static void update_blocked_averages(int cpu)
|
|
|
/* Propagate pending load changes to the parent, if any: */
|
|
|
se = cfs_rq->tg->se[cpu];
|
|
|
if (se && !skip_blocked_update(se))
|
|
|
- update_load_avg(se, 0);
|
|
|
+ update_load_avg(cfs_rq_of(se), se, 0);
|
|
|
|
|
|
/*
|
|
|
* There can be a lot of idle CPU cgroups. Don't let fully
|
|
@@ -7875,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
|
|
|
if (busiest->group_type == group_imbalanced)
|
|
|
goto force_balance;
|
|
|
|
|
|
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
|
|
|
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
|
|
|
+ /*
|
|
|
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
|
|
|
+ * capacities from resulting in underutilization due to avg_load.
|
|
|
+ */
|
|
|
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
|
|
|
busiest->group_no_capacity)
|
|
|
goto force_balance;
|
|
|
|
|
@@ -8693,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu)
|
|
|
return;
|
|
|
|
|
|
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
|
|
|
- if (!is_housekeeping_cpu(cpu))
|
|
|
+ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
|
|
|
return;
|
|
|
|
|
|
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
|
|
@@ -9158,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
break;
|
|
|
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
}
|
|
|
}
|
|
|
#else
|
|
@@ -9170,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
|
|
|
/* Catch up with the cfs_rq and remove our load when we leave */
|
|
|
- update_load_avg(se, 0);
|
|
|
+ update_load_avg(cfs_rq, se, 0);
|
|
|
detach_entity_load_avg(cfs_rq, se);
|
|
|
update_tg_load_avg(cfs_rq, false);
|
|
|
propagate_entity_cfs_rq(se);
|
|
@@ -9189,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
|
|
|
#endif
|
|
|
|
|
|
/* Synchronize entity with its cfs_rq */
|
|
|
- update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
|
|
|
+ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
|
|
|
attach_entity_load_avg(cfs_rq, se);
|
|
|
update_tg_load_avg(cfs_rq, false);
|
|
|
propagate_entity_cfs_rq(se);
|
|
@@ -9271,11 +9606,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
|
|
|
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
|
|
|
#endif
|
|
|
#ifdef CONFIG_SMP
|
|
|
-#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
- cfs_rq->propagate_avg = 0;
|
|
|
-#endif
|
|
|
- atomic_long_set(&cfs_rq->removed_load_avg, 0);
|
|
|
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
|
|
|
+ raw_spin_lock_init(&cfs_rq->removed.lock);
|
|
|
#endif
|
|
|
}
|
|
|
|
|
@@ -9473,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
|
|
|
rq_lock_irqsave(rq, &rf);
|
|
|
update_rq_clock(rq);
|
|
|
for_each_sched_entity(se) {
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
|
|
|
+ update_cfs_group(se);
|
|
|
}
|
|
|
rq_unlock_irqrestore(rq, &rf);
|
|
|
}
|