@@ -37,7 +37,6 @@

 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -46,31 +45,35 @@
  *
  * (to see the precise effective timeslice length of your workload,
  * run vmstat and monitor the context-switches (cs) field)
+ *
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_latency			= 6000000ULL;
+unsigned int normalized_sysctl_sched_latency		= 6000000ULL;

 /*
  * The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  *
  * Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ *   SCHED_TUNABLESCALING_NONE   - unscaled, always *1
+ *   SCHED_TUNABLESCALING_LOG    - scaled logarithmically, *1+ilog(ncpus)
+ *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
+ *
+ * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
  */
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
-	= SCHED_TUNABLESCALING_LOG;
+enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
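/*
 * Illustrative userspace sketch (not part of the patch): how the default
 * SCHED_TUNABLESCALING_LOG mode above turns the normalized values into
 * effective ones.  ilog() stands in for the kernel's ilog2(); the kernel
 * additionally caps the CPU count it feeds into this factor.
 */
#include <stdio.h>

static unsigned int ilog(unsigned int n)	/* integer log2, n >= 1 */
{
	unsigned int l = 0;

	while (n >>= 1)
		l++;
	return l;
}

static unsigned long long scale_log(unsigned long long norm, unsigned int ncpus)
{
	return norm * (1 + ilog(ncpus));	/* SCHED_TUNABLESCALING_LOG */
}

int main(void)
{
	/* 6ms normalized latency on an 8-CPU machine: 6ms * (1 + 3) = 24ms */
	printf("latency: %llu ns\n", scale_log(6000000ULL, 8));
	/* 0.75ms minimum granularity scales the same way: 3ms */
	printf("min_granularity: %llu ns\n", scale_log(750000ULL, 8));
	return 0;
}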

 /*
  * Minimal preemption granularity for CPU-bound tasks:
+ *
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity		= 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;

 /*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  */
 static unsigned int sched_nr_latency = 8;

@@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;

 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity		= 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;

-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;

+#ifdef CONFIG_SMP
 /*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
+ * For asym packing, by default the lower numbered cpu has higher priority.
  */
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+int __weak arch_asym_cpu_priority(int cpu)
+{
+	return -cpu;
+}
+#endif
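/*
 * Illustrative sketch (not kernel code): with the default priority above
 * (minus the CPU number), a "higher priority wins" comparison like the
 * sched_asym_prefer() helper used later in this patch still favours
 * lower-numbered CPUs, so the old ITMT-less packing behaviour is preserved.
 */
#include <stdio.h>
#include <stdbool.h>

static int arch_asym_cpu_priority(int cpu)
{
	return -cpu;		/* default: lower CPU number, higher priority */
}

static bool sched_asym_prefer(int a, int b)
{
	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}

int main(void)
{
	/* CPU 0 is preferred over CPU 2, matching the old "dst_cpu < src_cpu" test */
	printf("prefer(0, 2) = %d\n", sched_asym_prefer(0, 2));	/* 1 */
	printf("prefer(2, 0) = %d\n", sched_asym_prefer(2, 0));	/* 0 */
	return 0;
}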

 #ifdef CONFIG_CFS_BANDWIDTH
 /*
@@ -109,16 +116,18 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  * to consumption or the quota being specified to be smaller than the slice)
  * we will always only issue the remaining available time.
  *
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif

 /*
  * The margin used when comparing utilization with CPU capacity:
- * util * 1024 < capacity * margin
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
  */
-unsigned int capacity_margin = 1280; /* ~20% */
+unsigned int capacity_margin = 1280;
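/*
 * Quick arithmetic sketch (illustrative only): with capacity_margin = 1280
 * the test "util * margin < capacity * 1024" keeps roughly 20% headroom.
 * A CPU of capacity 1024 passes only while util <= 1024 * 1024 / 1280 = 819.
 */
#include <stdio.h>
#include <stdbool.h>

static unsigned int capacity_margin = 1280;

static bool fits(unsigned long util, unsigned long capacity)
{
	return util * capacity_margin < capacity * 1024;
}

int main(void)
{
	printf("%d\n", fits(819, 1024));	/* 1: 819 * 1280 = 1048320 < 1048576 */
	printf("%d\n", fits(820, 1024));	/* 0: 820 * 1280 = 1049600 >= 1048576 */
	return 0;
}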

 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
@@ -290,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list) {
+		struct rq *rq = rq_of(cfs_rq);
+		int cpu = cpu_of(rq);
 		/*
 		 * Ensure we either appear before our parent (if already
 		 * enqueued) or force our parent to appear after us when it is
-		 * enqueued.  The fact that we always enqueue bottom-up
-		 * reduces this to two cases.
+		 * enqueued. The fact that we always enqueue bottom-up
+		 * reduces this to two cases and a special case for the root
+		 * cfs_rq. Furthermore, it also means that we will always reset
+		 * tmp_alone_branch either when the branch is connected
+		 * to a tree or when we reach the beginning of the tree.
 		 */
 		if (cfs_rq->tg->parent &&
-		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
-			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
-				&rq_of(cfs_rq)->leaf_cfs_rq_list);
-		} else {
+		    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+			/*
+			 * If the parent is already on the list, we add the
+			 * child just before it. Thanks to the circular linked
+			 * property of the list, this puts the child at the
+			 * tail of the list that starts at the parent.
+			 */
 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+				&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+			/*
+			 * The branch is now connected to its tree so we can
+			 * reset tmp_alone_branch to the beginning of the
+			 * list.
+			 */
+			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+		} else if (!cfs_rq->tg->parent) {
+			/*
+			 * A cfs_rq without a parent should be put
+			 * at the tail of the list.
+			 */
+			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq->leaf_cfs_rq_list);
+			/*
+			 * We have reached the beginning of a tree, so we can
+			 * reset tmp_alone_branch to the beginning of the list.
+			 */
+			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+		} else {
+			/*
+			 * The parent has not been added yet, so we want to
+			 * make sure that it will be put after us.
+			 * tmp_alone_branch points to the beginning of the
+			 * branch where we will add the parent.
+			 */
+			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+				rq->tmp_alone_branch);
+			/*
+			 * Update tmp_alone_branch to point to the new
+			 * beginning of the branch.
+			 */
+			rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 		}

 		cfs_rq->on_list = 1;
@@ -708,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 }

 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static void attach_entity_cfs_rq(struct sched_entity *se);

 /*
  * With new tasks being created, their initial util_avgs are extrapolated
@@ -742,7 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	struct sched_avg *sa = &se->avg;
 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
-	u64 now = cfs_rq_clock_task(cfs_rq);

 	if (cap > 0) {
 		if (cfs_rq->avg.util_avg != 0) {
@@ -770,14 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se)
 			 * such that the next switched_to_fair() has the
 			 * expected state.
 			 */
-			se->avg.last_update_time = now;
+			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
 			return;
 		}
 	}

-	update_cfs_rq_load_avg(now, cfs_rq, false);
-	attach_entity_load_avg(cfs_rq, se);
-	update_tg_load_avg(cfs_rq, false);
+	attach_entity_cfs_rq(se);
 }

 #else /* !CONFIG_SMP */
@@ -2890,6 +2934,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	return decayed;
 }

+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(_val) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+								\
+	res = var + val;					\
+								\
+	if (val < 0 && res > var)				\
+		res = 0;					\
+								\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
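/*
 * Small userspace sketch (illustrative only, not kernel code) of what
 * add_positive() above guarantees: the sum is clamped at zero when a
 * negative delta would underflow the unsigned target, and only the clamped
 * value is ever stored back.  The kernel macro is type-generic via typeof()
 * and uses READ_ONCE()/WRITE_ONCE(); plain accesses are enough for the demo.
 */
#include <stdio.h>

#define add_positive(_ptr, _val) do {			\
	unsigned long *ptr = (_ptr);			\
	long val = (_val);				\
	unsigned long res, var = *ptr;			\
							\
	res = var + val;				\
							\
	if (val < 0 && res > var)			\
		res = 0;				\
							\
	*ptr = res;					\
} while (0)

int main(void)
{
	unsigned long load_avg = 100;

	add_positive(&load_avg, -40);	/* 60 */
	printf("%lu\n", load_avg);
	add_positive(&load_avg, -200);	/* would underflow: clamped to 0 */
	printf("%lu\n", load_avg);
	return 0;
}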
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /**
  * update_tg_load_avg - update the tg's load avg
@@ -2969,8 +3033,138 @@ void set_task_rq_fair(struct sched_entity *se,
 		se->avg.last_update_time = n_last_update_time;
 	}
 }
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+	/* Nothing to update */
+	if (!delta)
+		return;
+
+	/* Set new sched_entity's utilization */
+	se->avg.util_avg = gcfs_rq->avg.util_avg;
+	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+	/* Update parent cfs_rq utilization */
+	add_positive(&cfs_rq->avg.util_avg, delta);
+	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
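/*
 * Worked example with made-up numbers (not from the patch): if a group
 * cfs_rq's util_avg has grown to 300 while the sched_entity representing it
 * at the parent level still carries 120, update_tg_cfs_util() copies 300
 * into the entity and adds the delta of 180 to the parent cfs_rq, so the
 * change becomes visible one level up without re-walking child entities.
 */
#include <stdio.h>

int main(void)
{
	long gcfs_util = 300;	/* group cfs_rq (child level) */
	long se_util   = 120;	/* entity representing the group at the parent */
	long cfs_util  = 500;	/* parent cfs_rq, still holding the stale 120 */

	long delta = gcfs_util - se_util;	/* 180 */

	se_util = gcfs_util;			/* entity mirrors its group */
	cfs_util += delta;			/* parent picks up the difference */

	printf("se=%ld parent=%ld\n", se_util, cfs_util);	/* se=300 parent=680 */
	return 0;
}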
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+	long delta, load = gcfs_rq->avg.load_avg;
+
+	/*
+	 * If the load of the group cfs_rq is zero, the load of the
+	 * sched_entity will also be zero, so we can skip the formula.
+	 */
+	if (load) {
+		long tg_load;
+
+		/* Get tg's load and ensure tg_load > 0 */
+		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+		/* Ensure tg_load >= load and is updated with the current load */
+		tg_load -= gcfs_rq->tg_load_avg_contrib;
+		tg_load += load;
+
+		/*
+		 * We need to compute a correction term in the case that the
+		 * task group is consuming more CPU than a task of equal
+		 * weight. A task with a weight equal to tg->shares will have
+		 * a load less than or equal to scale_load_down(tg->shares).
+		 * Similarly, the sched_entities that represent the task group
+		 * at the parent level can't have a load higher than
+		 * scale_load_down(tg->shares), and the sum of sched_entities'
+		 * load must be <= scale_load_down(tg->shares).
+		 */
+		if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+			/* scale gcfs_rq's load into tg's shares */
+			load *= scale_load_down(gcfs_rq->tg->shares);
+			load /= tg_load;
+		}
+	}
+
+	delta = load - se->avg.load_avg;
+
+	/* Nothing to update */
+	if (!delta)
+		return;
+
+	/* Set new sched_entity's load */
+	se->avg.load_avg = load;
+	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+	/* Update parent cfs_rq load */
+	add_positive(&cfs_rq->avg.load_avg, delta);
+	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+	/*
+	 * If the sched_entity is already enqueued, we also have to update the
+	 * runnable load avg.
+	 */
+	if (se->on_rq) {
+		/* Update parent cfs_rq runnable_load_avg */
+		add_positive(&cfs_rq->runnable_load_avg, delta);
+		cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+	}
+}
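/*
 * Worked example of the clamp above (illustrative numbers): a group with
 * tg->shares = 1024 whose cfs_rq on this CPU carries load_avg = 900, while
 * the whole tg has tg_load = 3000, may only contribute
 * 900 * 1024 / 3000 = 307 through its parent-level sched_entity, keeping
 * the sum of the group's per-CPU entities within scale_load_down(tg->shares).
 */
#include <stdio.h>

int main(void)
{
	long shares  = 1024;	/* scale_load_down(tg->shares) */
	long load    = 900;	/* this CPU's gcfs_rq->avg.load_avg */
	long tg_load = 3000;	/* tg-wide load, already >= load */

	if (tg_load > shares) {
		load *= shares;
		load /= tg_load;
	}
	printf("propagated load = %ld\n", load);	/* 307 */
	return 0;
}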
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+	if (!cfs_rq->propagate_avg)
+		return 0;
+
+	cfs_rq->propagate_avg = 0;
+	return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq;
+
+	if (entity_is_task(se))
+		return 0;
+
+	if (!test_and_clear_tg_cfs_propagate(se))
+		return 0;
+
+	cfs_rq = cfs_rq_of(se);
+
+	set_tg_cfs_propagate(cfs_rq);
+
+	update_tg_cfs_util(cfs_rq, se);
+	update_tg_cfs_load(cfs_rq, se);
+
+	return 1;
+}
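/*
 * Toy model (not kernel code) of how the propagate flag climbs the group
 * hierarchy: each update on a group entity consumes its child cfs_rq's flag,
 * folds the pending change into its own cfs_rq and flags it in turn, so a
 * change moves up one level per update until it reaches the root cfs_rq.
 */
#include <stdio.h>

#define LEVELS 3			/* 0 = root cfs_rq, 2 = deepest group */

static long load[LEVELS]      = { 400, 250, 100 };
static int  propagate[LEVELS] = { 0, 0, 0 };

/* A change happened on the deepest group: mark it pending */
static void child_changed(long delta)
{
	load[LEVELS - 1] += delta;
	propagate[LEVELS - 1] = 1;
}

/* Caricature of what update_load_avg() does for the entity of level 'lvl' */
static void update_level(int lvl, long delta)
{
	if (!propagate[lvl + 1])	/* test_and_clear on the child group */
		return;
	propagate[lvl + 1] = 0;
	load[lvl] += delta;		/* update_tg_cfs_load()/_util() */
	propagate[lvl] = 1;		/* set_tg_cfs_propagate() on our cfs_rq */
}

int main(void)
{
	child_changed(50);
	update_level(1, 50);		/* middle level picks it up */
	update_level(0, 50);		/* root picks it up on the next update */
	printf("root=%ld mid=%ld leaf=%ld\n", load[0], load[1], load[2]);
	return 0;
}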
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
+
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */

 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
@@ -3041,6 +3235,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 		sub_positive(&sa->load_avg, r);
 		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
+		set_tg_cfs_propagate(cfs_rq);
 	}

 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -3048,6 +3243,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 		sub_positive(&sa->util_avg, r);
 		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
+		set_tg_cfs_propagate(cfs_rq);
 	}

 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3064,23 +3260,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 	return decayed || removed_load;
 }

+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG	0x1
+#define SKIP_AGE_LOAD	0x2
+
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 now = cfs_rq_clock_task(cfs_rq);
 	struct rq *rq = rq_of(cfs_rq);
 	int cpu = cpu_of(rq);
+	int decayed;

 	/*
 	 * Track task load average for carrying it to new CPU after migrated, and
 	 * track group sched_entity load average for task_h_load calc in migration
 	 */
-	__update_load_avg(now, cpu, &se->avg,
+	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+		__update_load_avg(now, cpu, &se->avg,
 			  se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
+	}

-	if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
+	decayed |= propagate_entity_load_avg(se);
+
+	if (decayed && (flags & UPDATE_TG))
 		update_tg_load_avg(cfs_rq, 0);
 }

@@ -3094,31 +3302,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
  */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (!sched_feat(ATTACH_AGE_LOAD))
-		goto skip_aging;
-
-	/*
-	 * If we got migrated (either between CPUs or between cgroups) we'll
-	 * have aged the average right before clearing @last_update_time.
-	 *
-	 * Or we're fresh through post_init_entity_util_avg().
-	 */
-	if (se->avg.last_update_time) {
-		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-				  &se->avg, 0, 0, NULL);
-
-		/*
-		 * XXX: we could have just aged the entire load away if we've been
-		 * absent from the fair class for too long.
-		 */
-	}
-
-skip_aging:
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 	cfs_rq->avg.load_avg += se->avg.load_avg;
 	cfs_rq->avg.load_sum += se->avg.load_sum;
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
+	set_tg_cfs_propagate(cfs_rq);

 	cfs_rq_util_change(cfs_rq);
 }
@@ -3133,14 +3322,12 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
-			  cfs_rq->curr == se, NULL);

 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+	set_tg_cfs_propagate(cfs_rq);

 	cfs_rq_util_change(cfs_rq);
 }
@@ -3150,34 +3337,20 @@ static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct sched_avg *sa = &se->avg;
-	u64 now = cfs_rq_clock_task(cfs_rq);
-	int migrated, decayed;
-
-	migrated = !sa->last_update_time;
-	if (!migrated) {
-		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-			se->on_rq * scale_load_down(se->load.weight),
-			cfs_rq->curr == se, NULL);
-	}
-
-	decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);

 	cfs_rq->runnable_load_avg += sa->load_avg;
 	cfs_rq->runnable_load_sum += sa->load_sum;

-	if (migrated)
+	if (!sa->last_update_time) {
 		attach_entity_load_avg(cfs_rq, se);
-
-	if (decayed || migrated)
 		update_tg_load_avg(cfs_rq, 0);
+	}
 }

 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
 static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	update_load_avg(se, 1);
-
 	cfs_rq->runnable_load_avg =
 		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
 	cfs_rq->runnable_load_sum =
@@ -3205,6 +3378,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 }
 #endif

+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 last_update_time;
+
+	last_update_time = cfs_rq_last_update_time(cfs_rq);
+	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
 /*
  * Task first catches up with cfs_rq, and then subtract
  * itself from the cfs_rq (task must be off the queue now).
@@ -3212,7 +3398,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 void remove_entity_load_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 last_update_time;

 	/*
 	 * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3224,9 +3409,7 @@ void remove_entity_load_avg(struct sched_entity *se)
 	 * calls this.
 	 */

-	last_update_time = cfs_rq_last_update_time(cfs_rq);
-
-	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+	sync_entity_load_avg(se);
 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
@@ -3251,7 +3434,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 	return 0;
 }

-static inline void update_load_avg(struct sched_entity *se, int not_used)
+#define UPDATE_TG	0x0
+#define SKIP_AGE_LOAD	0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
 {
 	cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
 }
@@ -3396,6 +3582,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (renorm && !curr)
 		se->vruntime += cfs_rq->min_vruntime;

+	update_load_avg(se, UPDATE_TG);
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -3470,6 +3657,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_load_avg(se, UPDATE_TG);
 	dequeue_entity_load_avg(cfs_rq, se);

 	update_stats_dequeue(cfs_rq, se, flags);
@@ -3557,7 +3745,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
-		update_load_avg(se, 1);
+		update_load_avg(se, UPDATE_TG);
 	}

 	update_stats_curr_start(cfs_rq, se);
@@ -3675,7 +3863,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
-	update_load_avg(curr, 1);
+	update_load_avg(curr, UPDATE_TG);
 	update_cfs_shares(cfs_rq);

 #ifdef CONFIG_SCHED_HRTICK
@@ -4572,7 +4760,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;

-		update_load_avg(se, 1);
+		update_load_avg(se, UPDATE_TG);
 		update_cfs_shares(cfs_rq);
 	}

@@ -4631,7 +4819,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;

-		update_load_avg(se, 1);
+		update_load_avg(se, UPDATE_TG);
 		update_cfs_shares(cfs_rq);
 	}

@@ -5199,6 +5387,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return 1;
 }

+static inline int task_util(struct task_struct *p);
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+	return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -5208,15 +5404,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		  int this_cpu, int sd_flag)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
-	unsigned long min_load = ULONG_MAX, this_load = 0;
+	struct sched_group *most_spare_sg = NULL;
+	unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
+	unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+	unsigned long most_spare = 0, this_spare = 0;
 	int load_idx = sd->forkexec_idx;
-	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+				(sd->imbalance_pct-100) / 100;

 	if (sd_flag & SD_BALANCE_WAKE)
 		load_idx = sd->wake_idx;

 	do {
-		unsigned long load, avg_load;
+		unsigned long load, avg_load, runnable_load;
+		unsigned long spare_cap, max_spare_cap;
 		int local_group;
 		int i;

@@ -5228,8 +5430,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));

-		/* Tally up the load of all CPUs in the group */
+		/*
+		 * Tally up the load of all CPUs in the group and find
+		 * the group containing the CPU with most spare capacity.
+		 */
 		avg_load = 0;
+		runnable_load = 0;
+		max_spare_cap = 0;

 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
@@ -5238,22 +5445,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 			else
 				load = target_load(i, load_idx);

-			avg_load += load;
+			runnable_load += load;
+
+			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+			spare_cap = capacity_spare_wake(i, p);
+
+			if (spare_cap > max_spare_cap)
+				max_spare_cap = spare_cap;
 		}

 		/* Adjust by relative CPU capacity of the group */
-		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;
+		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;

 		if (local_group) {
-			this_load = avg_load;
-		} else if (avg_load < min_load) {
-			min_load = avg_load;
-			idlest = group;
+			this_runnable_load = runnable_load;
+			this_avg_load = avg_load;
+			this_spare = max_spare_cap;
+		} else {
+			if (min_runnable_load > (runnable_load + imbalance)) {
+				/*
+				 * The runnable load is significantly smaller
+				 * so we can pick this new cpu
+				 */
+				min_runnable_load = runnable_load;
+				min_avg_load = avg_load;
+				idlest = group;
+			} else if ((runnable_load < (min_runnable_load + imbalance)) &&
+				   (100*min_avg_load > imbalance_scale*avg_load)) {
+				/*
+				 * The runnable loads are close so take the
+				 * blocked load into account through avg_load.
+				 */
+				min_avg_load = avg_load;
+				idlest = group;
+			}
+
+			if (most_spare < max_spare_cap) {
+				most_spare = max_spare_cap;
+				most_spare_sg = group;
+			}
 		}
 	} while (group = group->next, group != sd->groups);

-	if (!idlest || 100*this_load < imbalance*min_load)
+	/*
+	 * The cross-over point between using spare capacity or least load
+	 * is too conservative for high utilization tasks on partially
+	 * utilized systems if we require spare_capacity > task_util(p),
+	 * so we allow for some task stuffing by using
+	 * spare_capacity > task_util(p)/2.
+	 *
+	 * Spare capacity can't be used for fork because the utilization has
+	 * not been set yet, we must first select a rq to compute the initial
+	 * utilization.
+	 */
+	if (sd_flag & SD_BALANCE_FORK)
+		goto skip_spare;
+
+	if (this_spare > task_util(p) / 2 &&
+	    imbalance_scale*this_spare > 100*most_spare)
+		return NULL;
+
+	if (most_spare > task_util(p) / 2)
+		return most_spare_sg;
+
+skip_spare:
+	if (!idlest)
+		return NULL;
+
+	if (min_runnable_load > (this_runnable_load + imbalance))
 		return NULL;
+
+	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+	    (100*this_avg_load < imbalance_scale*min_avg_load))
+		return NULL;
+
 	return idlest;
 }
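/*
 * Worked example of the new selection logic (illustrative numbers only,
 * treating scale_load_down(NICE_0_LOAD) as 1024): with imbalance_pct = 125
 * the absolute margin is imbalance = 1024 * (125 - 100) / 100 = 256 load
 * units, so a remote group only replaces the current choice outright when
 * its runnable load is more than 256 below the best seen so far; within
 * that band the blocked load (avg_load) breaks the tie, and the
 * spare-capacity path is preferred while a CPU has more than task_util(p)/2
 * capacity left.
 */
#include <stdio.h>

int main(void)
{
	unsigned long nice0_load = 1024, imbalance_pct = 125;
	unsigned long imbalance = nice0_load * (imbalance_pct - 100) / 100;

	unsigned long min_runnable_load = 2000;	/* best remote group so far */
	unsigned long runnable_load     = 1800;	/* candidate group */

	printf("imbalance = %lu\n", imbalance);			/* 256 */
	printf("clear winner? %d\n",
	       min_runnable_load > runnable_load + imbalance);	/* 0: 2000 is not > 2056 */
	return 0;
}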

@@ -5589,6 +5858,24 @@ static inline int task_util(struct task_struct *p)
 	return p->se.avg.util_avg;
 }

+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+	unsigned long util, capacity;
+
+	/* Task has no contribution or is new */
+	if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+		return cpu_util(cpu);
+
+	capacity = capacity_orig_of(cpu);
+	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+
+	return (util >= capacity) ? capacity : util;
+}
+
 /*
  * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
  * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
@@ -5607,6 +5894,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	if (max_cap - min_cap < max_cap >> 3)
 		return 0;

+	/* Bring task utilization in sync with prev_cpu */
+	sync_entity_load_avg(&p->se);
+
 	return min_cap * 1024 < task_util(p) * capacity_margin;
 }
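/*
 * Quick check of the wake_cap() test above with made-up numbers: on a
 * big.LITTLE system with min_cap = 446 and capacity_margin = 1280, a task
 * counts as "too big" for the little CPUs once its util_avg exceeds
 * 446 * 1024 / 1280 = 356, i.e. roughly 80% of the smallest capacity.
 */
#include <stdio.h>

int main(void)
{
	unsigned long min_cap = 446, capacity_margin = 1280;
	unsigned long task_util = 400;

	/* 1: 446*1024 = 456704 < 400*1280 = 512000, so WAKE_AFFINE is disabled */
	printf("%d\n", min_cap * 1024 < task_util * capacity_margin);
	return 0;
}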

@@ -6641,6 +6931,10 @@ static void update_blocked_averages(int cpu)

 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
 			update_tg_load_avg(cfs_rq, 0);
+
+		/* Propagate pending load changes to the parent */
+		if (cfs_rq->tg->se[cpu])
+			update_load_avg(cfs_rq->tg->se[cpu], 0);
 	}
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -6845,13 +7139,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)

 	cpu_rq(cpu)->cpu_capacity = capacity;
 	sdg->sgc->capacity = capacity;
+	sdg->sgc->min_capacity = capacity;
 }

 void update_group_capacity(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
-	unsigned long capacity;
+	unsigned long capacity, min_capacity;
 	unsigned long interval;

 	interval = msecs_to_jiffies(sd->balance_interval);
@@ -6864,6 +7159,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 	}

 	capacity = 0;
+	min_capacity = ULONG_MAX;

 	if (child->flags & SD_OVERLAP) {
 		/*
@@ -6888,11 +7184,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 			 */
 			if (unlikely(!rq->sd)) {
 				capacity += capacity_of(cpu);
-				continue;
+			} else {
+				sgc = rq->sd->groups->sgc;
+				capacity += sgc->capacity;
 			}

-			sgc = rq->sd->groups->sgc;
-			capacity += sgc->capacity;
+			min_capacity = min(capacity, min_capacity);
 		}
 	} else {
 		/*
@@ -6902,12 +7199,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu)

 		group = child->groups;
 		do {
-			capacity += group->sgc->capacity;
+			struct sched_group_capacity *sgc = group->sgc;
+
+			capacity += sgc->capacity;
+			min_capacity = min(sgc->min_capacity, min_capacity);
 			group = group->next;
 		} while (group != child->groups);
 	}

 	sdg->sgc->capacity = capacity;
+	sdg->sgc->min_capacity = min_capacity;
 }

 /*
@@ -6930,8 +7231,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
  * Something like:
  *
- * { 0 1 2 3 } { 4 5 6 7 }
- *     * * * *
+ *	{ 0 1 2 3 } { 4 5 6 7 }
+ *	        *     * * *
  *
  * If we were to balance group-wise we'd place two tasks in the first group and
 * two tasks in the second group. Clearly this is undesired as it will overload
@@ -7002,6 +7303,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 	return false;
 }

+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+	return sg->sgc->min_capacity * capacity_margin <
+						ref->sgc->min_capacity * 1024;
+}
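/*
 * Quick arithmetic check (made-up big.LITTLE capacities): with
 * capacity_margin = 1280, a group whose smallest CPU has min_capacity = 446
 * counts as "smaller" than a group of 1024-capacity CPUs, because
 * 446 * 1280 = 570880 < 1024 * 1024 = 1048576; two equal groups do not.
 */
#include <stdio.h>

static unsigned int capacity_margin = 1280;

static int group_smaller(unsigned long sg_min, unsigned long ref_min)
{
	return sg_min * capacity_margin < ref_min * 1024;
}

int main(void)
{
	printf("%d\n", group_smaller(446, 1024));	/* 1 */
	printf("%d\n", group_smaller(1024, 1024));	/* 0: 1310720 >= 1048576 */
	return 0;
}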
+
 static inline enum
 group_type group_classify(struct sched_group *group,
 			  struct sg_lb_stats *sgs)
@@ -7105,6 +7417,20 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (sgs->avg_load <= busiest->avg_load)
 		return false;

+	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+		goto asym_packing;
+
+	/*
+	 * Candidate sg has no more than one task per CPU and
+	 * has higher per-CPU capacity. Migrating tasks to less
+	 * capable CPUs may harm throughput. Maximize throughput,
+	 * power/energy consequences are not considered.
+	 */
+	if (sgs->sum_nr_running <= sgs->group_weight &&
+	    group_smaller_cpu_capacity(sds->local, sg))
+		return false;
+
+asym_packing:
 	/* This is the busiest node in its class. */
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
@@ -7113,16 +7439,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (env->idle == CPU_NOT_IDLE)
 		return true;
 	/*
-	 * ASYM_PACKING needs to move all the work to the lowest
-	 * numbered CPUs in the group, therefore mark all groups
-	 * higher than ourself as busy.
+	 * ASYM_PACKING needs to move all the work to the highest
+	 * priority CPUs in the group, therefore mark all groups
+	 * of lower priority than ourselves as busy.
 	 */
-	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
+	if (sgs->sum_nr_running &&
+	    sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
 		if (!sds->busiest)
 			return true;

-		/* Prefer to move from highest possible cpu's work */
-		if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
+		/* Prefer to move from the lowest priority cpu's work */
+		if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
+				      sg->asym_prefer_cpu))
 			return true;
 	}

@@ -7274,8 +7602,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 	if (!sds->busiest)
 		return 0;

-	busiest_cpu = group_first_cpu(sds->busiest);
-	if (env->dst_cpu > busiest_cpu)
+	busiest_cpu = sds->busiest->asym_prefer_cpu;
+	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
 		return 0;

 	env->imbalance = DIV_ROUND_CLOSEST(
@@ -7613,10 +7941,11 @@ static int need_active_balance(struct lb_env *env)

 		/*
 		 * ASYM_PACKING needs to force migrate tasks from busy but
-		 * higher numbered CPUs in order to pack all tasks in the
-		 * lowest numbered CPUs.
+		 * lower priority CPUs in order to pack all tasks in the
+		 * highest priority CPUs.
 		 */
-		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
+		if ((sd->flags & SD_ASYM_PACKING) &&
+		    sched_asym_prefer(env->dst_cpu, env->src_cpu))
 			return 1;
 	}

@@ -8465,7 +8794,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	unsigned long now = jiffies;
 	struct sched_domain_shared *sds;
 	struct sched_domain *sd;
-	int nr_busy, cpu = rq->cpu;
+	int nr_busy, i, cpu = rq->cpu;
 	bool kick = false;

 	if (unlikely(rq->idle_balance))
@@ -8516,12 +8845,18 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	}

 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
-	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
-				  sched_domain_span(sd)) < cpu)) {
-		kick = true;
-		goto unlock;
-	}
+	if (sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (i == cpu ||
+			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+				continue;

+			if (sched_asym_prefer(i, cpu)) {
+				kick = true;
+				goto unlock;
+			}
+		}
+	}
 unlock:
 	rcu_read_unlock();
 	return kick;
@@ -8687,32 +9022,45 @@ static inline bool vruntime_normalized(struct task_struct *p)
 	return false;
 }

-static void detach_task_cfs_rq(struct task_struct *p)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make them
+ * visible to the root.
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
-	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 now = cfs_rq_clock_task(cfs_rq);
+	struct cfs_rq *cfs_rq;

-	if (!vruntime_normalized(p)) {
-		/*
-		 * Fix up our vruntime so that the current sleep doesn't
-		 * cause 'unlimited' sleep bonus.
-		 */
-		place_entity(cfs_rq, se, 0);
-		se->vruntime -= cfs_rq->min_vruntime;
+	/* Start to propagate at parent */
+	se = se->parent;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+		update_load_avg(se, UPDATE_TG);
 	}
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);

 	/* Catch up with the cfs_rq and remove our load when we leave */
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	update_load_avg(se, 0);
 	detach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+	propagate_entity_cfs_rq(se);
 }

-static void attach_task_cfs_rq(struct task_struct *p)
+static void attach_entity_cfs_rq(struct sched_entity *se)
 {
-	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 now = cfs_rq_clock_task(cfs_rq);

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
@@ -8722,10 +9070,36 @@ static void attach_task_cfs_rq(struct task_struct *p)
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif

-	/* Synchronize task with its cfs_rq */
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	/* Synchronize entity with its cfs_rq */
+	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
 	attach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+	propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	if (!vruntime_normalized(p)) {
+		/*
+		 * Fix up our vruntime so that the current sleep doesn't
+		 * cause 'unlimited' sleep bonus.
+		 */
+		place_entity(cfs_rq, se, 0);
+		se->vruntime -= cfs_rq->min_vruntime;
+	}
+
+	detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	attach_entity_cfs_rq(se);

 	if (!vruntime_normalized(p))
 		se->vruntime += cfs_rq->min_vruntime;
@@ -8779,6 +9153,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->propagate_avg = 0;
+#endif
 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
@@ -8887,7 +9264,7 @@ void online_fair_sched_group(struct task_group *tg)
 		se = tg->se[i];

 		raw_spin_lock_irq(&rq->lock);
-		post_init_entity_util_avg(se);
+		attach_entity_cfs_rq(se);
 		sync_throttle(tg, i);
 		raw_spin_unlock_irq(&rq->lock);
 	}