@@ -3319,11 +3319,77 @@ void set_task_rq_fair(struct sched_entity *se,
         se->avg.last_update_time = n_last_update_time;
 }

-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ *   ge->avg == grq->avg                                                (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg          (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straightforward approach:
+ *
+ *   grq->avg.load_avg = grq->load.weight * grq->avg.running_avg        (3)
+ *
+ * And per (1) we have:
+ *
+ *   ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ *                      ge->load.weight * grq->avg.load_avg
+ *   ge->avg.load_avg = -----------------------------------             (4)
+ *                               grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as an
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ *   grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ *   d(runnable_avg) += se->avg.load_avg / se->load.weight
+ *                                                                      (5)
+ *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
 static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
         long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;

         /* Nothing to update */
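
To make the differential form (5) above concrete, here is a small worked example as a standalone C sketch. It is purely illustrative and not part of the patch: the variable names are invented, and LOAD_AVG_MAX is assumed to be the PELT series maximum used by kernels of this vintage (about 47742). The child's runnable sum is propagated unchanged, while the group entity rebuilds its load_avg from that sum and its own weight.

/* pelt_prop_example.c -- illustrative only, not kernel code */
#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX 47742	/* assumed PELT series maximum */

int main(void)
{
	/* child entity that just joined: runnable roughly half the time */
	int64_t d_runnable_sum = LOAD_AVG_MAX / 2;

	/* group entity (ge) representing the child cfs_rq at the parent */
	long ge_weight = 512;
	long ge_load_avg = 0;
	int64_t ge_load_sum = 0;

	/* ge->avg.load_avg += ge->load.weight * d(runnable_avg), per (5) */
	int64_t load_sum = (int64_t)ge_weight * d_runnable_sum;
	long load_avg = (long)(load_sum / LOAD_AVG_MAX);

	ge_load_sum += d_runnable_sum;	/* entity sums stay weight-free */
	ge_load_avg += load_avg;	/* the average carries the weight */

	printf("d_runnable_sum=%lld -> ge load_avg delta=%ld (now %ld)\n",
	       (long long)d_runnable_sum, load_avg, ge_load_avg);
	return 0;
}

With these invented numbers the group entity ends up with load_avg 256, i.e. half of its weight, which matches a child that is runnable half the time.
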
@@ -3339,102 +3405,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
         cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 }

-/* Take into account change of load of a child task group */
 static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-        long delta, load = gcfs_rq->avg.load_avg;
+        long runnable_sum = gcfs_rq->prop_runnable_sum;
+        long load_avg;
+        s64 load_sum;

-        /*
-         * If the load of group cfs_rq is null, the load of the
-         * sched_entity will also be null so we can skip the formula
-         */
-        if (load) {
-                long tg_load;
-
-                /* Get tg's load and ensure tg_load > 0 */
-                tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
-                /* Ensure tg_load >= load and updated with current load*/
-                tg_load -= gcfs_rq->tg_load_avg_contrib;
-                tg_load += load;
-
-                /*
-                 * We need to compute a correction term in the case that the
-                 * task group is consuming more CPU than a task of equal
-                 * weight. A task with a weight equals to tg->shares will have
-                 * a load less or equal to scale_load_down(tg->shares).
-                 * Similarly, the sched_entities that represent the task group
-                 * at parent level, can't have a load higher than
-                 * scale_load_down(tg->shares). And the Sum of sched_entities'
-                 * load must be <= scale_load_down(tg->shares).
-                 */
-                if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-                        /* scale gcfs_rq's load into tg's shares*/
-                        load *= scale_load_down(gcfs_rq->tg->shares);
-                        load /= tg_load;
-                }
-        }
+        if (!runnable_sum)
+                return;

-        delta = load - se->avg.load_avg;
+        gcfs_rq->prop_runnable_sum = 0;

-        /* Nothing to update */
-        if (!delta)
-                return;
+        load_sum = (s64)se_weight(se) * runnable_sum;
+        load_avg = div_s64(load_sum, LOAD_AVG_MAX);

-        /* Set new sched_entity's load */
-        se->avg.load_avg = load;
-        se->avg.load_sum = LOAD_AVG_MAX;
+        add_positive(&se->avg.load_sum, runnable_sum);
+        add_positive(&se->avg.load_avg, load_avg);

-        /* Update parent cfs_rq load */
-        add_positive(&cfs_rq->avg.load_avg, delta);
-        cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+        add_positive(&cfs_rq->avg.load_avg, load_avg);
+        add_positive(&cfs_rq->avg.load_sum, load_sum);

-        /*
-         * If the sched_entity is already enqueued, we also have to update the
-         * runnable load avg.
-         */
         if (se->on_rq) {
-                /* Update parent cfs_rq runnable_load_avg */
-                add_positive(&cfs_rq->runnable_load_avg, delta);
-                cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+                add_positive(&cfs_rq->runnable_load_avg, load_avg);
+                add_positive(&cfs_rq->runnable_load_sum, load_sum);
         }
 }

-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
 {
-        cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
-{
-        struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
-        if (!cfs_rq->propagate_avg)
-                return 0;
-
-        cfs_rq->propagate_avg = 0;
-        return 1;
+        cfs_rq->propagate = 1;
+        cfs_rq->prop_runnable_sum += runnable_sum;
 }

 /* Update task and its cfs_rq load average */
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
-        struct cfs_rq *cfs_rq;
+        struct cfs_rq *cfs_rq, *gcfs_rq;

         if (entity_is_task(se))
                 return 0;

-        if (!test_and_clear_tg_cfs_propagate(se))
+        gcfs_rq = group_cfs_rq(se);
+        if (!gcfs_rq->propagate)
                 return 0;

+        gcfs_rq->propagate = 0;
+
         cfs_rq = cfs_rq_of(se);

-        set_tg_cfs_propagate(cfs_rq);
+        add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);

-        update_tg_cfs_util(cfs_rq, se);
-        update_tg_cfs_load(cfs_rq, se);
+        update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+        update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);

         return 1;
 }
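
For reference, the arithmetic in the new update_tg_cfs_runnable() above can be modelled in a few lines of plain C. This is a rough standalone sketch under stated assumptions, not the kernel implementation: add_positive_l() below merely stands in for the kernel's add_positive() helper (apply a signed delta without letting the value go negative), the LOAD_AVG_MAX value and all figures are invented for illustration. It shows a negative prop_runnable_sum (a child left the group runqueue) being turned into load_sum/load_avg deltas via the group entity's weight.

/* runnable_prop_model.c -- rough model of update_tg_cfs_runnable(), illustrative only */
#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX 47742	/* assumed PELT series maximum */

/* stand-in for the kernel's add_positive(): apply a signed delta, clamp at 0 */
static void add_positive_l(int64_t *ptr, int64_t val)
{
	int64_t res = *ptr + val;
	*ptr = res < 0 ? 0 : res;
}

int main(void)
{
	int64_t prop_runnable_sum = -20000;	/* a child entity left the group rq */
	int64_t se_weight = 512;		/* weight of the group entity */

	int64_t se_load_sum = 30000, se_load_avg = 320;
	int64_t rq_load_avg = 900, rq_load_sum = 900LL * LOAD_AVG_MAX;

	if (prop_runnable_sum) {
		int64_t load_sum = se_weight * prop_runnable_sum;
		int64_t load_avg = load_sum / LOAD_AVG_MAX;

		/* group entity: weight-free sum, weighted average */
		add_positive_l(&se_load_sum, prop_runnable_sum);
		add_positive_l(&se_load_avg, load_avg);

		/* parent runqueue: both sum and average keep the weight */
		add_positive_l(&rq_load_avg, load_avg);
		add_positive_l(&rq_load_sum, load_sum);
	}

	printf("se: sum=%lld avg=%lld | rq: avg=%lld sum=%lld\n",
	       (long long)se_load_sum, (long long)se_load_avg,
	       (long long)rq_load_avg, (long long)rq_load_sum);
	return 0;
}
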
@@ -3458,7 +3481,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
          * If there is a pending propagation, we have to update the load and
          * the utilization of the sched_entity:
          */
-        if (gcfs_rq->propagate_avg)
+        if (gcfs_rq->propagate)
                 return false;

         /*
@@ -3478,7 +3501,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
         return 0;
 }

-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}

 #endif /* CONFIG_FAIR_GROUP_SCHED */

@@ -3501,7 +3524,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
-        unsigned long removed_load = 0, removed_util = 0;
+        unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
         struct sched_avg *sa = &cfs_rq->avg;
         int decayed = 0;

@@ -3511,6 +3534,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                 raw_spin_lock(&cfs_rq->removed.lock);
                 swap(cfs_rq->removed.util_avg, removed_util);
                 swap(cfs_rq->removed.load_avg, removed_load);
+                swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
                 cfs_rq->removed.nr = 0;
                 raw_spin_unlock(&cfs_rq->removed.lock);

@@ -3526,7 +3550,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                 sub_positive(&sa->util_avg, r);
                 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);

-                set_tg_cfs_propagate(cfs_rq);
+                add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);

                 decayed = 1;
         }
@@ -3558,7 +3582,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         enqueue_load_avg(cfs_rq, se);
         cfs_rq->avg.util_avg += se->avg.util_avg;
         cfs_rq->avg.util_sum += se->avg.util_sum;
-        set_tg_cfs_propagate(cfs_rq);
+
+        add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);

         cfs_rq_util_change(cfs_rq);
 }
@@ -3576,7 +3601,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         dequeue_load_avg(cfs_rq, se);
         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
-        set_tg_cfs_propagate(cfs_rq);
+
+        add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);

         cfs_rq_util_change(cfs_rq);
 }
@@ -3678,6 +3704,7 @@ void remove_entity_load_avg(struct sched_entity *se)
         ++cfs_rq->removed.nr;
         cfs_rq->removed.util_avg += se->avg.util_avg;
         cfs_rq->removed.load_avg += se->avg.load_avg;
+        cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
         raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }

@@ -9466,9 +9493,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
-        cfs_rq->propagate_avg = 0;
-#endif
         raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
 }