@@ -73,10 +73,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
 void init_rt_rq(struct rt_rq *rt_rq)
 {
 	struct rt_prio_array *array;
@@ -96,13 +92,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
 	plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
-	rt_rq->push_flags = 0;
-	rt_rq->push_cpu = nr_cpu_ids;
-	raw_spin_lock_init(&rt_rq->push_lock);
-	init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
 #endif /* CONFIG_SMP */
 	/* We start is dequeued state, because no RT tasks are queued */
 	rt_rq->rt_queued = 0;
@@ -1875,241 +1864,166 @@ static void push_rt_tasks(struct rq *rq)
 }
 
 #ifdef HAVE_RT_PUSH_IPI
+
 /*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if it's the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tasks must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rto_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * rto_loop_next will cause the iterator to perform another scan.
  *
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
  */
 static int rto_next_cpu(struct rq *rq)
 {
-	int prev_cpu = rq->rt.push_cpu;
+	struct root_domain *rd = rq->rd;
+	int next;
 	int cpu;
 
-	cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
 	/*
-	 * If the previous cpu is less than the rq's CPU, then it already
-	 * passed the end of the mask, and has started from the beginning.
-	 * We end if the next CPU is greater or equal to rq's CPU.
+	 * When starting the IPI RT pushing, the rto_cpu is set to -1,
+	 * so rto_next_cpu() will simply return the first CPU found in
+	 * the rto_mask.
+	 *
+	 * If rto_next_cpu() is called with rto_cpu as a valid CPU, it
+	 * will return the next CPU found in the rto_mask.
+	 *
+	 * If there are no more CPUs left in the rto_mask, then a check is made
+	 * against rto_loop and rto_loop_next. rto_loop is only updated with
+	 * the rto_lock held, but any CPU may increment the rto_loop_next
+	 * without any locking.
 	 */
-	if (prev_cpu < rq->cpu) {
-		if (cpu >= rq->cpu)
-			return nr_cpu_ids;
+	for (;;) {
 
-	} else if (cpu >= nr_cpu_ids) {
-		/*
-		 * We passed the end of the mask, start at the beginning.
-		 * If the result is greater or equal to the rq's CPU, then
-		 * the loop is finished.
-		 */
-		cpu = cpumask_first(rq->rd->rto_mask);
-		if (cpu >= rq->cpu)
-			return nr_cpu_ids;
-	}
-	rq->rt.push_cpu = cpu;
+		/* When rto_cpu is -1 this acts like cpumask_first() */
+		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
 
-	/* Return cpu to let the caller know if the loop is finished or not */
-	return cpu;
-}
+		rd->rto_cpu = cpu;
 
-static int find_next_push_cpu(struct rq *rq)
-{
-	struct rq *next_rq;
-	int cpu;
+		if (cpu < nr_cpu_ids)
+			return cpu;
 
-	while (1) {
-		cpu = rto_next_cpu(rq);
-		if (cpu >= nr_cpu_ids)
-			break;
-		next_rq = cpu_rq(cpu);
+		rd->rto_cpu = -1;
+
+		/*
+		 * ACQUIRE ensures we see the @rto_mask changes
+		 * made prior to the @next value observed.
+		 *
+		 * Matches WMB in rt_set_overload().
+		 */
+		next = atomic_read_acquire(&rd->rto_loop_next);
 
-		/* Make sure the next rq can push to this rq */
-		if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+		if (rd->rto_loop == next)
 			break;
+
+		rd->rto_loop = next;
 	}
 
-	return cpu;
+	return -1;
 }
 
-#define RT_PUSH_IPI_EXECUTING		1
-#define RT_PUSH_IPI_RESTART		2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+	return !atomic_cmpxchg_acquire(v, 0, 1);
+}
 
-/*
- * When a high priority task schedules out from a CPU and a lower priority
- * task is scheduled in, a check is made to see if there's any RT tasks
- * on other CPUs that are waiting to run because a higher priority RT task
- * is currently running on its CPU. In this case, the CPU with multiple RT
- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- * up that may be able to run one of its non-running queued RT tasks.
- *
- * On large CPU boxes, there's the case that several CPUs could schedule
- * a lower priority task at the same time, in which case it will look for
- * any overloaded CPUs that it could pull a task from. To do this, the runqueue
- * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
- * for a single overloaded CPU's runqueue lock can produce a large latency.
- * (This has actually been observed on large boxes running cyclictest).
- * Instead of taking the runqueue lock of the overloaded CPU, each of the
- * CPUs that scheduled a lower priority task simply sends an IPI to the
- * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
- * lots of contention. The overloaded CPU will look to push its non-running
- * RT task off, and if it does, it can then ignore the other IPIs coming
- * in, and just pass those IPIs off to any other overloaded CPU.
- *
- * When a CPU schedules a lower priority task, it only sends an IPI to
- * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
- * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
- * RT overloaded tasks, would cause 100 IPIs to go out at once.
- *
- * The overloaded RT CPU, when receiving an IPI, will try to push off its
- * overloaded RT tasks and then send an IPI to the next CPU that has
- * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
- * have completed. Just because a CPU may have pushed off its own overloaded
- * RT task does not mean it should stop sending the IPI around to other
- * overloaded CPUs. There may be another RT task waiting to run on one of
- * those CPUs that are of higher priority than the one that was just
- * pushed.
- *
- * An optimization that could possibly be made is to make a CPU array similar
- * to the cpupri array mask of all running RT tasks, but for the overloaded
- * case, then the IPI could be sent to only the CPU with the highest priority
- * RT task waiting, and that CPU could send off further IPIs to the CPU with
- * the next highest waiting task. Since the overloaded case is much less likely
- * to happen, the complexity of this implementation may not be worth it.
- * Instead, just send an IPI around to all overloaded CPUs.
- *
- * The rq->rt.push_flags holds the status of the IPI that is going around.
- * A run queue can only send out a single IPI at a time. The possible flags
- * for rq->rt.push_flags are:
- *
- *    (None or zero):		No IPI is going around for the current rq
- *    RT_PUSH_IPI_EXECUTING:	An IPI for the rq is being passed around
- *    RT_PUSH_IPI_RESTART:	The priority of the running task for the rq
- *				has changed, and the IPI should restart
- *				circulating the overloaded CPUs again.
- *
- * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
- * before sending to the next CPU.
- *
- * Instead of having all CPUs that schedule a lower priority task send
- * an IPI to the same "first" CPU in the RT overload mask, they send it
- * to the next overloaded CPU after their own CPU. This helps distribute
- * the work when there's more than one overloaded CPU and multiple CPUs
- * scheduling in lower priority tasks.
- *
- * When a rq schedules a lower priority task than what was currently
- * running, the next CPU with overloaded RT tasks is examined first.
- * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
- * priority task, it will send an IPI first to CPU 5, then CPU 5 will
- * send to CPU 1 if it is still overloaded. CPU 1 will clear the
- * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
- *
- * The first CPU to notice IPI_RESTART is set, will clear that flag and then
- * send an IPI to the next overloaded CPU after the rq->cpu and not the next
- * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
- * schedules a lower priority task, and the IPI_RESTART gets set while the
- * handling is being done on CPU 5, it will clear the flag and send it back to
- * CPU 4 instead of CPU 1.
- *
- * Note, the above logic can be disabled by turning off the sched_feature
- * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
- * taken by the CPU requesting a pull and the waiting RT task will be pulled
- * by that CPU. This may be fine for machines with few CPUs.
- */
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
 {
-	int cpu;
+	atomic_set_release(v, 0);
+}
 
-	if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
-		raw_spin_lock(&rq->rt.push_lock);
-		/* Make sure it's still executing */
-		if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
-			/*
-			 * Tell the IPI to restart the loop as things have
-			 * changed since it started.
-			 */
-			rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
-			raw_spin_unlock(&rq->rt.push_lock);
-			return;
-		}
-		raw_spin_unlock(&rq->rt.push_lock);
-	}
+static void tell_cpu_to_push(struct rq *rq)
+{
+	int cpu = -1;
 
-	/* When here, there's no IPI going around */
+	/* Keep the loop going if the IPI is currently active */
+	atomic_inc(&rq->rd->rto_loop_next);
 
-	rq->rt.push_cpu = rq->cpu;
-	cpu = find_next_push_cpu(rq);
-	if (cpu >= nr_cpu_ids)
+	/* Only one CPU can initiate a loop at a time */
+	if (!rto_start_trylock(&rq->rd->rto_loop_start))
 		return;
 
-	rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+	raw_spin_lock(&rq->rd->rto_lock);
+
+	/*
+	 * The rto_cpu is updated under the lock; if it has a valid CPU
+	 * then the IPI is still running and will continue due to the
+	 * update to loop_next, and nothing needs to be done here.
+	 * Otherwise it is finishing up and an IPI needs to be sent.
+	 */
+	if (rq->rd->rto_cpu < 0)
+		cpu = rto_next_cpu(rq);
 
-	irq_work_queue_on(&rq->rt.push_work, cpu);
+	raw_spin_unlock(&rq->rd->rto_lock);
+
+	rto_start_unlock(&rq->rd->rto_loop_start);
+
+	if (cpu >= 0)
+		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }
 
 /* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
 {
-	struct rt_rq *rt_rq = arg;
-	struct rq *rq, *src_rq;
-	int this_cpu;
+	struct rq *rq;
 	int cpu;
 
-	this_cpu = rt_rq->push_cpu;
+	rq = this_rq();
 
-	/* Paranoid check */
-	BUG_ON(this_cpu != smp_processor_id());
-
-	rq = cpu_rq(this_cpu);
-	src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+	/*
+	 * We do not need to grab the lock to check for has_pushable_tasks.
+	 * When it gets updated, a check is made if a push is possible.
+	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_task(rq);
+		push_rt_tasks(rq);
 		raw_spin_unlock(&rq->lock);
 	}
 
-	/* Pass the IPI to the next rt overloaded queue */
-	raw_spin_lock(&rt_rq->push_lock);
-	/*
-	 * If the source queue changed since the IPI went out,
-	 * we need to restart the search from that CPU again.
-	 */
-	if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
-		rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
-		rt_rq->push_cpu = src_rq->cpu;
-	}
+	raw_spin_lock(&rq->rd->rto_lock);
 
-	cpu = find_next_push_cpu(src_rq);
+	/* Pass the IPI to the next rt overloaded queue */
+	cpu = rto_next_cpu(rq);
 
-	if (cpu >= nr_cpu_ids)
-		rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
-	raw_spin_unlock(&rt_rq->push_lock);
+	raw_spin_unlock(&rq->rd->rto_lock);
 
-	if (cpu >= nr_cpu_ids)
+	if (cpu < 0)
 		return;
 
-	/*
-	 * It is possible that a restart caused this CPU to be
-	 * chosen again. Don't bother with an IPI, just see if we
-	 * have more to push.
-	 */
-	if (unlikely(cpu == rq->cpu))
-		goto again;
-
 	/* Try the next RT overloaded CPU */
-	irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
-	struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
-	try_to_push_tasks(rt_rq);
+	irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }
 #endif /* HAVE_RT_PUSH_IPI */
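
For readers who want to poke at the new bookkeeping (rto_loop_start, rto_loop_next, rto_loop, rto_cpu), here is a minimal userspace sketch of the same state machine. It is not kernel code: C11 atomics, a 64-bit mask and direct function calls stand in for atomic_t, the rto_mask cpumask, rd->rto_lock and the irq_work/IPI plumbing, and every name in it (struct rto_model, NR_CPUS, start_trylock(), next_rto_cpu(), push_work(), lower_prio_on_some_cpu()) is invented for the sketch. Being single-threaded, it also omits the rto_lock serialization the real code depends on.

/*
 * Userspace sketch of the per-root-domain bookkeeping added by this patch.
 * Build: cc -std=c11 -o rto_model rto_model.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

struct rto_model {
	unsigned long rto_mask;		/* bit n set: CPU n is RT overloaded */
	int rto_cpu;			/* iterator position, -1 when idle */
	int rto_loop;			/* generation the iterator has handled */
	atomic_int rto_loop_next;	/* bumped by every CPU lowering prio */
	atomic_int rto_loop_start;	/* 0/1 flag guarding loop start */
};

/* Like rto_start_trylock(): succeed only if we flip 0 -> 1. */
static bool start_trylock(atomic_int *v)
{
	int zero = 0;
	return atomic_compare_exchange_strong(v, &zero, 1);
}

/* Like rto_next_cpu(): next overloaded CPU, or -1 once a full pass
 * ends with no new rto_loop_next increment. */
static int next_rto_cpu(struct rto_model *rd)
{
	for (;;) {
		int cpu;

		/* cpumask_next(rd->rto_cpu, ...): first set bit after rto_cpu */
		for (cpu = rd->rto_cpu + 1; cpu < NR_CPUS; cpu++)
			if (rd->rto_mask & (1UL << cpu))
				break;

		rd->rto_cpu = cpu;
		if (cpu < NR_CPUS)
			return cpu;

		rd->rto_cpu = -1;

		/* kernel: atomic_read_acquire(), paired with a write barrier */
		int next = atomic_load(&rd->rto_loop_next);
		if (rd->rto_loop == next)
			break;
		rd->rto_loop = next;	/* someone asked for another pass */
	}
	return -1;
}

/* Like rto_push_irq_work_func(): "runs on" @cpu, then forwards itself. */
static void push_work(struct rto_model *rd, int cpu)
{
	printf("push work runs on CPU %d\n", cpu);	/* push_rt_tasks() here */

	cpu = next_rto_cpu(rd);
	if (cpu >= 0)
		push_work(rd, cpu);	/* kernel: irq_work_queue_on(cpu) */
}

/* Like tell_cpu_to_push(): a CPU just started running a lower prio task. */
static void lower_prio_on_some_cpu(struct rto_model *rd)
{
	int cpu = -1;

	atomic_fetch_add(&rd->rto_loop_next, 1);

	if (!start_trylock(&rd->rto_loop_start))
		return;		/* someone else is already starting the loop */

	if (rd->rto_cpu < 0)	/* no iterator in flight: pick the first target */
		cpu = next_rto_cpu(rd);

	atomic_store(&rd->rto_loop_start, 0);	/* rto_start_unlock() */

	if (cpu >= 0)
		push_work(rd, cpu);
}

int main(void)
{
	struct rto_model rd = {
		.rto_mask = (1UL << 1) | (1UL << 5),	/* CPUs 1 and 5 overloaded */
		.rto_cpu  = -1,
	};

	lower_prio_on_some_cpu(&rd);	/* e.g. CPU 3 just lowered its priority */
	return 0;
}

One thing the sketch makes visible: the initiating call's own increment of rto_loop_next leaves rto_loop one generation behind, so the iterator finishes with an extra confirming pass over the mask before next_rto_cpu() returns -1. As far as this patch's logic goes, the kernel does the same, which looks harmless since the irq work only takes the rq lock when has_pushable_tasks() is true.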
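
The "Matches WMB in rt_set_overload()" comment above atomic_read_acquire() is the usual publish/observe pattern: the side that marks a CPU as RT overloaded makes its rto_mask update visible before the counter value the iterator later reads, so an iterator that observes a new generation also observes the mask bits set before it. The sketch below illustrates only that ordering guarantee, under stated assumptions: a single C11 release increment stands in for the kernel's smp_wmb() plus atomic_inc(), an acquire load stands in for atomic_read_acquire(), and the variable and thread names are made up for the demo.

/*
 * Userspace illustration of the release/acquire pairing.
 * Build: cc -std=c11 -pthread -o ordering_demo ordering_demo.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong demo_rto_mask;	/* one bit per "CPU" */
static atomic_int demo_loop_next;	/* generation counter */

static void *overload_publisher(void *arg)
{
	(void)arg;

	/* Step 1: mark CPU 5 as RT overloaded (a relaxed store is enough). */
	atomic_fetch_or_explicit(&demo_rto_mask, 1UL << 5, memory_order_relaxed);

	/* Step 2: publish - release orders the mask update before the bump. */
	atomic_fetch_add_explicit(&demo_loop_next, 1, memory_order_release);
	return NULL;
}

static void *iterator(void *arg)
{
	(void)arg;

	/* Wait until a new generation is visible (acquire pairs with release). */
	while (atomic_load_explicit(&demo_loop_next, memory_order_acquire) == 0)
		;

	/* Guaranteed by release/acquire: the bit set before the bump is seen. */
	unsigned long mask = atomic_load_explicit(&demo_rto_mask,
						  memory_order_relaxed);
	printf("mask after observing new generation: %#lx\n", mask);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, iterator, NULL);
	pthread_create(&b, NULL, overload_publisher, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

If both accesses were made relaxed instead, the reader could in principle observe the new counter while still seeing a stale mask, which is exactly the window the kernel comment is guarding against.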
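
To tie the pieces together, a worked trace of the flow the patch implements (the CPU numbers follow the example in the comment being removed; the interleaving is illustrative):

   CPU 3 lowers its priority:   rto_loop_next 0 -> 1, wins rto_loop_start,
                                rto_cpu -1 -> 1, queues the irq work on CPU 1
   irq work on CPU 1:           push_rt_tasks(), rto_cpu 1 -> 5, requeues on CPU 5
   CPU 7 lowers its priority:   rto_loop_next 1 -> 2; it sees rto_cpu == 5
                                (or loses the rto_loop_start trylock) and
                                queues nothing itself
   irq work on CPU 5:           push_rt_tasks(); rto_mask exhausted, but
                                rto_loop (0) != rto_loop_next (2), so
                                rto_loop = 2 and the scan restarts at CPU 1
   irq work on CPUs 1, 5:       one more pass; on the next exhaustion
                                rto_loop == rto_loop_next, rto_next_cpu()
                                returns -1 and the chain stops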