@@ -73,10 +73,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
 void init_rt_rq(struct rt_rq *rt_rq)
 {
 	struct rt_prio_array *array;
@@ -96,13 +92,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
 	plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
-	rt_rq->push_flags = 0;
-	rt_rq->push_cpu = nr_cpu_ids;
-	raw_spin_lock_init(&rt_rq->push_lock);
-	init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
 #endif /* CONFIG_SMP */
 	/* We start is dequeued state, because no RT tasks are queued */
 	rt_rq->rt_queued = 0;
@@ -1875,241 +1864,166 @@ static void push_rt_tasks(struct rq *rq)
 }
 
 #ifdef HAVE_RT_PUSH_IPI
+
 /*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if it's the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tasks must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rto_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * rto_loop_next will cause the iterator to perform another scan.
  *
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
  */
 static int rto_next_cpu(struct rq *rq)
 {
-	int prev_cpu = rq->rt.push_cpu;
+	struct root_domain *rd = rq->rd;
+	int next;
 	int cpu;
 
-	cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
 	/*
-	 * If the previous cpu is less than the rq's CPU, then it already
-	 * passed the end of the mask, and has started from the beginning.
-	 * We end if the next CPU is greater or equal to rq's CPU.
+	 * When starting the IPI RT pushing, the rto_cpu is set to -1,
+	 * so rto_next_cpu() will simply return the first CPU found in
+	 * the rto_mask.
+	 *
+	 * If rto_next_cpu() is called with rto_cpu as a valid CPU, it
+	 * will return the next CPU found in the rto_mask.
+	 *
+	 * If there are no more CPUs left in the rto_mask, then a check is made
+	 * against rto_loop and rto_loop_next. rto_loop is only updated with
+	 * the rto_lock held, but any CPU may increment the rto_loop_next
+	 * without any locking.
 	 */
-	if (prev_cpu < rq->cpu) {
-		if (cpu >= rq->cpu)
-			return nr_cpu_ids;
+	for (;;) {
 
-	} else if (cpu >= nr_cpu_ids) {
-		/*
-		 * We passed the end of the mask, start at the beginning.
-		 * If the result is greater or equal to the rq's CPU, then
-		 * the loop is finished.
-		 */
-		cpu = cpumask_first(rq->rd->rto_mask);
-		if (cpu >= rq->cpu)
-			return nr_cpu_ids;
-	}
-	rq->rt.push_cpu = cpu;
+		/* When rto_cpu is -1 this acts like cpumask_first() */
+		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
 
-	/* Return cpu to let the caller know if the loop is finished or not */
-	return cpu;
-}
+		rd->rto_cpu = cpu;
 
-static int find_next_push_cpu(struct rq *rq)
-{
-	struct rq *next_rq;
-	int cpu;
+		if (cpu < nr_cpu_ids)
+			return cpu;
 
-	while (1) {
-		cpu = rto_next_cpu(rq);
-		if (cpu >= nr_cpu_ids)
-			break;
-		next_rq = cpu_rq(cpu);
+		rd->rto_cpu = -1;
+
+		/*
+		 * ACQUIRE ensures we see the @rto_mask changes
+		 * made prior to the @next value observed.
+		 *
+		 * Matches WMB in rt_set_overload().
+		 */
+		next = atomic_read_acquire(&rd->rto_loop_next);
 
-		/* Make sure the next rq can push to this rq */
-		if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+		if (rd->rto_loop == next)
 			break;
+
+		rd->rto_loop = next;
 	}
 
-	return cpu;
+	return -1;
 }
 
-#define RT_PUSH_IPI_EXECUTING		1
-#define RT_PUSH_IPI_RESTART		2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+	return !atomic_cmpxchg_acquire(v, 0, 1);
+}
 
-/*
- * When a high priority task schedules out from a CPU and a lower priority
- * task is scheduled in, a check is made to see if there's any RT tasks
- * on other CPUs that are waiting to run because a higher priority RT task
- * is currently running on its CPU. In this case, the CPU with multiple RT
- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- * up that may be able to run one of its non-running queued RT tasks.
- *
- * On large CPU boxes, there's the case that several CPUs could schedule
- * a lower priority task at the same time, in which case it will look for
- * any overloaded CPUs that it could pull a task from. To do this, the runqueue
- * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
- * for a single overloaded CPU's runqueue lock can produce a large latency.
- * (This has actually been observed on large boxes running cyclictest).
- * Instead of taking the runqueue lock of the overloaded CPU, each of the
- * CPUs that scheduled a lower priority task simply sends an IPI to the
- * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
- * lots of contention. The overloaded CPU will look to push its non-running
- * RT task off, and if it does, it can then ignore the other IPIs coming
- * in, and just pass those IPIs off to any other overloaded CPU.
- *
- * When a CPU schedules a lower priority task, it only sends an IPI to
- * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
- * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
- * RT overloaded tasks, would cause 100 IPIs to go out at once.
- *
- * The overloaded RT CPU, when receiving an IPI, will try to push off its
- * overloaded RT tasks and then send an IPI to the next CPU that has
- * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
- * have completed. Just because a CPU may have pushed off its own overloaded
- * RT task does not mean it should stop sending the IPI around to other
- * overloaded CPUs. There may be another RT task waiting to run on one of
- * those CPUs that are of higher priority than the one that was just
- * pushed.
- *
- * An optimization that could possibly be made is to make a CPU array similar
- * to the cpupri array mask of all running RT tasks, but for the overloaded
- * case, then the IPI could be sent to only the CPU with the highest priority
- * RT task waiting, and that CPU could send off further IPIs to the CPU with
- * the next highest waiting task. Since the overloaded case is much less likely
- * to happen, the complexity of this implementation may not be worth it.
- * Instead, just send an IPI around to all overloaded CPUs.
- *
- * The rq->rt.push_flags holds the status of the IPI that is going around.
- * A run queue can only send out a single IPI at a time. The possible flags
- * for rq->rt.push_flags are:
- *
- *    (None or zero):		No IPI is going around for the current rq
- *    RT_PUSH_IPI_EXECUTING:	An IPI for the rq is being passed around
- *    RT_PUSH_IPI_RESTART:	The priority of the running task for the rq
- *				has changed, and the IPI should restart
- *				circulating the overloaded CPUs again.
- *
- * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
- * before sending to the next CPU.
- *
- * Instead of having all CPUs that schedule a lower priority task send
- * an IPI to the same "first" CPU in the RT overload mask, they send it
- * to the next overloaded CPU after their own CPU. This helps distribute
- * the work when there's more than one overloaded CPU and multiple CPUs
- * scheduling in lower priority tasks.
- *
- * When a rq schedules a lower priority task than what was currently
- * running, the next CPU with overloaded RT tasks is examined first.
- * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
- * priority task, it will send an IPI first to CPU 5, then CPU 5 will
- * send to CPU 1 if it is still overloaded. CPU 1 will clear the
- * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
- *
- * The first CPU to notice IPI_RESTART is set, will clear that flag and then
- * send an IPI to the next overloaded CPU after the rq->cpu and not the next
- * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
- * schedules a lower priority task, and the IPI_RESTART gets set while the
- * handling is being done on CPU 5, it will clear the flag and send it back to
- * CPU 4 instead of CPU 1.
- *
- * Note, the above logic can be disabled by turning off the sched_feature
- * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
- * taken by the CPU requesting a pull and the waiting RT task will be pulled
- * by that CPU. This may be fine for machines with few CPUs.
- */
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
 {
-	int cpu;
+	atomic_set_release(v, 0);
+}
 
-	if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
-		raw_spin_lock(&rq->rt.push_lock);
-		/* Make sure it's still executing */
-		if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
-			/*
-			 * Tell the IPI to restart the loop as things have
-			 * changed since it started.
-			 */
-			rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
-			raw_spin_unlock(&rq->rt.push_lock);
-			return;
-		}
-		raw_spin_unlock(&rq->rt.push_lock);
-	}
+static void tell_cpu_to_push(struct rq *rq)
+{
+	int cpu = -1;
 
-	/* When here, there's no IPI going around */
+	/* Keep the loop going if the IPI is currently active */
+	atomic_inc(&rq->rd->rto_loop_next);
 
-	rq->rt.push_cpu = rq->cpu;
-	cpu = find_next_push_cpu(rq);
-	if (cpu >= nr_cpu_ids)
+	/* Only one CPU can initiate a loop at a time */
+	if (!rto_start_trylock(&rq->rd->rto_loop_start))
 		return;
 
-	rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+	raw_spin_lock(&rq->rd->rto_lock);
+
+	/*
+	 * The rto_cpu is updated under the lock; if it has a valid CPU
+	 * then the IPI is still running and will continue due to the
+	 * update to loop_next, and nothing needs to be done here.
+	 * Otherwise it is finishing up and an IPI needs to be sent.
+	 */
+	if (rq->rd->rto_cpu < 0)
+		cpu = rto_next_cpu(rq);
 
-	irq_work_queue_on(&rq->rt.push_work, cpu);
+	raw_spin_unlock(&rq->rd->rto_lock);
+
+	rto_start_unlock(&rq->rd->rto_loop_start);
+
+	if (cpu >= 0)
+		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }
 
 /* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
 {
-	struct rt_rq *rt_rq = arg;
-	struct rq *rq, *src_rq;
-	int this_cpu;
+	struct rq *rq;
 	int cpu;
 
-	this_cpu = rt_rq->push_cpu;
+	rq = this_rq();
 
-	/* Paranoid check */
-	BUG_ON(this_cpu != smp_processor_id());
-
-	rq = cpu_rq(this_cpu);
-	src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+	/*
+	 * We do not need to grab the lock to check for has_pushable_tasks.
+	 * When it gets updated, a check is made if a push is possible.
+	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_task(rq);
+		push_rt_tasks(rq);
 		raw_spin_unlock(&rq->lock);
 	}
 
-	/* Pass the IPI to the next rt overloaded queue */
-	raw_spin_lock(&rt_rq->push_lock);
-	/*
-	 * If the source queue changed since the IPI went out,
-	 * we need to restart the search from that CPU again.
-	 */
-	if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
-		rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
-		rt_rq->push_cpu = src_rq->cpu;
-	}
+	raw_spin_lock(&rq->rd->rto_lock);
 
-	cpu = find_next_push_cpu(src_rq);
+	/* Pass the IPI to the next rt overloaded queue */
+	cpu = rto_next_cpu(rq);
 
-	if (cpu >= nr_cpu_ids)
-		rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
-	raw_spin_unlock(&rt_rq->push_lock);
+	raw_spin_unlock(&rq->rd->rto_lock);
 
-	if (cpu >= nr_cpu_ids)
+	if (cpu < 0)
 		return;
 
-	/*
-	 * It is possible that a restart caused this CPU to be
-	 * chosen again. Don't bother with an IPI, just see if we
-	 * have more to push.
-	 */
-	if (unlikely(cpu == rq->cpu))
-		goto again;
-
 	/* Try the next RT overloaded CPU */
-	irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
-	struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
-	try_to_push_tasks(rt_rq);
+	irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }
 #endif /* HAVE_RT_PUSH_IPI */
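
For readers who want to poke at the new bookkeeping (rto_loop_start, rto_loop_next, rto_loop, rto_cpu), here is a minimal userspace sketch of the same state machine. It is not kernel code: C11 atomics, a 64-bit mask and direct function calls stand in for atomic_t, the rto_mask cpumask, rd->rto_lock and the irq_work/IPI plumbing, and every name in it (struct rto_model, NR_CPUS, start_trylock(), next_rto_cpu(), push_work(), lower_prio_on_some_cpu()) is invented for the sketch. Being single-threaded, it also omits the rto_lock serialization the real code depends on.

/*
 * Userspace sketch of the per-root-domain bookkeeping added by this patch.
 * Build: cc -std=c11 -o rto_model rto_model.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

struct rto_model {
	unsigned long rto_mask;		/* bit n set: CPU n is RT overloaded */
	int rto_cpu;			/* iterator position, -1 when idle */
	int rto_loop;			/* generation the iterator has handled */
	atomic_int rto_loop_next;	/* bumped by every CPU lowering prio */
	atomic_int rto_loop_start;	/* 0/1 flag guarding loop start */
};

/* Like rto_start_trylock(): succeed only if we flip 0 -> 1. */
static bool start_trylock(atomic_int *v)
{
	int zero = 0;
	return atomic_compare_exchange_strong(v, &zero, 1);
}

/* Like rto_next_cpu(): next overloaded CPU, or -1 once a full pass
 * ends with no new rto_loop_next increment. */
static int next_rto_cpu(struct rto_model *rd)
{
	for (;;) {
		int cpu;

		/* cpumask_next(rd->rto_cpu, ...): first set bit after rto_cpu */
		for (cpu = rd->rto_cpu + 1; cpu < NR_CPUS; cpu++)
			if (rd->rto_mask & (1UL << cpu))
				break;

		rd->rto_cpu = cpu;
		if (cpu < NR_CPUS)
			return cpu;

		rd->rto_cpu = -1;

		/* kernel: atomic_read_acquire(), paired with a write barrier */
		int next = atomic_load(&rd->rto_loop_next);
		if (rd->rto_loop == next)
			break;
		rd->rto_loop = next;	/* someone asked for another pass */
	}
	return -1;
}

/* Like rto_push_irq_work_func(): "runs on" @cpu, then forwards itself. */
static void push_work(struct rto_model *rd, int cpu)
{
	printf("push work runs on CPU %d\n", cpu);	/* push_rt_tasks() here */

	cpu = next_rto_cpu(rd);
	if (cpu >= 0)
		push_work(rd, cpu);	/* kernel: irq_work_queue_on(cpu) */
}

/* Like tell_cpu_to_push(): a CPU just started running a lower prio task. */
static void lower_prio_on_some_cpu(struct rto_model *rd)
{
	int cpu = -1;

	atomic_fetch_add(&rd->rto_loop_next, 1);

	if (!start_trylock(&rd->rto_loop_start))
		return;		/* someone else is already starting the loop */

	if (rd->rto_cpu < 0)	/* no iterator in flight: pick the first target */
		cpu = next_rto_cpu(rd);

	atomic_store(&rd->rto_loop_start, 0);	/* rto_start_unlock() */

	if (cpu >= 0)
		push_work(rd, cpu);
}

int main(void)
{
	struct rto_model rd = {
		.rto_mask = (1UL << 1) | (1UL << 5),	/* CPUs 1 and 5 overloaded */
		.rto_cpu  = -1,
	};

	lower_prio_on_some_cpu(&rd);	/* e.g. CPU 3 just lowered its priority */
	return 0;
}

One thing the sketch makes visible: the initiating call's own increment of rto_loop_next leaves rto_loop one generation behind, so the iterator finishes with an extra confirming pass over the mask before next_rto_cpu() returns -1. As far as this patch's logic goes, the kernel does the same, which looks harmless since the irq work only takes the rq lock when has_pushable_tasks() is true.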
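
The "Matches WMB in rt_set_overload()" comment above atomic_read_acquire() is the usual publish/observe pattern: the side that marks a CPU as RT overloaded makes its rto_mask update visible before the counter value the iterator later reads, so an iterator that observes a new generation also observes the mask bits set before it. The sketch below illustrates only that ordering guarantee, under stated assumptions: a single C11 release increment stands in for the kernel's smp_wmb() plus atomic_inc(), an acquire load stands in for atomic_read_acquire(), and the variable and thread names are made up for the demo.

/*
 * Userspace illustration of the release/acquire pairing.
 * Build: cc -std=c11 -pthread -o ordering_demo ordering_demo.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong demo_rto_mask;	/* one bit per "CPU" */
static atomic_int demo_loop_next;	/* generation counter */

static void *overload_publisher(void *arg)
{
	(void)arg;

	/* Step 1: mark CPU 5 as RT overloaded (a relaxed store is enough). */
	atomic_fetch_or_explicit(&demo_rto_mask, 1UL << 5, memory_order_relaxed);

	/* Step 2: publish - release orders the mask update before the bump. */
	atomic_fetch_add_explicit(&demo_loop_next, 1, memory_order_release);
	return NULL;
}

static void *iterator(void *arg)
{
	(void)arg;

	/* Wait until a new generation is visible (acquire pairs with release). */
	while (atomic_load_explicit(&demo_loop_next, memory_order_acquire) == 0)
		;

	/* Guaranteed by release/acquire: the bit set before the bump is seen. */
	unsigned long mask = atomic_load_explicit(&demo_rto_mask,
						  memory_order_relaxed);
	printf("mask after observing new generation: %#lx\n", mask);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, iterator, NULL);
	pthread_create(&b, NULL, overload_publisher, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

If both accesses were made relaxed instead, the reader could in principle observe the new counter while still seeing a stale mask, which is exactly the window the kernel comment is guarding against.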
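
To tie the pieces together, a worked trace of the flow the patch implements (the CPU numbers follow the example in the comment being removed; the interleaving is illustrative):

   CPU 3 lowers its priority:   rto_loop_next 0 -> 1, wins rto_loop_start,
                                rto_cpu -1 -> 1, queues the irq work on CPU 1
   irq work on CPU 1:           push_rt_tasks(), rto_cpu 1 -> 5, requeues on CPU 5
   CPU 7 lowers its priority:   rto_loop_next 1 -> 2; it sees rto_cpu == 5
                                (or loses the rto_loop_start trylock) and
                                queues nothing itself
   irq work on CPU 5:           push_rt_tasks(); rto_mask exhausted, but
                                rto_loop (0) != rto_loop_next (2), so
                                rto_loop = 2 and the scan restarts at CPU 1
   irq work on CPUs 1, 5:       one more pass; on the next exhaustion
                                rto_loop == rto_loop_next, rto_next_cpu()
                                returns -1 and the chain stops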