
Merge branch 'timers/nohz' into sched/core

Merge these two, because upcoming patches will touch both areas.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar, 11 years ago
commit 51da9830d7
7 changed files with 97 additions and 46 deletions
  1. include/linux/irq_work.h (+5 -0)
  2. include/linux/tick.h (+8 -1)
  3. kernel/irq_work.c (+47 -29)
  4. kernel/sched/core.c (+13 -9)
  5. kernel/sched/sched.h (+9 -3)
  6. kernel/smp.c (+9 -0)
  7. kernel/time/tick-sched.c (+6 -4)

+ 5 - 0
include/linux/irq_work.h

@@ -33,6 +33,11 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
 
 bool irq_work_queue(struct irq_work *work);
+
+#ifdef CONFIG_SMP
+bool irq_work_queue_on(struct irq_work *work, int cpu);
+#endif
+
 void irq_work_run(void);
 void irq_work_sync(struct irq_work *work);
 

+ 8 - 1
include/linux/tick.h

@@ -181,7 +181,13 @@ static inline bool tick_nohz_full_cpu(int cpu)
 
 extern void tick_nohz_init(void);
 extern void __tick_nohz_full_check(void);
-extern void tick_nohz_full_kick(void);
+extern void tick_nohz_full_kick_cpu(int cpu);
+
+static inline void tick_nohz_full_kick(void)
+{
+	tick_nohz_full_kick_cpu(smp_processor_id());
+}
+
 extern void tick_nohz_full_kick_all(void);
 extern void __tick_nohz_task_switch(struct task_struct *tsk);
 #else
@@ -189,6 +195,7 @@ static inline void tick_nohz_init(void) { }
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void __tick_nohz_full_check(void) { }
+static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void tick_nohz_full_kick(void) { }
 static inline void tick_nohz_full_kick_all(void) { }
 static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }

+ 47 - 29
kernel/irq_work.c

@@ -16,11 +16,12 @@
 #include <linux/tick.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/smp.h>
 #include <asm/processor.h>
 
 
-static DEFINE_PER_CPU(struct llist_head, irq_work_list);
-static DEFINE_PER_CPU(int, irq_work_raised);
+static DEFINE_PER_CPU(struct llist_head, raised_list);
+static DEFINE_PER_CPU(struct llist_head, lazy_list);
 
 /*
  * Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
 	 */
 }
 
+#ifdef CONFIG_SMP
 /*
- * Enqueue the irq_work @entry unless it's already pending
+ * Enqueue the irq_work @work on @cpu unless it's already pending
  * somewhere.
  *
  * Can be re-enqueued while the callback is still in progress.
  */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
+{
+	/* All work should have been flushed before going offline */
+	WARN_ON_ONCE(cpu_is_offline(cpu));
+
+	/* Arch remote IPI send/receive backends aren't NMI safe */
+	WARN_ON_ONCE(in_nmi());
+
+	/* Only queue if not already pending */
+	if (!irq_work_claim(work))
+		return false;
+
+	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+		arch_send_call_function_single_ipi(cpu);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue_on);
+#endif
+
+/* Enqueue the irq work @work on the current CPU */
 bool irq_work_queue(struct irq_work *work)
 {
 	/* Only queue if not already pending */
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
 	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
 
-	llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-
-	/*
-	 * If the work is not "lazy" or the tick is stopped, raise the irq
-	 * work interrupt (if supported by the arch), otherwise, just wait
-	 * for the next tick.
-	 */
-	if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
-		if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+	/* If the work is "lazy", handle it from next tick if any */
+	if (work->flags & IRQ_WORK_LAZY) {
+		if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
+		    tick_nohz_tick_stopped())
+			arch_irq_work_raise();
+	} else {
+		if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
 			arch_irq_work_raise();
 	}
 
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
 
 bool irq_work_needs_cpu(void)
 {
-	struct llist_head *this_list;
+	struct llist_head *raised, *lazy;
 
-	this_list = &__get_cpu_var(irq_work_list);
-	if (llist_empty(this_list))
+	raised = &__get_cpu_var(raised_list);
+	lazy = &__get_cpu_var(lazy_list);
+	if (llist_empty(raised) && llist_empty(lazy))
 		return false;
 
 	/* All work should have been flushed before going offline */
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)
 	return true;
 }
 
-static void __irq_work_run(void)
+static void irq_work_run_list(struct llist_head *list)
 {
 	unsigned long flags;
 	struct irq_work *work;
-	struct llist_head *this_list;
 	struct llist_node *llnode;
 
+	BUG_ON(!irqs_disabled());
 
-	/*
-	 * Reset the "raised" state right before we check the list because
-	 * an NMI may enqueue after we find the list empty from the runner.
-	 */
-	__this_cpu_write(irq_work_raised, 0);
-	barrier();
-
-	this_list = &__get_cpu_var(irq_work_list);
-	if (llist_empty(this_list))
+	if (llist_empty(list))
 		return;
 
-	BUG_ON(!irqs_disabled());
-
-	llnode = llist_del_all(this_list);
+	llnode = llist_del_all(list);
 	while (llnode != NULL) {
 		work = llist_entry(llnode, struct irq_work, llnode);
 
@@ -148,6 +160,12 @@ static void __irq_work_run(void)
 	}
 }
 
+static void __irq_work_run(void)
+{
+	irq_work_run_list(&__get_cpu_var(raised_list));
+	irq_work_run_list(&__get_cpu_var(lazy_list));
+}
+
 /*
  * Run the irq_work entries on this cpu. Requires to be ran from hardirq
  * context with local IRQs disabled.
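
For context (not part of this patch), a sketch of how the two per-CPU lists are fed from a caller's point of view; the callback and the static initializers are hypothetical, assuming IRQ_WORK_LAZY is set at initialization as existing users do:

#include <linux/irq_work.h>

static void my_cb(struct irq_work *work)
{
	/* ... */
}

/* Raised immediately via arch_irq_work_raise() when first queued. */
static struct irq_work urgent_work = { .func = my_cb };

/* Deferred to the next tick, unless the tick is stopped on this CPU. */
static struct irq_work lazy_work = {
	.flags	= IRQ_WORK_LAZY,
	.func	= my_cb,
};

static void queue_examples(void)
{
	irq_work_queue(&urgent_work);	/* lands on raised_list */
	irq_work_queue(&lazy_work);	/* lands on lazy_list */
}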

+ 13 - 9
kernel/sched/core.c

@@ -684,10 +684,16 @@ static void wake_up_idle_cpu(int cpu)
 
 static bool wake_up_full_nohz_cpu(int cpu)
 {
+	/*
+	 * We just need the target to call irq_exit() and re-evaluate
+	 * the next tick. The nohz full kick at least implies that.
+	 * If needed we can still optimize that later with an
+	 * empty IRQ.
+	 */
 	if (tick_nohz_full_cpu(cpu)) {
 		if (cpu != smp_processor_id() ||
 		    tick_nohz_tick_stopped())
-			smp_send_reschedule(cpu);
+			tick_nohz_full_kick_cpu(cpu);
 		return true;
 	}
 
@@ -734,10 +740,11 @@ bool sched_can_stop_tick(void)
 
        rq = this_rq();
 
-       /* Make sure rq->nr_running update is visible after the IPI */
-       smp_rmb();
-
-       /* More than one running task need preemption */
+	/*
+	 * More than one running task needs preemption.
+	 * The nr_running update is assumed to be visible
+	 * after the IPI is sent from wakers.
+	 */
        if (rq->nr_running > 1)
                return false;
 
@@ -1568,9 +1575,7 @@ void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 
-	if (llist_empty(&this_rq()->wake_list)
-			&& !tick_nohz_full_cpu(smp_processor_id())
-			&& !got_nohz_idle_kick())
+	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -1587,7 +1592,6 @@ void scheduler_ipi(void)
 	 * somewhat pessimize the simple resched case.
 	 */
 	irq_enter();
-	tick_nohz_full_check();
 	sched_ttwu_pending();
 
 	/*

+ 9 - 3
kernel/sched/sched.h

@@ -1221,9 +1221,15 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 #ifdef CONFIG_NO_HZ_FULL
 	if (prev_nr < 2 && rq->nr_running >= 2) {
 		if (tick_nohz_full_cpu(rq->cpu)) {
-			/* Order rq->nr_running write against the IPI */
-			smp_wmb();
-			smp_send_reschedule(rq->cpu);
+			/*
+			 * Tick is needed if more than one task runs on a CPU.
+			 * Send the target an IPI to kick it out of nohz mode.
+			 *
+			 * We assume that the IPI implies a full memory barrier and
+			 * that the new value of rq->nr_running is visible to the
+			 * target upon IPI reception.
+			 */
+			tick_nohz_full_kick_cpu(rq->cpu);
 		}
        }
 #endif

+ 9 - 0
kernel/smp.c

@@ -3,6 +3,7 @@
  *
  * (C) Jens Axboe <jens.axboe@oracle.com> 2008
  */
+#include <linux/irq_work.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/kernel.h>
@@ -210,6 +211,14 @@ void generic_smp_call_function_single_interrupt(void)
 		csd->func(csd->info);
 		csd_unlock(csd);
 	}
+
+	/*
+	 * Handle irq_work entries queued remotely by irq_work_queue_on().
+	 * The SMP functions above are typically synchronous, so they
+	 * had better run first since some other CPUs may be busy
+	 * waiting for them.
+	 */
+	irq_work_run();
 }
 
 /*

+ 6 - 4
kernel/time/tick-sched.c

@@ -224,13 +224,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 };
 
 /*
- * Kick the current CPU if it's full dynticks in order to force it to
+ * Kick the CPU if it's full dynticks in order to force it to
  * re-evaluate its dependency on the tick and restart it if necessary.
  */
-void tick_nohz_full_kick(void)
+void tick_nohz_full_kick_cpu(int cpu)
 {
-	if (tick_nohz_full_cpu(smp_processor_id()))
-		irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+	if (!tick_nohz_full_cpu(cpu))
+		return;
+
+	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 }
 
 static void nohz_full_kick_ipi(void *info)
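
As an illustration (not part of this patch), a hypothetical in-kernel caller that changes some per-CPU condition the tick depends on and then forces the target CPU to re-evaluate it; the update itself is a placeholder:

#include <linux/tick.h>

static void update_condition_and_kick(int cpu)
{
	/* ... update a per-CPU condition that may require the tick ... */

	/*
	 * No-op unless @cpu is a nohz_full CPU; otherwise the target
	 * re-evaluates its tick dependency from irq_exit().
	 */
	tick_nohz_full_kick_cpu(cpu);
}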