před 8 roky · e1c9214955
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -706,6 +706,7 @@ static inline void init_irq_proc(void)
 
				 #ifdef CONFIG_IRQ_TIMINGS
			
 
				 void irq_timings_enable(void);
			
 
				 void irq_timings_disable(void);
			
 
				+u64 irq_timings_next_event(u64 now);
			
 
				 #endif
			
 
				 
			
 
				 struct seq_file;
			
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -275,13 +275,21 @@ struct irq_timings {
 
				 
			
 
				 DECLARE_PER_CPU(struct irq_timings, irq_timings);
			
 
				 
			
 
				+extern void irq_timings_free(int irq);
			
 
				+extern int irq_timings_alloc(int irq);
			
 
				+
			
 
				 static inline void irq_remove_timings(struct irq_desc *desc)
			
 
				 {
			
 
				 	desc->istate &= ~IRQS_TIMINGS;
			
 
				+
			
 
				+	irq_timings_free(irq_desc_get_irq(desc));
			
 
				 }
			
 
				 
			
 
				 static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
			
 
				 {
			
 
				+	int irq = irq_desc_get_irq(desc);
			
 
				+	int ret;
			
 
				+
			
 
				 	/*
			
 
				 	 * We don't need the measurement because the idle code already
			
 
				 	 * knows the next expiry event.
			
@@ -289,6 +297,17 @@ static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *ac
 
				 	if (act->flags & __IRQF_TIMER)
			
 
				 		return;
			
 
				 
			
 
				+	/*
			
 
				+	 * In case the timing allocation fails, we just want to warn,
			
 
				+	 * not fail, so letting the system boot anyway.
			
 
				+	 */
			
 
				+	ret = irq_timings_alloc(irq);
			
 
				+	if (ret) {
			
 
				+		pr_warn("Failed to allocate irq timing stats for irq%d (%d)",
			
 
				+			irq, ret);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				 	desc->istate |= IRQS_TIMINGS;
			
 
				 }
			
 
				 
			
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -8,10 +8,16 @@
 
				  * published by the Free Software Foundation.
			
 
				  *
			
 
				  */
			
 
				+#include <linux/kernel.h>
			
 
				 #include <linux/percpu.h>
			
 
				+#include <linux/slab.h>
			
 
				 #include <linux/static_key.h>
			
 
				 #include <linux/interrupt.h>
			
 
				+#include <linux/idr.h>
			
 
				 #include <linux/irq.h>
			
 
				+#include <linux/math64.h>
			
 
				+
			
 
				+#include <trace/events/irq.h>
			
 
				 
			
 
				 #include "internals.h"
			
 
				 
			
@@ -19,6 +25,18 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
 
				 
			
 
				 DEFINE_PER_CPU(struct irq_timings, irq_timings);
			
 
				 
			
 
				+struct irqt_stat {
			
 
				+	u64	next_evt;
			
 
				+	u64	last_ts;
			
 
				+	u64	variance;
			
 
				+	u32	avg;
			
 
				+	u32	nr_samples;
			
 
				+	int	anomalies;
			
 
				+	int	valid;
			
 
				+};
			
 
				+
			
 
				+static DEFINE_IDR(irqt_stats);
			
 
				+
			
 
				 void irq_timings_enable(void)
			
 
				 {
			
 
				 	static_branch_enable(&irq_timing_enabled);
			
@@ -28,3 +46,324 @@ void irq_timings_disable(void)
 
				 {
			
 
				 	static_branch_disable(&irq_timing_enabled);
			
 
				 }
			
 
				+
			
 
				+/**
			
 
				+ * irqs_update - update the irq timing statistics with a new timestamp
			
 
				+ *
			
 
				+ * @irqs: an irqt_stat struct pointer
			
 
				+ * @ts: the new timestamp
			
 
				+ *
			
 
				+ * The statistics are computed online, in other words, the code is
			
 
				+ * designed to compute the statistics on a stream of values rather
			
 
				+ * than doing multiple passes on the values to compute the average,
			
 
				+ * then the variance. The integer division introduces a loss of
			
 
				+ * precision but with an acceptable error margin regarding the results
			
 
				+ * we would have with the double floating precision: we are dealing
			
 
				+ * with nanosec, so big numbers, consequently the mantisse is
			
 
				+ * negligeable, especially when converting the time in usec
			
 
				+ * afterwards.
			
 
				+ *
			
 
				+ * The computation happens at idle time. When the CPU is not idle, the
			
 
				+ * interrupts' timestamps are stored in the circular buffer, when the
			
 
				+ * CPU goes idle and this routine is called, all the buffer's values
			
 
				+ * are injected in the statistical model continuying to extend the
			
 
				+ * statistics from the previous busy-idle cycle.
			
 
				+ *
			
 
				+ * The observations showed a device will trigger a burst of periodic
			
 
				+ * interrupts followed by one or two peaks of longer time, for
			
 
				+ * instance when a SD card device flushes its cache, then the periodic
			
 
				+ * intervals occur again. A one second inactivity period resets the
			
 
				+ * stats, that gives us the certitude the statistical values won't
			
 
				+ * exceed 1x10^9, thus the computation won't overflow.
			
 
				+ *
			
 
				+ * Basically, the purpose of the algorithm is to watch the periodic
			
 
				+ * interrupts and eliminate the peaks.
			
 
				+ *
			
 
				+ * An interrupt is considered periodically stable if the interval of
			
 
				+ * its occurences follow the normal distribution, thus the values
			
 
				+ * comply with:
			
 
				+ *
			
 
				+ *      avg - 3 x stddev < value < avg + 3 x stddev
			
 
				+ *
			
 
				+ * Which can be simplified to:
			
 
				+ *
			
 
				+ *      -3 x stddev < value - avg < 3 x stddev
			
 
				+ *
			
 
				+ *      abs(value - avg) < 3 x stddev
			
 
				+ *
			
 
				+ * In order to save a costly square root computation, we use the
			
 
				+ * variance. For the record, stddev = sqrt(variance). The equation
			
 
				+ * above becomes:
			
 
				+ *
			
 
				+ *      abs(value - avg) < 3 x sqrt(variance)
			
 
				+ *
			
 
				+ * And finally we square it:
			
 
				+ *
			
 
				+ *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
			
 
				+ *
			
 
				+ *      (value - avg) x (value - avg) < 9 x variance
			
 
				+ *
			
 
				+ * Statistically speaking, any values out of this interval is
			
 
				+ * considered as an anomaly and is discarded. However, a normal
			
 
				+ * distribution appears when the number of samples is 30 (it is the
			
 
				+ * rule of thumb in statistics, cf. "30 samples" on Internet). When
			
 
				+ * there are three consecutive anomalies, the statistics are resetted.
			
 
				+ *
			
 
				+ */
			
 
				+static void irqs_update(struct irqt_stat *irqs, u64 ts)
			
 
				+{
			
 
				+	u64 old_ts = irqs->last_ts;
			
 
				+	u64 variance = 0;
			
 
				+	u64 interval;
			
 
				+	s64 diff;
			
 
				+
			
 
				+	/*
			
 
				+	 * The timestamps are absolute time values, we need to compute
			
 
				+	 * the timing interval between two interrupts.
			
 
				+	 */
			
 
				+	irqs->last_ts = ts;
			
 
				+
			
 
				+	/*
			
 
				+	 * The interval type is u64 in order to deal with the same
			
 
				+	 * type in our computation, that prevent mindfuck issues with
			
 
				+	 * overflow, sign and division.
			
 
				+	 */
			
 
				+	interval = ts - old_ts;
			
 
				+
			
 
				+	/*
			
 
				+	 * The interrupt triggered more than one second apart, that
			
 
				+	 * ends the sequence as predictible for our purpose. In this
			
 
				+	 * case, assume we have the beginning of a sequence and the
			
 
				+	 * timestamp is the first value. As it is impossible to
			
 
				+	 * predict anything at this point, return.
			
 
				+	 *
			
 
				+	 * Note the first timestamp of the sequence will always fall
			
 
				+	 * in this test because the old_ts is zero. That is what we
			
 
				+	 * want as we need another timestamp to compute an interval.
			
 
				+	 */
			
 
				+	if (interval >= NSEC_PER_SEC) {
			
 
				+		memset(irqs, 0, sizeof(*irqs));
			
 
				+		irqs->last_ts = ts;
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Pre-compute the delta with the average as the result is
			
 
				+	 * used several times in this function.
			
 
				+	 */
			
 
				+	diff = interval - irqs->avg;
			
 
				+
			
 
				+	/*
			
 
				+	 * Increment the number of samples.
			
 
				+	 */
			
 
				+	irqs->nr_samples++;
			
 
				+
			
 
				+	/*
			
 
				+	 * Online variance divided by the number of elements if there
			
 
				+	 * is more than one sample.  Normally the formula is division
			
 
				+	 * by nr_samples - 1 but we assume the number of element will be
			
 
				+	 * more than 32 and dividing by 32 instead of 31 is enough
			
 
				+	 * precise.
			
 
				+	 */
			
 
				+	if (likely(irqs->nr_samples > 1))
			
 
				+		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
			
 
				+
			
 
				+	/*
			
 
				+	 * The rule of thumb in statistics for the normal distribution
			
 
				+	 * is having at least 30 samples in order to have the model to
			
 
				+	 * apply. Values outside the interval are considered as an
			
 
				+	 * anomaly.
			
 
				+	 */
			
 
				+	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
			
 
				+		/*
			
 
				+		 * After three consecutive anomalies, we reset the
			
 
				+		 * stats as it is no longer stable enough.
			
 
				+		 */
			
 
				+		if (irqs->anomalies++ >= 3) {
			
 
				+			memset(irqs, 0, sizeof(*irqs));
			
 
				+			irqs->last_ts = ts;
			
 
				+			return;
			
 
				+		}
			
 
				+	} else {
			
 
				+		/*
			
 
				+		 * The anomalies must be consecutives, so at this
			
 
				+		 * point, we reset the anomalies counter.
			
 
				+		 */
			
 
				+		irqs->anomalies = 0;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * The interrupt is considered stable enough to try to predict
			
 
				+	 * the next event on it.
			
 
				+	 */
			
 
				+	irqs->valid = 1;
			
 
				+
			
 
				+	/*
			
 
				+	 * Online average algorithm:
			
 
				+	 *
			
 
				+	 *  new_average = average + ((value - average) / count)
			
 
				+	 *
			
 
				+	 * The variance computation depends on the new average
			
 
				+	 * to be computed here first.
			
 
				+	 *
			
 
				+	 */
			
 
				+	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
			
 
				+
			
 
				+	/*
			
 
				+	 * Online variance algorithm:
			
 
				+	 *
			
 
				+	 *  new_variance = variance + (value - average) x (value - new_average)
			
 
				+	 *
			
 
				+	 * Warning: irqs->avg is updated with the line above, hence
			
 
				+	 * 'interval - irqs->avg' is no longer equal to 'diff'
			
 
				+	 */
			
 
				+	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
			
 
				+
			
 
				+	/*
			
 
				+	 * Update the next event
			
 
				+	 */
			
 
				+	irqs->next_evt = ts + irqs->avg;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * irq_timings_next_event - Return when the next event is supposed to arrive
			
 
				+ *
			
 
				+ * During the last busy cycle, the number of interrupts is incremented
			
 
				+ * and stored in the irq_timings structure. This information is
			
 
				+ * necessary to:
			
 
				+ *
			
 
				+ * - know if the index in the table wrapped up:
			
 
				+ *
			
 
				+ *      If more than the array size interrupts happened during the
			
 
				+ *      last busy/idle cycle, the index wrapped up and we have to
			
 
				+ *      begin with the next element in the array which is the last one
			
 
				+ *      in the sequence, otherwise it is a the index 0.
			
 
				+ *
			
 
				+ * - have an indication of the interrupts activity on this CPU
			
 
				+ *   (eg. irq/sec)
			
 
				+ *
			
 
				+ * The values are 'consumed' after inserting in the statistical model,
			
 
				+ * thus the count is reinitialized.
			
 
				+ *
			
 
				+ * The array of values **must** be browsed in the time direction, the
			
 
				+ * timestamp must increase between an element and the next one.
			
 
				+ *
			
 
				+ * Returns a nanosec time based estimation of the earliest interrupt,
			
 
				+ * U64_MAX otherwise.
			
 
				+ */
			
 
				+u64 irq_timings_next_event(u64 now)
			
 
				+{
			
 
				+	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
			
 
				+	struct irqt_stat *irqs;
			
 
				+	struct irqt_stat __percpu *s;
			
 
				+	u64 ts, next_evt = U64_MAX;
			
 
				+	int i, irq = 0;
			
 
				+
			
 
				+	/*
			
 
				+	 * This function must be called with the local irq disabled in
			
 
				+	 * order to prevent the timings circular buffer to be updated
			
 
				+	 * while we are reading it.
			
 
				+	 */
			
 
				+	WARN_ON_ONCE(!irqs_disabled());
			
 
				+
			
 
				+	/*
			
 
				+	 * Number of elements in the circular buffer: If it happens it
			
 
				+	 * was flushed before, then the number of elements could be
			
 
				+	 * smaller than IRQ_TIMINGS_SIZE, so the count is used,
			
 
				+	 * otherwise the array size is used as we wrapped. The index
			
 
				+	 * begins from zero when we did not wrap. That could be done
			
 
				+	 * in a nicer way with the proper circular array structure
			
 
				+	 * type but with the cost of extra computation in the
			
 
				+	 * interrupt handler hot path. We choose efficiency.
			
 
				+	 *
			
 
				+	 * Inject measured irq/timestamp to the statistical model
			
 
				+	 * while decrementing the counter because we consume the data
			
 
				+	 * from our circular buffer.
			
 
				+	 */
			
 
				+	for (i = irqts->count & IRQ_TIMINGS_MASK,
			
 
				+		     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
			
 
				+	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
			
 
				+
			
 
				+		irq = irq_timing_decode(irqts->values[i], &ts);
			
 
				+
			
 
				+		s = idr_find(&irqt_stats, irq);
			
 
				+		if (s) {
			
 
				+			irqs = this_cpu_ptr(s);
			
 
				+			irqs_update(irqs, ts);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Look in the list of interrupts' statistics, the earliest
			
 
				+	 * next event.
			
 
				+	 */
			
 
				+	idr_for_each_entry(&irqt_stats, s, i) {
			
 
				+
			
 
				+		irqs = this_cpu_ptr(s);
			
 
				+
			
 
				+		if (!irqs->valid)
			
 
				+			continue;
			
 
				+
			
 
				+		if (irqs->next_evt <= now) {
			
 
				+			irq = i;
			
 
				+			next_evt = now;
			
 
				+
			
 
				+			/*
			
 
				+			 * This interrupt mustn't use in the future
			
 
				+			 * until new events occur and update the
			
 
				+			 * statistics.
			
 
				+			 */
			
 
				+			irqs->valid = 0;
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		if (irqs->next_evt < next_evt) {
			
 
				+			irq = i;
			
 
				+			next_evt = irqs->next_evt;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return next_evt;
			
 
				+}
			
 
				+
			
 
				+void irq_timings_free(int irq)
			
 
				+{
			
 
				+	struct irqt_stat __percpu *s;
			
 
				+
			
 
				+	s = idr_find(&irqt_stats, irq);
			
 
				+	if (s) {
			
 
				+		free_percpu(s);
			
 
				+		idr_remove(&irqt_stats, irq);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int irq_timings_alloc(int irq)
			
 
				+{
			
 
				+	struct irqt_stat __percpu *s;
			
 
				+	int id;
			
 
				+
			
 
				+	/*
			
 
				+	 * Some platforms can have the same private interrupt per cpu,
			
 
				+	 * so this function may be be called several times with the
			
 
				+	 * same interrupt number. Just bail out in case the per cpu
			
 
				+	 * stat structure is already allocated.
			
 
				+	 */
			
 
				+	s = idr_find(&irqt_stats, irq);
			
 
				+	if (s)
			
 
				+		return 0;
			
 
				+
			
 
				+	s = alloc_percpu(*s);
			
 
				+	if (!s)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	idr_preload(GFP_KERNEL);
			
 
				+	id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
			
 
				+	idr_preload_end();
			
 
				+
			
 
				+	if (id < 0) {
			
 
				+		free_percpu(s);
			
 
				+		return id;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}