@@ -34,14 +34,16 @@
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
+#include <linux/cgroup.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -153,7 +155,7 @@ enum event_type_t {
  */
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -327,6 +329,11 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+	return event->clock();
+}
+
 static inline struct perf_cpu_context *
 __get_cpu_context(struct perf_event_context *ctx)
 {
@@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 
 #ifdef CONFIG_CGROUP_PERF
 
-/*
- * perf_cgroup_info keeps track of time_enabled for a cgroup.
- * This is a per-cpu dynamically allocated data structure.
- */
-struct perf_cgroup_info {
-	u64				time;
-	u64				timestamp;
-};
-
-struct perf_cgroup {
-	struct cgroup_subsys_state	css;
-	struct perf_cgroup_info	__percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
-	return container_of(task_css(task, perf_event_cgrp_id),
-			    struct perf_cgroup, css);
-}
-
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
@@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx)
 	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 }
 
+static void free_ctx(struct rcu_head *head)
+{
+	struct perf_event_context *ctx;
+
+	ctx = container_of(head, struct perf_event_context, rcu_head);
+	kfree(ctx->task_ctx_data);
+	kfree(ctx);
+}
+
 static void put_ctx(struct perf_event_context *ctx)
 {
 	if (atomic_dec_and_test(&ctx->refcount)) {
@@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx)
 			put_ctx(ctx->parent_ctx);
 		if (ctx->task)
 			put_task_struct(ctx->task);
-		kfree_rcu(ctx, rcu_head);
+		call_rcu(&ctx->rcu_head, free_ctx);
 	}
 }
 
@@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
-	if (has_branch_stack(event))
-		ctx->nr_branch_stack++;
-
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 		cpuctx->cgrp = NULL;
 	}
 
-	if (has_branch_stack(event))
-		ctx->nr_branch_stack--;
-
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event,
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
+static void perf_log_itrace_start(struct perf_event *event);
 
 static int
 event_sched_in(struct perf_event *event,
@@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
+	event->tstamp_running += tstamp - event->tstamp_stopped;
+
+	perf_set_shadow_time(event, ctx, tstamp);
+
+	perf_log_itrace_start(event);
+
 	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
@@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event,
 		goto out;
 	}
 
-	event->tstamp_running += tstamp - event->tstamp_stopped;
-
-	perf_set_shadow_time(event, ctx, tstamp);
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	if (!ctx->nr_active++)
@@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 			next->perf_event_ctxp[ctxn] = ctx;
 			ctx->task = next;
 			next_ctx->task = task;
+
+			swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+
 			do_switch = 0;
 
 			perf_event_sync_stat(ctx, next_ctx);
@@ -2577,6 +2567,56 @@ unlock:
 	}
 }
 
+void perf_sched_cb_dec(struct pmu *pmu)
+{
+	this_cpu_dec(perf_sched_cb_usages);
+}
+
+void perf_sched_cb_inc(struct pmu *pmu)
+{
+	this_cpu_inc(perf_sched_cb_usages);
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	if (prev == next)
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		if (pmu->sched_task) {
+			cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+			perf_pmu_disable(pmu);
+
+			pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+			perf_pmu_enable(pmu);
+
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
 #define for_each_task_context_nr(ctxn)					\
 	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
 
@@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(task, next, false);
+
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
 
@@ -2754,64 +2797,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	perf_ctx_unlock(cpuctx, ctx);
 }
 
-/*
- * When sampling the branck stack in system-wide, it may be necessary
- * to flush the stack on context switch. This happens when the branch
- * stack does not tag its entries with the pid of the current task.
- * Otherwise it becomes impossible to associate a branch entry with a
- * task. This ambiguity is more likely to appear when the branch stack
- * supports priv level filtering and the user sets it to monitor only
- * at the user level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch stack with
- * branch from multiple tasks. Flushing may mean dropping the existing
- * entries or stashing them somewhere in the PMU specific code layer.
- *
- * This function provides the context switch callback to the lower code
- * layer. It is invoked ONLY when there is at least one system-wide context
- * with at least one active event using taken branch sampling.
- */
-static void perf_branch_stack_sched_in(struct task_struct *prev,
-				       struct task_struct *task)
-{
-	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
-	unsigned long flags;
-
-	/* no need to flush branch stack if not changing task */
-	if (prev == task)
-		return;
-
-	local_irq_save(flags);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-		/*
-		 * check if the context has at least one
-		 * event using PERF_SAMPLE_BRANCH_STACK
-		 */
-		if (cpuctx->ctx.nr_branch_stack > 0
-		    && pmu->flush_branch_stack) {
-
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
-			perf_pmu_disable(pmu);
-
-			pmu->flush_branch_stack();
-
-			perf_pmu_enable(pmu);
-
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		}
-	}
-
-	rcu_read_unlock();
-
-	local_irq_restore(flags);
-}
-
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
 		perf_cgroup_sched_in(prev, task);
 
-	/* check for system-wide branch_stack events */
-	if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
-		perf_branch_stack_sched_in(prev, task);
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info)
 
 static inline u64 perf_event_count(struct perf_event *event)
 {
-	return local64_read(&event->count) + atomic64_read(&event->child_count);
+	if (event->pmu->count)
+		return event->pmu->count(event);
+
+	return __perf_event_count(event);
 }
 
 static u64 perf_event_read(struct perf_event *event)
@@ -3321,12 +3308,15 @@ errout:
  * Returns a matching context with refcount and pincount.
  */
 static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+find_get_context(struct pmu *pmu, struct task_struct *task,
+		 struct perf_event *event)
 {
 	struct perf_event_context *ctx, *clone_ctx = NULL;
 	struct perf_cpu_context *cpuctx;
+	void *task_ctx_data = NULL;
 	unsigned long flags;
 	int ctxn, err;
+	int cpu = event->cpu;
 
 	if (!task) {
 		/* Must be root to operate on a CPU event: */
@@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 	if (ctxn < 0)
 		goto errout;
 
+	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+		task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+		if (!task_ctx_data) {
+			err = -ENOMEM;
+			goto errout;
+		}
+	}
+
 retry:
 	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		clone_ctx = unclone_ctx(ctx);
 		++ctx->pin_count;
+
+		if (task_ctx_data && !ctx->task_ctx_data) {
+			ctx->task_ctx_data = task_ctx_data;
+			task_ctx_data = NULL;
+		}
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 
 		if (clone_ctx)
@@ -3369,6 +3372,11 @@ retry:
 		if (!ctx)
 			goto errout;
 
+		if (task_ctx_data) {
+			ctx->task_ctx_data = task_ctx_data;
+			task_ctx_data = NULL;
+		}
+
 		err = 0;
 		mutex_lock(&task->perf_event_mutex);
 		/*
@@ -3395,13 +3403,16 @@ retry:
 		}
 	}
 
+	kfree(task_ctx_data);
 	return ctx;
 
 errout:
+	kfree(task_ctx_data);
 	return ERR_PTR(err);
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
+	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
-static void ring_buffer_put(struct ring_buffer *rb);
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
@@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
 	if (event->parent)
 		return;
 
-	if (has_branch_stack(event)) {
-		if (!(event->attach_state & PERF_ATTACH_TASK))
-			atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
-	}
 	if (is_cgroup_event(event))
 		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
 }
@@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event)
 	unaccount_event_cpu(event, event->cpu);
 }
 
+/*
+ * The following implement mutual exclusion of events on "exclusive" pmus
+ * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
+ * at a time, so we disallow creating events that might conflict, namely:
+ *
+ *  1) cpu-wide events in the presence of per-task events,
+ *  2) per-task events in the presence of cpu-wide events,
+ *  3) two matching events on the same context.
+ *
+ * The former two cases are handled in the allocation path (perf_event_alloc(),
+ * __free_event()), the latter -- before the first perf_install_in_context().
+ */
+static int exclusive_event_init(struct perf_event *event)
+{
+	struct pmu *pmu = event->pmu;
+
+	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+		return 0;
+
+	/*
+	 * Prevent co-existence of per-task and cpu-wide events on the
+	 * same exclusive pmu.
+	 *
+	 * Negative pmu::exclusive_cnt means there are cpu-wide
+	 * events on this "exclusive" pmu, positive means there are
+	 * per-task events.
+	 *
+	 * Since this is called in perf_event_alloc() path, event::ctx
+	 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
+	 * to mean "per-task event", because unlike other attach states it
+	 * never gets cleared.
+	 */
+	if (event->attach_state & PERF_ATTACH_TASK) {
+		if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
+			return -EBUSY;
+	} else {
+		if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void exclusive_event_destroy(struct perf_event *event)
+{
+	struct pmu *pmu = event->pmu;
+
+	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+		return;
+
+	/* see comment in exclusive_event_init() */
+	if (event->attach_state & PERF_ATTACH_TASK)
+		atomic_dec(&pmu->exclusive_cnt);
+	else
+		atomic_inc(&pmu->exclusive_cnt);
+}
+
+static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
+{
+	if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+	    (e1->cpu == e2->cpu ||
+	     e1->cpu == -1 ||
+	     e2->cpu == -1))
+		return true;
+	return false;
+}
+
+/* Called under the same ctx::mutex as perf_install_in_context() */
+static bool exclusive_event_installable(struct perf_event *event,
+					struct perf_event_context *ctx)
+{
+	struct perf_event *iter_event;
+	struct pmu *pmu = event->pmu;
+
+	if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+		return true;
+
+	list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
+		if (exclusive_event_match(iter_event, event))
+			return false;
+	}
+
+	return true;
+}
+
 static void __free_event(struct perf_event *event)
 {
 	if (!event->parent) {
@@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	if (event->pmu)
+	if (event->pmu) {
+		exclusive_event_destroy(event);
 		module_put(event->pmu->module);
+	}
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_SET_BPF:
+		return perf_event_set_bpf_prog(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event)
 	/* Allow new userspace to detect that bit 0 is deprecated */
 	userpg->cap_bit0_is_deprecated = 1;
 	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+	userpg->data_offset = PAGE_SIZE;
+	userpg->data_size = perf_data_size(rb);
 
 unlock:
 	rcu_read_unlock();
@@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head)
 	rb_free(rb);
 }
 
-static struct ring_buffer *ring_buffer_get(struct perf_event *event)
+struct ring_buffer *ring_buffer_get(struct perf_event *event)
 {
 	struct ring_buffer *rb;
 
@@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct ring_buffer *rb)
 {
 	if (!atomic_dec_and_test(&rb->refcount))
 		return;
@@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	atomic_inc(&event->mmap_count);
 	atomic_inc(&event->rb->mmap_count);
 
+	if (vma->vm_pgoff)
+		atomic_inc(&event->rb->aux_mmap_count);
+
 	if (event->pmu->event_mapped)
 		event->pmu->event_mapped(event);
 }
@@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event);
 
+	/*
+	 * rb->aux_mmap_count will always drop before rb->mmap_count and
+	 * event->mmap_count, so it is ok to use event->mmap_mutex to
+	 * serialize with perf_mmap here.
+	 */
+	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+		rb_free_aux(rb);
+		mutex_unlock(&event->mmap_mutex);
+	}
+
 	atomic_dec(&rb->mmap_count);
 
 	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4392,7 +4509,7 @@ out_put:
 
 static const struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
-	.close		= perf_mmap_close,
+	.close		= perf_mmap_close, /* non mergable */
 	.fault		= perf_mmap_fault,
 	.page_mkwrite	= perf_mmap_fault,
 };
@@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
 	unsigned long locked, lock_limit;
-	struct ring_buffer *rb;
+	struct ring_buffer *rb = NULL;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	long user_extra, extra;
+	long user_extra = 0, extra = 0;
 	int ret = 0, flags = 0;
 
 	/*
@@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
-	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	if (vma->vm_pgoff == 0) {
+		nr_pages = (vma_size / PAGE_SIZE) - 1;
+	} else {
+		/*
+		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+		 * mapped, all subsequent mappings should have the same size
+		 * and offset. Must be above the normal perf buffer.
+		 */
+		u64 aux_offset, aux_size;
+
+		if (!event->rb)
+			return -EINVAL;
+
+		nr_pages = vma_size / PAGE_SIZE;
+
+		mutex_lock(&event->mmap_mutex);
+		ret = -EINVAL;
+
+		rb = event->rb;
+		if (!rb)
+			goto aux_unlock;
+
+		aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
+		aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+
+		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+			goto aux_unlock;
+
+		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+			goto aux_unlock;
+
+		/* already mapped with a different offset */
+		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+			goto aux_unlock;
+
+		if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
+			goto aux_unlock;
+
+		/* already mapped with a different size */
+		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+			goto aux_unlock;
+
+		if (!is_power_of_2(nr_pages))
+			goto aux_unlock;
+
+		if (!atomic_inc_not_zero(&rb->mmap_count))
+			goto aux_unlock;
+
+		if (rb_has_aux(rb)) {
+			atomic_inc(&rb->aux_mmap_count);
+			ret = 0;
+			goto unlock;
+		}
+
+		atomic_set(&rb->aux_mmap_count, 1);
+		user_extra = nr_pages;
+
+		goto accounting;
+	}
 
 	/*
 	 * If we have rb pages ensure they're a power-of-two number, so we
@@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma_size != PAGE_SIZE * (1 + nr_pages))
 		return -EINVAL;
 
-	if (vma->vm_pgoff != 0)
-		return -EINVAL;
-
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 again:
 	mutex_lock(&event->mmap_mutex);
@@ -4459,6 +4632,8 @@ again:
 	}
 
 	user_extra = nr_pages + 1;
+
+accounting:
 	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
 
 	/*
@@ -4468,7 +4643,6 @@ again:
 
 	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
-	extra = 0;
 	if (user_locked > user_lock_limit)
 		extra = user_locked - user_lock_limit;
 
@@ -4482,35 +4656,46 @@ again:
 		goto unlock;
 	}
 
-	WARN_ON(event->rb);
+	WARN_ON(!rb && event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
 		flags |= RING_BUFFER_WRITABLE;
 
-	rb = rb_alloc(nr_pages,
-		event->attr.watermark ? event->attr.wakeup_watermark : 0,
-		event->cpu, flags);
-
 	if (!rb) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
-
-	atomic_set(&rb->mmap_count, 1);
-	rb->mmap_locked = extra;
-	rb->mmap_user = get_current_user();
-
-	atomic_long_add(user_extra, &user->locked_vm);
-	vma->vm_mm->pinned_vm += extra;
-
-	ring_buffer_attach(event, rb);
-
-	perf_event_init_userpage(event);
-	perf_event_update_userpage(event);
+		rb = rb_alloc(nr_pages,
+			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
+			      event->cpu, flags);
+
+		if (!rb) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
+
+		atomic_set(&rb->mmap_count, 1);
+		rb->mmap_user = get_current_user();
+		rb->mmap_locked = extra;
+
+		ring_buffer_attach(event, rb);
+
+		perf_event_init_userpage(event);
+		perf_event_update_userpage(event);
+	} else {
+		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
+				   event->attr.aux_watermark, flags);
+		if (!ret)
+			rb->aux_mmap_locked = extra;
+	}
 
 unlock:
-	if (!ret)
+	if (!ret) {
+		atomic_long_add(user_extra, &user->locked_vm);
+		vma->vm_mm->pinned_vm += extra;
+
 		atomic_inc(&event->mmap_count);
+	} else if (rb) {
+		atomic_dec(&rb->mmap_count);
+	}
+aux_unlock:
 	mutex_unlock(&event->mmap_mutex);
 
 	/*
@@ -4766,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME)
-		data->time = perf_clock();
+		data->time = perf_event_clock(event);
 
 	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
 		data->id = primary_event_id(event);
@@ -5344,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event,
 	task_event->event_id.tid = perf_event_tid(event, task);
 	task_event->event_id.ptid = perf_event_tid(event, current);
 
+	task_event->event_id.time = perf_event_clock(event);
+
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -5377,7 +5564,7 @@ static void perf_event_task(struct task_struct *task,
 			/* .ppid */
 			/* .tid  */
 			/* .ptid */
-			.time = perf_clock(),
+			/* .time */
 		},
 	};
 
@@ -5732,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma)
 	perf_event_mmap_event(&mmap_event);
 }
 
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+			  unsigned long size, u64 flags)
+{
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	struct perf_aux_event {
+		struct perf_event_header	header;
+		u64				offset;
+		u64				size;
+		u64				flags;
+	} rec = {
+		.header = {
+			.type = PERF_RECORD_AUX,
+			.misc = 0,
+			.size = sizeof(rec),
+		},
+		.offset		= head,
+		.size		= size,
+		.flags		= flags,
+	};
+	int ret;
+
+	perf_event_header__init_id(&rec.header, &sample, event);
+	ret = perf_output_begin(&handle, event, rec.header.size);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, rec);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
 /*
  * IRQ throttle logging
  */
@@ -5753,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time = perf_clock(),
+		.time = perf_event_clock(event),
 		.id = primary_event_id(event),
 		.stream_id = event->id,
 	};
@@ -5773,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+static void perf_log_itrace_start(struct perf_event *event)
+{
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	struct perf_aux_event {
+		struct perf_event_header	header;
+		u32				pid;
+		u32				tid;
+	} rec;
+	int ret;
+
+	if (event->parent)
+		event = event->parent;
+
+	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
+	    event->hw.itrace_started)
+		return;
+
+	event->hw.itrace_started = 1;
+
+	rec.header.type	= PERF_RECORD_ITRACE_START;
+	rec.header.misc	= 0;
+	rec.header.size	= sizeof(rec);
+	rec.pid	= perf_event_pid(event, current);
+	rec.tid	= perf_event_tid(event, current);
+
+	perf_event_header__init_id(&rec.header, &sample, event);
+	ret = perf_output_begin(&handle, event, rec.header.size);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, rec);
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
 /*
  * Generic event overflow handling, sampling.
  */
@@ -6133,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
 	}
 
 	hlist_add_head_rcu(&event->hlist_entry, head);
+	perf_event_update_userpage(event);
 
 	return 0;
 }
@@ -6296,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event)
 static struct pmu perf_swevent = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= perf_swevent_init,
 	.add		= perf_swevent_add,
 	.del		= perf_swevent_del,
@@ -6449,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event)
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	struct bpf_prog *prog;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	if (event->tp_event->prog)
+		return -EEXIST;
+
+	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+		/* bpf programs can only be attached to kprobes */
+		return -EINVAL;
+
+	prog = bpf_prog_get(prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
+		/* valid fd, but invalid bpf program type */
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	event->tp_event->prog = prog;
+
+	return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog *prog;
+
+	if (!event->tp_event)
+		return;
+
+	prog = event->tp_event->prog;
+	if (prog) {
+		event->tp_event->prog = NULL;
+		bpf_prog_put(prog);
+	}
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6464,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6602,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
 {
 	if (flags & PERF_EF_START)
 		cpu_clock_event_start(event, flags);
+	perf_event_update_userpage(event);
 
 	return 0;
 }
@@ -6638,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 static struct pmu perf_cpu_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= cpu_clock_event_init,
 	.add		= cpu_clock_event_add,
 	.del		= cpu_clock_event_del,
@@ -6676,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags)
 {
 	if (flags & PERF_EF_START)
 		task_clock_event_start(event, flags);
+	perf_event_update_userpage(event);
 
 	return 0;
 }
@@ -6716,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event)
 static struct pmu perf_task_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= task_clock_event_init,
 	.add		= task_clock_event_add,
 	.del		= task_clock_event_del,
@@ -6993,6 +7312,7 @@ got_cpu_context:
 		pmu->event_idx = perf_event_idx_default;
 
 	list_add_rcu(&pmu->entry, &pmus);
+	atomic_set(&pmu->exclusive_cnt, 0);
 	ret = 0;
 unlock:
 	mutex_unlock(&pmus_lock);
@@ -7037,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 {
+	struct perf_event_context *ctx = NULL;
 	int ret;
 
 	if (!try_module_get(pmu->module))
 		return -ENODEV;
+
+	if (event->group_leader != event) {
+		ctx = perf_event_ctx_lock(event->group_leader);
+		BUG_ON(!ctx);
+	}
+
 	event->pmu = pmu;
 	ret = pmu->event_init(event);
+
+	if (ctx)
+		perf_event_ctx_unlock(event->group_leader, ctx);
+
 	if (ret)
 		module_put(pmu->module);
 
@@ -7089,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
 	if (event->parent)
 		return;
 
-	if (has_branch_stack(event)) {
-		if (!(event->attach_state & PERF_ATTACH_TASK))
-			atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
-	}
 	if (is_cgroup_event(event))
 		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
 }
@@ -7131,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		 struct perf_event *group_leader,
 		 struct perf_event *parent_event,
 		 perf_overflow_handler_t overflow_handler,
-		 void *context)
+		 void *context, int cgroup_fd)
 {
 	struct pmu *pmu;
 	struct perf_event *event;
@@ -7186,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
-
-		if (attr->type == PERF_TYPE_TRACEPOINT)
-			event->hw.tp_target = task;
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
 		/*
-		 * hw_breakpoint is a bit difficult here..
+		 * XXX pmu::event_init needs to know what task to account to
+		 * and we cannot use the ctx information because we need the
+		 * pmu before we get a ctx.
 		 */
-		else if (attr->type == PERF_TYPE_BREAKPOINT)
-			event->hw.bp_target = task;
-#endif
+		event->hw.target = task;
 	}
 
+	event->clock = &local_clock;
+	if (parent_event)
+		event->clock = parent_event->clock;
+
 	if (!overflow_handler && parent_event) {
 		overflow_handler = parent_event->overflow_handler;
 		context = parent_event->overflow_handler_context;
@@ -7224,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto err_ns;
 
+	if (!has_branch_stack(event))
+		event->attr.branch_sample_type = 0;
+
+	if (cgroup_fd != -1) {
+		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+		if (err)
+			goto err_ns;
+	}
+
 	pmu = perf_init_event(event);
 	if (!pmu)
 		goto err_ns;
@@ -7232,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		goto err_ns;
 	}
 
+	err = exclusive_event_init(event);
+	if (err)
+		goto err_pmu;
+
 	if (!event->parent) {
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
 			err = get_callchain_buffers();
 			if (err)
-				goto err_pmu;
+				goto err_per_task;
 		}
 	}
 
 	return event;
 
+err_per_task:
+	exclusive_event_destroy(event);
+
 err_pmu:
 	if (event->destroy)
 		event->destroy(event);
 	module_put(pmu->module);
 err_ns:
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
 	if (event->ns)
 		put_pid_ns(event->ns);
 	kfree(event);
@@ -7409,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 		goto out;
 
+	/*
+	 * Mixing clocks in the same buffer is trouble you don't need.
+	 */
+	if (output_event->clock != event->clock)
+		goto out;
+
+	/*
+	 * If both events generate aux data, they must be on the same PMU
+	 */
+	if (has_aux(event) && has_aux(output_event) &&
+	    event->pmu != output_event->pmu)
+		goto out;
+
 set:
 	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
@@ -7441,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
 }
 
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+	bool nmi_safe = false;
+
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		event->clock = &ktime_get_mono_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_MONOTONIC_RAW:
+		event->clock = &ktime_get_raw_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_REALTIME:
+		event->clock = &ktime_get_real_ns;
+		break;
+
+	case CLOCK_BOOTTIME:
+		event->clock = &ktime_get_boot_ns;
+		break;
+
+	case CLOCK_TAI:
+		event->clock = &ktime_get_tai_ns;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+		return -EINVAL;
+
+	return 0;
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
@@ -7465,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	int move_group = 0;
 	int err;
 	int f_flags = O_RDWR;
+	int cgroup_fd = -1;
 
 	/* for future expandability... */
 	if (flags & ~PERF_FLAG_ALL)
@@ -7530,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	get_online_cpus();
 
+	if (flags & PERF_FLAG_PID_CGROUP)
+		cgroup_fd = pid;
+
 	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
-				 NULL, NULL);
+				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err_cpus;
 	}
 
-	if (flags & PERF_FLAG_PID_CGROUP) {
-		err = perf_cgroup_connect(pid, event, &attr, group_leader);
-		if (err) {
-			__free_event(event);
-			goto err_cpus;
-		}
-	}
-
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 			err = -ENOTSUPP;
@@ -7560,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	 */
 	pmu = event->pmu;
 
+	if (attr.use_clockid) {
+		err = perf_event_set_clock(event, attr.clockid);
+		if (err)
+			goto err_alloc;
+	}
+
 	if (group_leader &&
 	    (is_software_event(event) != is_software_event(group_leader))) {
 		if (is_software_event(event)) {
@@ -7586,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open,
 	/*
 	 * Get the target context (task or percpu):
 	 */
-	ctx = find_get_context(pmu, task, event->cpu);
+	ctx = find_get_context(pmu, task, event);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto err_alloc;
 	}
 
+	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
+		err = -EBUSY;
+		goto err_context;
+	}
+
 	if (task) {
 		put_task_struct(task);
 		task = NULL;
@@ -7609,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		 */
 		if (group_leader->group_leader != group_leader)
 			goto err_context;
+
+		/* All events in a group should have the same clock */
+		if (group_leader->clock != event->clock)
+			goto err_context;
+
 		/*
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:
@@ -7709,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open,
 		get_ctx(ctx);
 	}
 
+	if (!exclusive_event_installable(event, ctx)) {
+		err = -EBUSY;
+		mutex_unlock(&ctx->mutex);
+		fput(event_file);
+		goto err_context;
+	}
+
 	perf_install_in_context(ctx, event, event->cpu);
 	perf_unpin_context(ctx);
 
@@ -7781,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 */
 
 	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
-				 overflow_handler, context);
+				 overflow_handler, context, -1);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err;
@@ -7792,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	account_event(event);
 
-	ctx = find_get_context(event->pmu, task, cpu);
+	ctx = find_get_context(event->pmu, task, event);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto err_free;
@@ -7800,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
+	if (!exclusive_event_installable(event, ctx)) {
+		mutex_unlock(&ctx->mutex);
+		perf_unpin_context(ctx);
+		put_ctx(ctx);
+		err = -EBUSY;
+		goto err_free;
+	}
+
 	perf_install_in_context(ctx, event, cpu);
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
@@ -8142,7 +8564,7 @@ inherit_event(struct perf_event *parent_event,
 					   parent_event->cpu,
 					   child,
 					   group_leader, parent_event,
-					   NULL, NULL);
+					   NULL, NULL, -1);
 	if (IS_ERR(child_event))
 		return child_event;
 