@@ -2249,7 +2249,7 @@ static int __perf_install_in_context(void *info)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
-	bool activate = true;
+	bool reprogram = true;
 	int ret = 0;
 
 	raw_spin_lock(&cpuctx->ctx.lock);
@@ -2257,27 +2257,26 @@ static int __perf_install_in_context(void *info)
 		raw_spin_lock(&ctx->lock);
 		task_ctx = ctx;
 
-		/* If we're on the wrong CPU, try again */
-		if (task_cpu(ctx->task) != smp_processor_id()) {
-			ret = -ESRCH;
-			goto unlock;
-		}
+		reprogram = (ctx->task == current);
 
 		/*
-		 * If we're on the right CPU, see if the task we target is
-		 * current, if not we don't have to activate the ctx, a future
-		 * context switch will do that for us.
+		 * If the task is running, it must be running on this CPU,
+		 * otherwise we cannot reprogram things.
+		 *
+		 * If its not running, we don't care, ctx->lock will
+		 * serialize against it becoming runnable.
 		 */
-		if (ctx->task != current)
-			activate = false;
-		else
-			WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+		if (task_curr(ctx->task) && !reprogram) {
+			ret = -ESRCH;
+			goto unlock;
+		}
 
+		WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
 	} else if (task_ctx) {
 		raw_spin_lock(&task_ctx->lock);
 	}
 
-	if (activate) {
+	if (reprogram) {
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 		add_event_to_ctx(event, ctx);
 		ctx_resched(cpuctx, task_ctx);
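
For reference (this helper is not part of the patch), task_curr() in
kernel/sched/core.c is roughly:

	inline int task_curr(const struct task_struct *p)
	{
		return cpu_curr(task_cpu(p)) == p;
	}

It is an unlocked peek at which task a CPU is currently running, so by itself
it can race with the scheduler; the new comment above and the retry logic in
perf_install_in_context() below are what make that race harmless.
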
@@ -2328,13 +2327,36 @@ perf_install_in_context(struct perf_event_context *ctx,
 	/*
 	 * Installing events is tricky because we cannot rely on ctx->is_active
 	 * to be set in case this is the nr_events 0 -> 1 transition.
+	 *
+	 * Instead we use task_curr(), which tells us if the task is running.
+	 * However, since we use task_curr() outside of rq::lock, we can race
+	 * against the actual state. This means the result can be wrong.
+	 *
+	 * If we get a false positive, we retry, this is harmless.
+	 *
+	 * If we get a false negative, things are complicated. If we are after
+	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
+	 * value must be correct. If we're before, it doesn't matter since
+	 * perf_event_context_sched_in() will program the counter.
+	 *
+	 * However, this hinges on the remote context switch having observed
+	 * our task->perf_event_ctxp[] store, such that it will in fact take
+	 * ctx::lock in perf_event_context_sched_in().
+	 *
+	 * We do this by task_function_call(), if the IPI fails to hit the task
+	 * we know any future context switch of task must see the
+	 * perf_event_ctpx[] store.
 	 */
-again:
+
 	/*
-	 * Cannot use task_function_call() because we need to run on the task's
-	 * CPU regardless of whether its current or not.
+	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
+	 * task_cpu() load, such that if the IPI then does not find the task
+	 * running, a future context switch of that task must observe the
+	 * store.
 	 */
-	if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+	smp_mb();
+again:
+	if (!task_function_call(task, __perf_install_in_context, event))
 		return;
 
 	raw_spin_lock_irq(&ctx->lock);
@@ -2348,12 +2370,16 @@ again:
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
-	raw_spin_unlock_irq(&ctx->lock);
 	/*
-	 * Since !ctx->is_active doesn't mean anything, we must IPI
-	 * unconditionally.
+	 * If the task is not running, ctx->lock will avoid it becoming so,
+	 * thus we can safely install the event.
 	 */
-	goto again;
+	if (task_curr(task)) {
+		raw_spin_unlock_irq(&ctx->lock);
+		goto again;
+	}
+	add_event_to_ctx(event, ctx);
+	raw_spin_unlock_irq(&ctx->lock);
 }
 
 /*
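
The ordering argument in the two new comments is the classic store-buffering
pattern: one side publishes the context pointer and then checks whether the
task is running, the other side starts running the task and then looks for the
context pointer, and a full barrier on each side guarantees that at most one
of the two can miss the other's store. A minimal userspace sketch of that
pattern, with made-up names standing in for the kernel state (an illustration,
not kernel code):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int ctxp;      /* stands in for task->perf_event_ctxp[] */
	static atomic_int running;   /* stands in for "task is current on some CPU" */
	static int saw_running, saw_ctxp;

	/* perf_install_in_context() side: publish ctxp, then check the task. */
	static void *installer(void *arg)
	{
		(void)arg;
		atomic_store_explicit(&ctxp, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);	/* the smp_mb() */
		saw_running = atomic_load_explicit(&running, memory_order_relaxed);
		return NULL;
	}

	/* context-switch side: mark the task running, then look for ctxp. */
	static void *context_switch(void *arg)
	{
		(void)arg;
		atomic_store_explicit(&running, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);	/* scheduler barrier */
		saw_ctxp = atomic_load_explicit(&ctxp, memory_order_relaxed);
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, installer, NULL);
		pthread_create(&b, NULL, context_switch, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		/* The two fences forbid saw_running == 0 && saw_ctxp == 0. */
		printf("installer saw running=%d, switch saw ctxp=%d\n",
		       saw_running, saw_ctxp);
		return 0;
	}

If the installer misses the "running" store, the eventual context switch is
guaranteed to see the context pointer and will take ctx->lock; if it does see
the task running, it simply retries via the IPI.
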
@@ -7034,25 +7060,12 @@ static void perf_log_itrace_start(struct perf_event *event)
 	perf_output_end(&handle);
 }
 
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
-				   int throttle, struct perf_sample_data *data,
-				   struct pt_regs *regs)
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
 {
-	int events = atomic_read(&event->event_limit);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 seq;
 	int ret = 0;
-
-	/*
-	 * Non-sampling counters might still use the PMI to fold short
-	 * hardware counters, ignore those.
-	 */
-	if (unlikely(!is_sampling_event(event)))
-		return 0;
+	u64 seq;
 
 	seq = __this_cpu_read(perf_throttled_seq);
 	if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7093,34 @@ static int __perf_event_overflow(struct perf_event *event,
 		perf_adjust_period(event, delta, hwc->last_period, true);
 	}
 
+	return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+	return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+				   int throttle, struct perf_sample_data *data,
+				   struct pt_regs *regs)
+{
+	int events = atomic_read(&event->event_limit);
+	int ret = 0;
+
+	/*
+	 * Non-sampling counters might still use the PMI to fold short
+	 * hardware counters, ignore those.
+	 */
+	if (unlikely(!is_sampling_event(event)))
+		return 0;
+
+	ret = __perf_event_account_interrupt(event, throttle);
+
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
 	 * events
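
The point of splitting out and exporting perf_event_account_interrupt() is
that a PMU driver which has to discard a hardware record can still feed that
interrupt into the throttling bookkeeping. A hypothetical driver-side fragment
(example_handle_bad_record() and example_pmu_stop() are made up for
illustration):

	static void example_handle_bad_record(struct perf_event *event)
	{
		/*
		 * No usable sample, so __perf_event_overflow() is never
		 * reached, but the interrupt still counts toward throttling.
		 * A non-zero return means the event got throttled and should
		 * be stopped by the caller.
		 */
		if (perf_event_account_interrupt(event))
			example_pmu_stop(event);
	}
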
@@ -9503,6 +9544,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
 	return 0;
 }
 
+/*
+ * Variation on perf_event_ctx_lock_nested(), except we take two context
+ * mutexes.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+			     struct perf_event_context *ctx)
+{
+	struct perf_event_context *gctx;
+
+again:
+	rcu_read_lock();
+	gctx = READ_ONCE(group_leader->ctx);
+	if (!atomic_inc_not_zero(&gctx->refcount)) {
+		rcu_read_unlock();
+		goto again;
+	}
+	rcu_read_unlock();
+
+	mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+	if (group_leader->ctx != gctx) {
+		mutex_unlock(&ctx->mutex);
+		mutex_unlock(&gctx->mutex);
+		put_ctx(gctx);
+		goto again;
+	}
+
+	return gctx;
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
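
__perf_event_ctx_lock_double() pins gctx with atomic_inc_not_zero() before
taking both mutexes, which is why the later hunks switch the unlock side from
a bare mutex_unlock(&gctx->mutex) to perf_event_ctx_unlock(). That helper
already exists in core.c and is roughly:

	static void perf_event_ctx_unlock(struct perf_event *event,
					  struct perf_event_context *ctx)
	{
		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);
	}

so it drops the mutex and the reference taken by the double-lock together.
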
@@ -9746,12 +9818,31 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	if (move_group) {
-		gctx = group_leader->ctx;
-		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+		gctx = __perf_event_ctx_lock_double(group_leader, ctx);
+
 		if (gctx->task == TASK_TOMBSTONE) {
 			err = -ESRCH;
 			goto err_locked;
 		}
+
+		/*
+		 * Check if we raced against another sys_perf_event_open() call
+		 * moving the software group underneath us.
+		 */
+		if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+			/*
+			 * If someone moved the group out from under us, check
+			 * if this new event wound up on the same ctx, if so
+			 * its the regular !move_group case, otherwise fail.
+			 */
+			if (gctx != ctx) {
+				err = -EINVAL;
+				goto err_locked;
+			} else {
+				perf_event_ctx_unlock(group_leader, gctx);
+				move_group = 0;
+			}
+		}
 	} else {
 		mutex_lock(&ctx->mutex);
 	}
@@ -9853,7 +9944,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	perf_unpin_context(ctx);
 
 	if (move_group)
-		mutex_unlock(&gctx->mutex);
+		perf_event_ctx_unlock(group_leader, gctx);
 	mutex_unlock(&ctx->mutex);
 
 	if (task) {
@@ -9879,7 +9970,7 @@ SYSCALL_DEFINE5(perf_event_open,
 
 err_locked:
 	if (move_group)
-		mutex_unlock(&gctx->mutex);
+		perf_event_ctx_unlock(group_leader, gctx);
 	mutex_unlock(&ctx->mutex);
 /* err_file: */
 	fput(event_file);