
Merge tag 'perf-core-for-mingo-20160427' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

User visible changes:

- perf trace --pf maj/min/all now works with --call-graph (Arnaldo Carvalho de Melo):

  Tracing write syscalls and major page faults with callchains while starting
  firefox, limiting the stack to 5 frames:

 # perf trace -e write --pf maj --max-stack 5 firefox
   589.549 ( 0.014 ms): firefox/15377 write(fd: 4, buf: 0x7fff80acc898, count: 151) = 151
                                       [0xfaed] (/usr/lib64/libpthread-2.22.so)
                                       fire_glxtest_process+0x5c (/usr/lib64/firefox/libxul.so)
                                       InstallGdkErrorHandler+0x41 (/usr/lib64/firefox/libxul.so)
                                       XREMain::XRE_mainInit+0x12c (/usr/lib64/firefox/libxul.so)
                                       XREMain::XRE_main+0x1e4 (/usr/lib64/firefox/libxul.so)
   760.704 ( 0.000 ms): firefox/15332 majfault [gtk_tree_view_accessible_get_type+0x0] => /usr/lib64/libgtk-3.so.0.1800.9@0xa0850 (x.)
                                       gtk_tree_view_accessible_get_type+0x0 (/usr/lib64/libgtk-3.so.0.1800.9)
                                       gtk_tree_view_class_intern_init+0x1a54 (/usr/lib64/libgtk-3.so.0.1800.9)
                                       g_type_class_ref+0x6dd (/usr/lib64/libgobject-2.0.so.0.4600.2)
                                       [0x115378] (/usr/lib64/libgnutls.so.30.6.3)

  This automagically selects "--call-graph dwarf"; use "--call-graph fp" on systems
  where -fno-omit-frame-pointer was used to build the components of interest, to
  incur less overhead, or tune "--call-graph dwarf" appropriately, see 'perf record --help'.
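
  For instance, a frame-pointer variant of the example above (an illustrative
  command line, assuming firefox and its libraries were built with
  -fno-omit-frame-pointer):

   # perf trace -e write --pf maj --max-stack 5 --call-graph fp firefox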

- Add /proc/sys/kernel/perf_event_max_stack, which defaults to the old hard-coded value
  of PERF_MAX_STACK_DEPTH (127). This is useful for the huge callstacks of things like
  Groovy, Ruby, etc., and also to reduce overhead by limiting it to a smaller value;
  upcoming work will allow this to be done per-event (Arnaldo Carvalho de Melo)
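
  A quick illustration (the value written below is hypothetical):

   # cat /proc/sys/kernel/perf_event_max_stack
   127
   # echo 512 > /proc/sys/kernel/perf_event_max_stack

  Writing fails with -EBUSY while any event with callchains enabled is in use,
  see the perf_event_max_stack_handler() hunk in kernel/events/callchain.c below.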

- Make 'perf trace --min-stack' be honoured by --pf and --event (Arnaldo Carvalho de Melo)
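
  E.g., skipping page faults with fewer than 8 frames while still capping the
  depth (illustrative command line):

   # perf trace --pf maj --min-stack 8 --max-stack 16 firefox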

- Make 'perf evlist -v' decode perf_event_attr->branch_sample_type (Arnaldo Carvalho de Melo)

   # perf record --call lbr usleep 1
   # perf evlist -v
   cycles:ppp: ... sample_type: IP|TID|TIME|CALLCHAIN|PERIOD|BRANCH_STACK, ...
            branch_sample_type: USER|CALL_STACK|NO_FLAGS|NO_CYCLES
   #

- Clear the dummy entry's accumulated period, fixing 'perf top/report' output
  such as: (Kan Liang)

    4769.98%  0.01%  0.00%  0.01%  tchain_edit  [kernel] [k] update_fast_timekeeper

- System calls with pid_t arguments get them augmented with the COMM event
  more thoroughly:

  # trace -e perf_event_open perf stat -e cycles -p 15608
   6.876 ( 0.014 ms): perf_event_open(attr_uptr: 0x2ae20d8, pid: 15608 (hexchat), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 3
   6.882 ( 0.005 ms): perf_event_open(attr_uptr: 0x2ae20d8, pid: 15639 (gmain), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 4
   6.889 ( 0.005 ms): perf_event_open(attr_uptr: 0x2ae20d8, pid: 15640 (gdbus), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 5
                                                            ^^^^^^^^^^^^^^^^^^
   ^C

- Fix offline module name mismatch issue in 'perf probe' (Ravi Bangoria)

- Fix module probe issue if no dwarf support (Ravi Bangoria)
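
  E.g., probing a module specified by file path rather than by name (module
  path and function here are hypothetical):

   # perf probe -m ./kobject-example.ko -a foo_show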

Assorted fixes:

- Fix off-by-one in write_buildid() (Andrey Ryabinin)

- Fix segfault when printing callchains in 'perf script' (Chris Phlipot)

- Replace assignment with comparison on assert check in 'perf test' entry (Colin Ian King)

- Fix off-by-one comparison in intel-pt code (Colin Ian King)

- Close target file on error path in 'perf probe' (Masami Hiramatsu)

- Set default kprobe group name if not given in 'perf probe' (Masami Hiramatsu)

- Avoid partial perf_event_header reads (Wang Nan)

Infrastructure changes:

- Update x86's syscall_64.tbl copy, adding preadv2 & pwritev2 (Arnaldo Carvalho de Melo)
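
  With the updated table 'perf trace' can resolve the new syscall names, e.g.
  (illustrative, system-wide):

   # perf trace -a -e preadv2,pwritev2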

- Make the x86 'clean' target quiet wrt syscall table removal (Jiri Olsa)

Cleanups:

- Simplify wrapper for LOCK_PI in 'perf bench futex' (Davidlohr Bueso)

- Remove duplicate const qualifier (Eric Engestrom)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Committed by Ingo Molnar 9 years ago (commit a8944c5bf8)
49 changed files with 475 additions and 179 deletions

+ 14 - 0
Documentation/sysctl/kernel.txt

@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value

+ 1 - 1
arch/arm/kernel/perf_callchain.c

@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+	while ((entry->nr < sysctl_perf_event_max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }

+ 2 - 2
arch/arm64/kernel/perf_callchain.c

@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		while (entry->nr < sysctl_perf_event_max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+		while ((entry->nr < sysctl_perf_event_max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif

+ 1 - 1
arch/metag/kernel/perf_callchain.c

@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 

+ 2 - 2
arch/mips/kernel/perf_event.c

@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= PERF_MAX_STACK_DEPTH)
+			if (entry->nr >= sysctl_perf_event_max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= PERF_MAX_STACK_DEPTH)
+		if (entry->nr >= sysctl_perf_event_max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);

+ 2 - 2
arch/powerpc/perf/callchain.c

@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;

+ 3 - 3
arch/sparc/kernel/perf_event.c

@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void

+ 2 - 2
arch/x86/events/core.c

@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;

+ 2 - 2
arch/xtensa/kernel/perf_event.c

@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
 				callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
 			      callchain_trace, entry);
 }
 

+ 6 - 2
include/linux/perf_event.h

@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
 	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
+	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH) {
+	if (entry->nr < sysctl_perf_event_max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {

+ 4 - 4
kernel/bpf/stackmap.c

@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+	    value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / 8;
-	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
-	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 		return -EFAULT;
 
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
 	trace_nr = trace->nr - init_nr;
 

+ 33 - 2
kernel/events/callchain.c

@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+	return (sizeof(struct perf_callchain_entry) +
+		sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 
 	cpu = smp_processor_id();
 
-	return &entries->cpu_entries[cpu][*rctx];
+	return (((void *)entries->cpu_entries[cpu]) +
+		(*rctx * perf_callchain_entry__sizeof()));
 }
 
 static void
@@ -215,3 +224,25 @@ exit_put:
 
 	return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int new_value = sysctl_perf_event_max_stack, ret;
+	struct ctl_table new_table = *table;
+
+	new_table.data = &new_value;
+	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&callchain_mutex);
+	if (atomic_read(&nr_callchain_events))
+		ret = -EBUSY;
+	else
+		sysctl_perf_event_max_stack = new_value;
+
+	mutex_unlock(&callchain_mutex);
+
+	return ret;
+}

+ 12 - 0
kernel/sysctl.c

@@ -130,6 +130,9 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
 
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "perf_event_max_stack",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+		.extra2		= &six_hundred_forty_kb,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{

+ 2 - 1
tools/Makefile

@@ -137,7 +137,8 @@ libsubcmd_clean:
 	$(call descend,lib/subcmd,clean)
 
 perf_clean:
-	$(call descend,$(@:_clean=),clean)
+	$(Q)mkdir -p $(PERF_O) .
+	$(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean
 
 selftests_clean:
 	$(call descend,testing/$(@:_clean=),clean)

+ 13 - 0
tools/lib/api/fs/fs.c

@@ -351,6 +351,19 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep)
 	return err;
 }
 
+int procfs__read_str(const char *entry, char **buf, size_t *sizep)
+{
+	char path[PATH_MAX];
+	const char *procfs = procfs__mountpoint();
+
+	if (!procfs)
+		return -1;
+
+	snprintf(path, sizeof(path), "%s/%s", procfs, entry);
+
+	return filename__read_str(path, buf, sizep);
+}
+
 int sysfs__read_ull(const char *entry, unsigned long long *value)
 {
 	char path[PATH_MAX];

+ 2 - 0
tools/lib/api/fs/fs.h

@@ -29,6 +29,8 @@ int filename__read_int(const char *filename, int *value);
 int filename__read_ull(const char *filename, unsigned long long *value);
 int filename__read_str(const char *filename, char **buf, size_t *sizep);
 
+int procfs__read_str(const char *entry, char **buf, size_t *sizep);
+
 int sysctl__read_int(const char *sysctl, int *value);
 int sysfs__read_int(const char *entry, int *value);
 int sysfs__read_ull(const char *entry, unsigned long long *value);

+ 1 - 1
tools/perf/Documentation/perf-report.txt

@@ -248,7 +248,7 @@ OPTIONS
 	Note that when using the --itrace option the synthesized callchain size
 	will override this value if the synthesized callchain size is bigger.
 
-	Default: 127
+	Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
 
 -G::
 --inverted::

+ 1 - 1
tools/perf/Documentation/perf-script.txt

@@ -267,7 +267,7 @@ include::itrace.txt[]
         Note that when using the --itrace option the synthesized callchain size
         will override this value if the synthesized callchain size is bigger.
 
-        Default: 127
+        Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
 
 --ns::
 	Use 9 decimal places when displaying time (i.e. show the nanoseconds)

+ 1 - 1
tools/perf/Documentation/perf-top.txt

@@ -177,7 +177,7 @@ Default is to monitor all CPUS.
 	between information loss and faster processing especially for
 	workloads that can have a very long callchain stack.
 
-	Default: 127
+	Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
 
 --ignore-callees=<regex>::
         Ignore callees of the function(s) matching the given regex.

+ 1 - 1
tools/perf/Documentation/perf-trace.txt

@@ -143,7 +143,7 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
         Implies '--call-graph dwarf' when --call-graph not present on the
         command line, on systems where DWARF unwinding was built in.
 
-        Default: 127
+        Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise.
 
 --min-stack::
         Set the stack depth limit when parsing the callchain, anything

+ 1 - 1
tools/perf/arch/x86/Makefile

@@ -24,6 +24,6 @@ $(header): $(sys)/syscall_64.tbl $(systbl)
 	$(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@
 
 clean::
-	rm -f $(header)
+	$(call QUIET_CLEAN, x86) $(RM) $(header)
 
 archheaders: $(header)

+ 2 - 0
tools/perf/arch/x86/entry/syscalls/syscall_64.tbl

@@ -333,6 +333,8 @@
 324	common	membarrier		sys_membarrier
 325	common	mlock2			sys_mlock2
 326	common	copy_file_range		sys_copy_file_range
+327	64	preadv2			sys_preadv2
+328	64	pwritev2		sys_pwritev2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact

+ 1 - 1
tools/perf/bench/futex-lock-pi.c

@@ -83,7 +83,7 @@ static void *workerfn(void *arg)
 	do {
 		int ret;
 	again:
-		ret = futex_lock_pi(w->futex, NULL, 0, futex_flag);
+		ret = futex_lock_pi(w->futex, NULL, futex_flag);
 
 		if (ret) { /* handle lock acquisition */
 			if (!silent)

+ 2 - 4
tools/perf/bench/futex.h

@@ -57,13 +57,11 @@ futex_wake(u_int32_t *uaddr, int nr_wake, int opflags)
 
 /**
  * futex_lock_pi() - block on uaddr as a PI mutex
- * @detect:	whether (1) or not (0) to perform deadlock detection
  */
 static inline int
-futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int detect,
-	      int opflags)
+futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags)
 {
-	return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags);
+	return futex(uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0, opflags);
 }
 
 /**

+ 15 - 7
tools/perf/bench/mem-functions.c

@@ -6,6 +6,7 @@
  * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
  */
 
+#include "debug.h"
 #include "../perf.h"
 #include "../util/util.h"
 #include <subcmd/parse-options.h>
@@ -63,14 +64,16 @@ static struct perf_event_attr cycle_attr = {
 	.config		= PERF_COUNT_HW_CPU_CYCLES
 };
 
-static void init_cycles(void)
+static int init_cycles(void)
 {
 	cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag());
 
-	if (cycles_fd < 0 && errno == ENOSYS)
-		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
-	else
-		BUG_ON(cycles_fd < 0);
+	if (cycles_fd < 0 && errno == ENOSYS) {
+		pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
+		return -1;
+	}
+
+	return cycles_fd;
 }
 
 static u64 get_cycles(void)
@@ -155,8 +158,13 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *
 
 	argc = parse_options(argc, argv, options, info->usage, 0);
 
-	if (use_cycles)
-		init_cycles();
+	if (use_cycles) {
+		i = init_cycles();
+		if (i < 0) {
+			fprintf(stderr, "Failed to open cycles counter\n");
+			return i;
+		}
+	}
 
 	size = (size_t)perf_atoll((char *)size_str);
 	size_total = (double)size * nr_loops;

+ 2 - 2
tools/perf/builtin-report.c

@@ -691,7 +691,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 			.ordered_events	 = true,
 			.ordering_requires_timestamps = true,
 		},
-		.max_stack		 = PERF_MAX_STACK_DEPTH,
+		.max_stack		 = sysctl_perf_event_max_stack,
 		.pretty_printing_style	 = "normal",
 		.socket_filter		 = -1,
 	};
@@ -744,7 +744,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_INTEGER(0, "max-stack", &report.max_stack,
 		    "Set the maximum stack depth when parsing the callchain, "
 		    "anything beyond the specified depth will be ignored. "
-		    "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
+		    "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
 	OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
 		    "alias for inverted call graph"),
 	OPT_CALLBACK(0, "ignore-callees", NULL, "regex",

+ 9 - 7
tools/perf/builtin-script.c

@@ -570,12 +570,12 @@ static void print_sample_bts(struct perf_sample *sample,
 	/* print branch_from information */
 	if (PRINT_FIELD(IP)) {
 		unsigned int print_opts = output[attr->type].print_ip_opts;
-		struct callchain_cursor *cursor = NULL, cursor_callchain;
+		struct callchain_cursor *cursor = NULL;
 
 		if (symbol_conf.use_callchain && sample->callchain &&
-		    thread__resolve_callchain(al->thread, &cursor_callchain, evsel,
+		    thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
 					      sample, NULL, NULL, scripting_max_stack) == 0)
-			cursor = &cursor_callchain;
+			cursor = &callchain_cursor;
 
 		if (cursor == NULL) {
 			putchar(' ');
@@ -789,12 +789,12 @@ static void process_event(struct perf_script *script,
 		printf("%16" PRIu64, sample->weight);
 
 	if (PRINT_FIELD(IP)) {
-		struct callchain_cursor *cursor = NULL, cursor_callchain;
+		struct callchain_cursor *cursor = NULL;
 
 		if (symbol_conf.use_callchain && sample->callchain &&
-		    thread__resolve_callchain(al->thread, &cursor_callchain, evsel,
+		    thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
 					      sample, NULL, NULL, scripting_max_stack) == 0)
-			cursor = &cursor_callchain;
+			cursor = &callchain_cursor;
 
 		putchar(cursor ? '\n' : ' ');
 		sample__fprintf_sym(sample, al, 0, output[attr->type].print_ip_opts, cursor, stdout);
@@ -2031,7 +2031,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_UINTEGER(0, "max-stack", &scripting_max_stack,
 		     "Set the maximum stack depth when parsing the callchain, "
 		     "anything beyond the specified depth will be ignored. "
-		     "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
+		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
 	OPT_BOOLEAN('I', "show-info", &show_full_info,
 		    "display extended information from perf.data file"),
 	OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path,
@@ -2067,6 +2067,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		NULL
 	};
 
+	scripting_max_stack = sysctl_perf_event_max_stack;
+
 	setup_scripting();
 
 	argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage,

+ 2 - 2
tools/perf/builtin-top.c

@@ -1103,7 +1103,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
 			},
 			.proc_map_timeout    = 500,
 		},
-		.max_stack	     = PERF_MAX_STACK_DEPTH,
+		.max_stack	     = sysctl_perf_event_max_stack,
 		.sym_pcnt_filter     = 5,
 	};
 	struct record_opts *opts = &top.record_opts;
@@ -1171,7 +1171,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "Accumulate callchains of children and show total overhead as well"),
 	OPT_INTEGER(0, "max-stack", &top.max_stack,
 		    "Set the maximum stack depth when parsing the callchain. "
-		    "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
+		    "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
 	OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
 		   "ignore callees of these functions in call graphs",
 		   report_parse_ignore_callees_opt),

+ 88 - 86
tools/perf/builtin-trace.c

@@ -56,22 +56,6 @@
 # define MSG_CMSG_CLOEXEC	0x40000000
 #endif
 
-#ifndef PERF_FLAG_FD_NO_GROUP
-# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
-#endif
-
-#ifndef PERF_FLAG_FD_OUTPUT
-# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
-#endif
-
-#ifndef PERF_FLAG_PID_CGROUP
-# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
-#endif
-
-#ifndef PERF_FLAG_FD_CLOEXEC
-# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
-#endif
-
 struct trace {
 	struct perf_tool	tool;
 	struct syscalltbl	*sctbl;
@@ -674,34 +658,6 @@ static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
 
 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
 
-static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
-						struct syscall_arg *arg)
-{
-	int printed = 0, flags = arg->val;
-
-	if (flags == 0)
-		return 0;
-
-#define	P_FLAG(n) \
-	if (flags & PERF_FLAG_##n) { \
-		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
-		flags &= ~PERF_FLAG_##n; \
-	}
-
-	P_FLAG(FD_NO_GROUP);
-	P_FLAG(FD_OUTPUT);
-	P_FLAG(PID_CGROUP);
-	P_FLAG(FD_CLOEXEC);
-#undef P_FLAG
-
-	if (flags)
-		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
-
-	return printed;
-}
-
-#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
-
 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 						struct syscall_arg *arg)
 {
@@ -894,6 +850,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 #include "trace/beauty/pid.c"
 #include "trace/beauty/mmap.c"
 #include "trace/beauty/mode_t.c"
+#include "trace/beauty/perf_event_open.c"
 #include "trace/beauty/sched_policy.c"
 #include "trace/beauty/socket_type.c"
 #include "trace/beauty/waitid_options.c"
@@ -1086,8 +1043,7 @@ static struct syscall_fmt {
 			     [1] = SCA_FILENAME, /* filename */
 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
 	{ .name	    = "perf_event_open", .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_INT, /* pid */
-			     [2] = SCA_INT, /* cpu */
+	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
 			     [3] = SCA_FD,  /* group_fd */
 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
 	{ .name	    = "pipe2",	    .errmsg = true,
@@ -2126,6 +2082,17 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
 				union perf_event *event __maybe_unused,
 				struct perf_sample *sample)
 {
+	int callchain_ret = 0;
+
+	if (sample->callchain) {
+		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+		if (callchain_ret == 0) {
+			if (callchain_cursor.nr < trace->min_stack)
+				goto out;
+			callchain_ret = 1;
+		}
+	}
+
 	trace__printf_interrupted_entry(trace, sample);
 	trace__fprintf_tstamp(trace, sample->time, trace->output);
 
@@ -2144,11 +2111,11 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
 
 	fprintf(trace->output, ")\n");
 
-	if (sample->callchain) {
-		if (trace__resolve_callchain(trace, evsel, sample, &callchain_cursor) == 0)
-			trace__fprintf_callchain(trace, sample);
-	}
-
+	if (callchain_ret > 0)
+		trace__fprintf_callchain(trace, sample);
+	else if (callchain_ret < 0)
+		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
+out:
 	return 0;
 }
 
@@ -2179,8 +2146,19 @@ static int trace__pgfault(struct trace *trace,
 	char map_type = 'd';
 	struct thread_trace *ttrace;
 	int err = -1;
+	int callchain_ret = 0;
 
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+
+	if (sample->callchain) {
+		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+		if (callchain_ret == 0) {
+			if (callchain_cursor.nr < trace->min_stack)
+				goto out_put;
+			callchain_ret = 1;
+		}
+	}
+
 	ttrace = thread__trace(thread, trace->output);
 	if (ttrace == NULL)
 		goto out_put;
@@ -2222,6 +2200,11 @@ static int trace__pgfault(struct trace *trace,
 	print_location(trace->output, sample, &al, true, false);
 
 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
+
+	if (callchain_ret > 0)
+		trace__fprintf_callchain(trace, sample);
+	else if (callchain_ret < 0)
+		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
 out:
 	err = 0;
 out_put:
@@ -2381,8 +2364,7 @@ static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
 	return true;
 }
 
-static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
-				    u64 config)
+static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
 {
 	struct perf_evsel *evsel;
 	struct perf_event_attr attr = {
@@ -2396,13 +2378,10 @@ static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
 	event_attr_init(&attr);
 
 	evsel = perf_evsel__new(&attr);
-	if (!evsel)
-		return -ENOMEM;
+	if (evsel)
+		evsel->handler = trace__pgfault;
 
-	evsel->handler = trace__pgfault;
-	perf_evlist__add(evlist, evsel);
-
-	return 0;
+	return evsel;
 }
 
 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
@@ -2504,7 +2483,7 @@ out_enomem:
 static int trace__run(struct trace *trace, int argc, const char **argv)
 {
 	struct perf_evlist *evlist = trace->evlist;
-	struct perf_evsel *evsel;
+	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
 	int err = -1, i;
 	unsigned long before;
 	const bool forks = argc > 0;
@@ -2518,14 +2497,19 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	if (trace->trace_syscalls)
 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
 
-	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
-	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
-		goto out_error_mem;
+	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
+		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
+		if (pgfault_maj == NULL)
+			goto out_error_mem;
+		perf_evlist__add(evlist, pgfault_maj);
 	}
 
-	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
-	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
-		goto out_error_mem;
+	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
+		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
+		if (pgfault_min == NULL)
+			goto out_error_mem;
+		perf_evlist__add(evlist, pgfault_min);
+	}
 
 	if (trace->sched &&
 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
@@ -2546,24 +2530,42 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 
 	perf_evlist__config(evlist, &trace->opts, NULL);
 
-	if (callchain_param.enabled && trace->syscalls.events.sys_exit) {
-		perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
-					     &trace->opts, &callchain_param);
-               /*
-                * Now we have evsels with different sample_ids, use
-                * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
-                * from a fixed position in each ring buffer record.
-                *
-                * As of this the changeset introducing this comment, this
-                * isn't strictly needed, as the fields that can come before
-                * PERF_SAMPLE_ID are all used, but we'll probably disable
-                * some of those for things like copying the payload of
-                * pointer syscall arguments, and for vfs_getname we don't
-                * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
-                * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
-                */
-		perf_evlist__set_sample_bit(evlist, IDENTIFIER);
-		perf_evlist__reset_sample_bit(evlist, ID);
+	if (callchain_param.enabled) {
+		bool use_identifier = false;
+
+		if (trace->syscalls.events.sys_exit) {
+			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
+						     &trace->opts, &callchain_param);
+			use_identifier = true;
+		}
+
+		if (pgfault_maj) {
+			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
+			use_identifier = true;
+		}
+
+		if (pgfault_min) {
+			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
+			use_identifier = true;
+		}
+
+		if (use_identifier) {
+		       /*
+			* Now we have evsels with different sample_ids, use
+			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
+			* from a fixed position in each ring buffer record.
+			*
+			* As of this the changeset introducing this comment, this
+			* isn't strictly needed, as the fields that can come before
+			* PERF_SAMPLE_ID are all used, but we'll probably disable
+			* some of those for things like copying the payload of
+			* pointer syscall arguments, and for vfs_getname we don't
+			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
+			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
+			*/
+			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
+			perf_evlist__reset_sample_bit(evlist, ID);
+		}
 	}
 
 	signal(SIGCHLD, sig_handler);
@@ -3104,7 +3106,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
 		     "Set the maximum stack depth when parsing the callchain, "
 		     "anything beyond the specified depth will be ignored. "
-		     "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
+		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
 			"per thread proc mmap processing timeout in ms"),
 	OPT_END()
@@ -3148,7 +3150,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 		mmap_pages_user_set = false;
 
 	if (trace.max_stack == UINT_MAX) {
-		trace.max_stack = PERF_MAX_STACK_DEPTH;
+		trace.max_stack = sysctl_perf_event_max_stack;
 		max_stack_user_set = false;
 	}
 

+ 5 - 0
tools/perf/perf.c

@@ -17,6 +17,7 @@
 #include <subcmd/parse-options.h>
 #include "util/bpf-loader.h"
 #include "util/debug.h"
+#include <api/fs/fs.h>
 #include <api/fs/tracing_path.h>
 #include <pthread.h>
 #include <stdlib.h>
@@ -533,6 +534,7 @@ int main(int argc, const char **argv)
 {
 	const char *cmd;
 	char sbuf[STRERR_BUFSIZE];
+	int value;
 
 	/* libsubcmd init */
 	exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT);
@@ -542,6 +544,9 @@ int main(int argc, const char **argv)
 	page_size = sysconf(_SC_PAGE_SIZE);
 	cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
 
+	if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0)
+		sysctl_perf_event_max_stack = value;
+
 	cmd = extract_argv0_path(argv[0]);
 	if (!cmd)
 		cmd = "perf-help";

+ 1 - 1
tools/perf/tests/event_update.c

@@ -30,7 +30,7 @@ static int process_event_scale(struct perf_tool *tool __maybe_unused,
 
 	TEST_ASSERT_VAL("wrong id", ev->id == 123);
 	TEST_ASSERT_VAL("wrong id", ev->type == PERF_EVENT_UPDATE__SCALE);
-	TEST_ASSERT_VAL("wrong scale", ev_data->scale = 0.123);
+	TEST_ASSERT_VAL("wrong scale", ev_data->scale == 0.123);
 	return 0;
 }
 

+ 1 - 1
tools/perf/tests/hists_cumulate.c

@@ -101,7 +101,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
 		if (machine__resolve(machine, &al, &sample) < 0)
 			goto out;
 
-		if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH,
+		if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack,
 					 NULL) < 0) {
 			addr_location__put(&al);
 			goto out;

+ 1 - 1
tools/perf/tests/hists_filter.c

@@ -81,7 +81,7 @@ static int add_hist_entries(struct perf_evlist *evlist,
 
 			al.socket = fake_samples[i].socket;
 			if (hist_entry_iter__add(&iter, &al,
-						 PERF_MAX_STACK_DEPTH, NULL) < 0) {
+						 sysctl_perf_event_max_stack, NULL) < 0) {
 				addr_location__put(&al);
 				goto out;
 			}

+ 1 - 1
tools/perf/tests/hists_output.c

@@ -67,7 +67,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
 		if (machine__resolve(machine, &al, &sample) < 0)
 			goto out;
 
-		if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH,
+		if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack,
 					 NULL) < 0) {
 			addr_location__put(&al);
 			goto out;

+ 43 - 0
tools/perf/trace/beauty/perf_event_open.c

@@ -0,0 +1,43 @@
+#ifndef PERF_FLAG_FD_NO_GROUP
+# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
+#endif
+
+#ifndef PERF_FLAG_FD_OUTPUT
+# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
+#endif
+
+#ifndef PERF_FLAG_PID_CGROUP
+# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
+#endif
+
+#ifndef PERF_FLAG_FD_CLOEXEC
+# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
+#endif
+
+static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
+						struct syscall_arg *arg)
+{
+	int printed = 0, flags = arg->val;
+
+	if (flags == 0)
+		return 0;
+
+#define	P_FLAG(n) \
+	if (flags & PERF_FLAG_##n) { \
+		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+		flags &= ~PERF_FLAG_##n; \
+	}
+
+	P_FLAG(FD_NO_GROUP);
+	P_FLAG(FD_OUTPUT);
+	P_FLAG(PID_CGROUP);
+	P_FLAG(FD_CLOEXEC);
+#undef P_FLAG
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags

+ 4 - 1
tools/perf/trace/beauty/pid.c

@@ -3,9 +3,12 @@ static size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_a
 	int pid = arg->val;
 	struct trace *trace = arg->trace;
 	size_t printed = scnprintf(bf, size, "%d", pid);
-	struct thread *thread = machine__find_thread(trace->host, pid, pid);
+	struct thread *thread = machine__findnew_thread(trace->host, pid, pid);
 
 	if (thread != NULL) {
+		if (!thread->comm_set)
+			thread__set_comm_from_proc(thread);
+
 		if (thread->comm_set)
 			printed += scnprintf(bf + printed, size - printed,
 					     " (%s)", thread__comm_str(thread));

+ 3 - 3
tools/perf/util/build-id.c

@@ -261,14 +261,14 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
 
 		if (dso__is_vdso(pos)) {
 			name = pos->short_name;
-			name_len = pos->short_name_len + 1;
+			name_len = pos->short_name_len;
 		} else if (dso__is_kcore(pos)) {
 			machine__mmap_name(machine, nm, sizeof(nm));
 			name = nm;
-			name_len = strlen(nm) + 1;
+			name_len = strlen(nm);
 		} else {
 			name = pos->long_name;
-			name_len = pos->long_name_len + 1;
+			name_len = pos->long_name_len;
 		}
 
 		in_kernel = pos->kernel ||

+ 10 - 2
tools/perf/util/evlist.c

@@ -684,6 +684,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 	struct perf_mmap *md = &evlist->mmap[idx];
 	u64 head;
 	u64 old = md->prev;
+	int diff;
 	unsigned char *data = md->base + page_size;
 	union perf_event *event = NULL;
 
@@ -694,6 +695,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 		return NULL;
 
 	head = perf_mmap__read_head(md);
+	diff = head - old;
 	if (evlist->overwrite) {
 		/*
 		 * If we're further behind than half the buffer, there's a chance
@@ -703,7 +705,6 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 		 *
 		 * In either case, truncate and restart at head.
 		 */
-		int diff = head - old;
 		if (diff > md->mask / 2 || diff < 0) {
 			fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
 
@@ -711,15 +712,21 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 			 * head points to a known good entry, start there.
 			 */
 			old = head;
+			diff = 0;
 		}
 	}
 
-	if (old != head) {
+	if (diff >= (int)sizeof(event->header)) {
 		size_t size;
 
 		event = (union perf_event *)&data[old & md->mask];
 		size = event->header.size;
 
+		if (size < sizeof(event->header) || diff < (int)size) {
+			event = NULL;
+			goto broken_event;
+		}
+
 		/*
 		 * Event straddles the mmap boundary -- header should always
 		 * be inside due to u64 alignment of output.
@@ -743,6 +750,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 		old += size;
 	}
 
+broken_event:
 	md->prev = old;
 
 	return event;

+ 17 - 1
tools/perf/util/evsel.c

@@ -1231,6 +1231,21 @@ static void __p_sample_type(char *buf, size_t size, u64 value)
 	__p_bits(buf, size, value, bits);
 }
 
+static void __p_branch_sample_type(char *buf, size_t size, u64 value)
+{
+#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n }
+	struct bit_names bits[] = {
+		bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY),
+		bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL),
+		bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX),
+		bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP),
+		bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES),
+		{ .name = NULL, }
+	};
+#undef bit_name
+	__p_bits(buf, size, value, bits);
+}
+
 static void __p_read_format(char *buf, size_t size, u64 value)
 {
 #define bit_name(n) { PERF_FORMAT_##n, #n }
@@ -1249,6 +1264,7 @@ static void __p_read_format(char *buf, size_t size, u64 value)
 #define p_unsigned(val)		snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val))
 #define p_signed(val)		snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val))
 #define p_sample_type(val)	__p_sample_type(buf, BUF_SIZE, val)
+#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val)
 #define p_read_format(val)	__p_read_format(buf, BUF_SIZE, val)
 
 #define PRINT_ATTRn(_n, _f, _p)				\
@@ -1305,7 +1321,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 	PRINT_ATTRf(bp_type, p_unsigned);
 	PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex);
 	PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex);
-	PRINT_ATTRf(branch_sample_type, p_unsigned);
+	PRINT_ATTRf(branch_sample_type, p_branch_sample_type);
 	PRINT_ATTRf(sample_regs_user, p_hex);
 	PRINT_ATTRf(sample_stack_user, p_unsigned);
 	PRINT_ATTRf(clockid, p_signed);

+ 2 - 0
tools/perf/util/hist.c

@@ -2062,6 +2062,8 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists,
 	if (he) {
 		memset(&he->stat, 0, sizeof(he->stat));
 		he->hists = hists;
+		if (symbol_conf.cumulate_callchain)
+			memset(he->stat_acc, 0, sizeof(he->stat));
 		rb_link_node(&he->rb_node_in, parent, p);
 		rb_insert_color(&he->rb_node_in, root);
 		hists__inc_stats(hists, he);

+ 1 - 1
tools/perf/util/intel-pt-decoder/intel-pt-decoder.c

@@ -356,7 +356,7 @@ static const char *intel_pt_err_msgs[] = {
 
 int intel_pt__strerror(int code, char *buf, size_t buflen)
 {
-	if (code < 1 || code > INTEL_PT_ERR_MAX)
+	if (code < 1 || code >= INTEL_PT_ERR_MAX)
 		code = INTEL_PT_ERR_UNK;
 	strlcpy(buf, intel_pt_err_msgs[code], buflen);
 	return 0;

+ 3 - 3
tools/perf/util/machine.c

@@ -1764,7 +1764,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 		 */
 		int mix_chain_nr = i + 1 + lbr_nr + 1;
 
-		if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) {
+		if (mix_chain_nr > (int)sysctl_perf_event_max_stack + PERF_MAX_BRANCH_DEPTH) {
 			pr_warning("corrupted callchain. skipping...\n");
 			return 0;
 		}
@@ -1825,7 +1825,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 	 * Based on DWARF debug information, some architectures skip
 	 * a callchain entry saved by the kernel.
 	 */
-	if (chain->nr < PERF_MAX_STACK_DEPTH)
+	if (chain->nr < sysctl_perf_event_max_stack)
 		skip_idx = arch_skip_callchain_idx(thread, chain);
 
 	/*
@@ -1886,7 +1886,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 	}
 
 check_calls:
-	if (chain->nr > PERF_MAX_STACK_DEPTH && (int)chain->nr > max_stack) {
+	if (chain->nr > sysctl_perf_event_max_stack && (int)chain->nr > max_stack) {
 		pr_warning("corrupted callchain. skipping...\n");
 		return 0;
 	}

+ 92 - 22
tools/perf/util/probe-event.c

@@ -265,6 +265,65 @@ static bool kprobe_warn_out_range(const char *symbol, unsigned long address)
 	return true;
 }
 
+/*
+ * NOTE:
+ * '.gnu.linkonce.this_module' section of kernel module elf directly
+ * maps to 'struct module' from linux/module.h. This section contains
+ * actual module name which will be used by kernel after loading it.
+ * But, we cannot use 'struct module' here since linux/module.h is not
+ * exposed to user-space. The offset of 'name' has remained the same for
+ * a long time, so it is hardcoded here.
+ */
+#ifdef __LP64__
+#define MOD_NAME_OFFSET 24
+#else
+#define MOD_NAME_OFFSET 12
+#endif
+
+/*
+ * @module can be a module name or a module file path. In case of a path,
+ * inspect the ELF and find out the actual module name.
+ * Caller has to free mod_name after using it.
+ */
+static char *find_module_name(const char *module)
+{
+	int fd;
+	Elf *elf;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	Elf_Data *data;
+	Elf_Scn *sec;
+	char *mod_name = NULL;
+
+	fd = open(module, O_RDONLY);
+	if (fd < 0)
+		return NULL;
+
+	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
+	if (elf == NULL)
+		goto elf_err;
+
+	if (gelf_getehdr(elf, &ehdr) == NULL)
+		goto ret_err;
+
+	sec = elf_section_by_name(elf, &ehdr, &shdr,
+			".gnu.linkonce.this_module", NULL);
+	if (!sec)
+		goto ret_err;
+
+	data = elf_getdata(sec, NULL);
+	if (!data || !data->d_buf)
+		goto ret_err;
+
+	mod_name = strdup((char *)data->d_buf + MOD_NAME_OFFSET);
+
+ret_err:
+	elf_end(elf);
+elf_err:
+	close(fd);
+	return mod_name;
+}
+
 #ifdef HAVE_DWARF_SUPPORT
 
 static int kernel_get_module_dso(const char *module, struct dso **pdso)
@@ -486,8 +545,10 @@ static int get_text_start_address(const char *exec, unsigned long *address)
 		return -errno;
 
 	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
-	if (elf == NULL)
-		return -EINVAL;
+	if (elf == NULL) {
+		ret = -EINVAL;
+		goto out_close;
+	}
 
 	if (gelf_getehdr(elf, &ehdr) == NULL)
 		goto out;
@@ -499,6 +560,9 @@ static int get_text_start_address(const char *exec, unsigned long *address)
 	ret = 0;
 out:
 	elf_end(elf);
+out_close:
+	close(fd);
+
 	return ret;
 }
 
@@ -583,32 +647,23 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs,
 					    int ntevs, const char *module)
 {
 	int i, ret = 0;
-	char *tmp;
+	char *mod_name = NULL;
 
 	if (!module)
 		return 0;
 
-	tmp = strrchr(module, '/');
-	if (tmp) {
-		/* This is a module path -- get the module name */
-		module = strdup(tmp + 1);
-		if (!module)
-			return -ENOMEM;
-		tmp = strchr(module, '.');
-		if (tmp)
-			*tmp = '\0';
-		tmp = (char *)module;	/* For free() */
-	}
+	mod_name = find_module_name(module);
 
 	for (i = 0; i < ntevs; i++) {
-		tevs[i].point.module = strdup(module);
+		tevs[i].point.module =
+			strdup(mod_name ? mod_name : module);
 		if (!tevs[i].point.module) {
 			ret = -ENOMEM;
 			break;
 		}
 	}
 
-	free(tmp);
+	free(mod_name);
 	return ret;
 }
 
@@ -2516,6 +2571,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	struct probe_trace_point *tp;
 	int num_matched_functions;
 	int ret, i, j, skipped = 0;
+	char *mod_name;
 
 	map = get_target_map(pev->target, pev->uprobes);
 	if (!map) {
@@ -2600,9 +2656,19 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 		tp->realname = strdup_or_goto(sym->name, nomem_out);
 
 		tp->retprobe = pp->retprobe;
-		if (pev->target)
-			tev->point.module = strdup_or_goto(pev->target,
-							   nomem_out);
+		if (pev->target) {
+			if (pev->uprobes) {
+				tev->point.module = strdup_or_goto(pev->target,
+								   nomem_out);
+			} else {
+				mod_name = find_module_name(pev->target);
+				tev->point.module =
+					strdup(mod_name ? mod_name : pev->target);
+				free(mod_name);
+				if (!tev->point.module)
+					goto nomem_out;
+			}
+		}
 		tev->uprobes = pev->uprobes;
 		tev->nargs = pev->nargs;
 		if (tev->nargs) {
@@ -2743,9 +2809,13 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev,
 {
 	int ret;
 
-	if (pev->uprobes && !pev->group) {
-		/* Replace group name if not given */
-		ret = convert_exec_to_group(pev->target, &pev->group);
+	if (!pev->group) {
+		/* Set group name if not given */
+		if (!pev->uprobes) {
+			pev->group = strdup(PERFPROBE_GROUP);
+			ret = pev->group ? 0 : -ENOMEM;
+		} else
+			ret = convert_exec_to_group(pev->target, &pev->group);
 		if (ret != 0) {
 			pr_warning("Failed to make a group name.\n");
 			return ret;

+ 1 - 2
tools/perf/util/probe-file.c

@@ -220,8 +220,7 @@ int probe_file__add_event(int fd, struct probe_trace_event *tev)
 
 	pr_debug("Writing event: %s\n", buf);
 	if (!probe_event_dry_run) {
-		ret = write(fd, buf, strlen(buf));
-		if (ret <= 0) {
+		if (write(fd, buf, strlen(buf)) < (int)strlen(buf)) {
 			ret = -errno;
 			pr_warning("Failed to write event: %s\n",
 				   strerror_r(errno, sbuf, sizeof(sbuf)));

+ 1 - 1
tools/perf/util/scripting-engines/trace-event-perl.c

@@ -265,7 +265,7 @@ static SV *perl_process_callchain(struct perf_sample *sample,
 
 	if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
 				      sample, NULL, NULL,
-				      PERF_MAX_STACK_DEPTH) != 0) {
+				      sysctl_perf_event_max_stack) != 0) {
 		pr_err("Failed to resolve callchain. Skipping\n");
 		goto exit;
 	}

+ 20 - 1
tools/perf/util/thread.c

@@ -10,6 +10,8 @@
 #include "comm.h"
 #include "unwind.h"
 
+#include <api/fs/fs.h>
+
 int thread__init_map_groups(struct thread *thread, struct machine *machine)
 {
 	struct thread *leader;
@@ -153,6 +155,23 @@ int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp,
 	return 0;
 }
 
+int thread__set_comm_from_proc(struct thread *thread)
+{
+	char path[64];
+	char *comm = NULL;
+	size_t sz;
+	int err = -1;
+
+	if (!(snprintf(path, sizeof(path), "%d/task/%d/comm",
+		       thread->pid_, thread->tid) >= (int)sizeof(path)) &&
+	    procfs__read_str(path, &comm, &sz) == 0) {
+		comm[sz - 1] = '\0';
+		err = thread__set_comm(thread, comm, 0);
+	}
+
+	return err;
+}
+
 const char *thread__comm_str(const struct thread *thread)
 {
 	const struct comm *comm = thread__comm(thread);
@@ -233,7 +252,7 @@ void thread__find_cpumode_addr_location(struct thread *thread,
 					struct addr_location *al)
 {
 	size_t i;
-	const u8 const cpumodes[] = {
+	const u8 cpumodes[] = {
 		PERF_RECORD_MISC_USER,
 		PERF_RECORD_MISC_KERNEL,
 		PERF_RECORD_MISC_GUEST_USER,

+ 2 - 0
tools/perf/util/thread.h

@@ -71,6 +71,8 @@ static inline int thread__set_comm(struct thread *thread, const char *comm,
 	return __thread__set_comm(thread, comm, timestamp, false);
 }
 
+int thread__set_comm_from_proc(struct thread *thread);
+
 int thread__comm_len(struct thread *thread);
 struct comm *thread__comm(const struct thread *thread);
 struct comm *thread__exec_comm(const struct thread *thread);

+ 36 - 0
tools/perf/util/util.c

@@ -33,6 +33,8 @@ struct callchain_param	callchain_param = {
 unsigned int page_size;
 int cacheline_size;
 
+unsigned int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
+
 bool test_attr__enabled;
 
 bool perf_host  = true;
@@ -117,6 +119,40 @@ int rm_rf(char *path)
 	return rmdir(path);
 }
 
+/* A filter which removes dot files */
+bool lsdir_no_dot_filter(const char *name __maybe_unused, struct dirent *d)
+{
+	return d->d_name[0] != '.';
+}
+
+/* lsdir reads a directory and store it in strlist */
+struct strlist *lsdir(const char *name,
+		      bool (*filter)(const char *, struct dirent *))
+{
+	struct strlist *list = NULL;
+	DIR *dir;
+	struct dirent *d;
+
+	dir = opendir(name);
+	if (!dir)
+		return NULL;
+
+	list = strlist__new(NULL, NULL);
+	if (!list) {
+		errno = -ENOMEM;
+		goto out;
+	}
+
+	while ((d = readdir(dir)) != NULL) {
+		if (!filter || filter(name, d))
+			strlist__add(list, d->d_name);
+	}
+
+out:
+	closedir(dir);
+	return list;
+}
+
 static int slow_copyfile(const char *from, const char *to)
 {
 	int err = -1;

+ 4 - 0
tools/perf/util/util.h

@@ -79,6 +79,7 @@
 #include <termios.h>
 #include <linux/bitops.h>
 #include <termios.h>
+#include "strlist.h"
 
 extern const char *graph_line;
 extern const char *graph_dotted_line;
@@ -222,6 +223,8 @@ static inline int sane_case(int x, int high)
 
 int mkdir_p(char *path, mode_t mode);
 int rm_rf(char *path);
+struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *));
+bool lsdir_no_dot_filter(const char *name, struct dirent *d);
 int copyfile(const char *from, const char *to);
 int copyfile_mode(const char *from, const char *to, mode_t mode);
 int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size);
@@ -264,6 +267,7 @@ void sighandler_dump_stack(int sig);
 
 extern unsigned int page_size;
 extern int cacheline_size;
+extern unsigned int sysctl_perf_event_max_stack;
 
 struct parse_tag {
 	char tag;