Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf updates from Ingo Molnar:
 "Kernel side changes:

   - Intel Knights Landing support.  (Harish Chegondi)

   - Intel Broadwell-EP uncore PMU support.  (Kan Liang)

   - Core code improvements.  (Peter Zijlstra)

   - Event filter, LBR and PEBS fixes.  (Stephane Eranian)

   - Enable cycles:pp on Intel Atom.  (Stephane Eranian)

   - Add cycles:ppp support for Skylake.  (Andi Kleen)

   - Various x86 NMI overhead optimizations.  (Andi Kleen)

   - Intel PT enhancements.  (Takao Indoh)

   - AMD cache events fix.  (Vince Weaver)

  Tons of tooling changes:

   - Show random perf tool tips in the 'perf report' bottom line
     (Namhyung Kim)

   - 'perf report' now defaults to --group if the perf.data file has
     grouped events; try it with:

      # perf record -e '{cycles,instructions}' -a sleep 1
      [ perf record: Woken up 1 times to write data ]
      [ perf record: Captured and wrote 1.093 MB perf.data (1247 samples) ]
      # perf report
      # Samples: 1K of event 'anon group { cycles, instructions }'
      # Event count (approx.): 1955219195
      #
      #       Overhead  Command     Shared Object      Symbol

         2.86%   0.22%  swapper     [kernel.kallsyms]  [k] intel_idle
         1.05%   0.33%  firefox     libxul.so          [.] js::SetObjectElement
         1.05%   0.00%  kworker/0:3 [kernel.kallsyms]  [k] gen6_ring_get_seqno
         0.88%   0.17%  chrome      chrome             [.] 0x0000000000ee27ab
         0.65%   0.86%  firefox     libxul.so          [.] js::ValueToId<(js::AllowGC)1>
         0.64%   0.23%  JS Helper   libxul.so          [.] js::SplayTree<js::jit::LiveRange*, js::jit::LiveRange>::splay
         0.62%   1.27%  firefox     libxul.so          [.] js::GetIterator
         0.61%   1.74%  firefox     libxul.so          [.] js::NativeSetProperty
         0.61%   0.31%  firefox     libxul.so          [.] js::SetPropertyByDefining

   - Introduce the 'perf stat record/report' workflow:

     Generate perf.data files from 'perf stat', to tap into the
     scripting capabilities perf already has, instead of adding 'perf
     stat'-specific scripting support to calculate event ratios, etc.

     Simple example:

        $ perf stat record -e cycles usleep 1

         Performance counter stats for 'usleep 1':

               1,134,996      cycles

             0.000670644 seconds time elapsed

        $ perf stat report

         Performance counter stats for '/home/acme/bin/perf stat record -e cycles usleep 1':

               1,134,996      cycles

             0.000670644 seconds time elapsed

        $

     It generates PERF_RECORD_ userspace records to store the details:

        $ perf report -D | grep PERF_RECORD
        0xf0 [0x28]: PERF_RECORD_THREAD_MAP nr: 1 thread: 27637
        0x118 [0x12]: PERF_RECORD_CPU_MAP nr: 1 cpu: 65535
        0x12a [0x40]: PERF_RECORD_STAT_CONFIG
        0x16a [0x30]: PERF_RECORD_STAT
        -1 -1 0x19a [0x40]: PERF_RECORD_MMAP -1/0: [0xffffffff81000000(0x1f000000) @ 0xffffffff81000000]: x [kernel.kallsyms]_text
        0x1da [0x18]: PERF_RECORD_STAT_ROUND
        [acme@ssdandy linux]$

     An effort was made to ensure that perf.data files generated this
     way do not cause cryptic messages when processed by older tools.

     The 'perf script' bits need rebasing, will go up later.

   - Make command line options always available, even when they depend
     on some feature being enabled, warning the user about use of such
     options (Wang Nan)

   - Support hw breakpoint events (mem:0xAddress) in the default output
     mode in 'perf script' (Wang Nan)
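
     For instance (a sketch: the address and access type are made up
     for illustration, 'mem:<addr>[:access]' being the pre-existing
     breakpoint event syntax):

       # perf record -e mem:0x601038:w -a sleep 5
       # perf script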

   - Fixes and improvements for annotating ARM binaries: support ARM
     call and jump instructions; more work is needed to separate the
     arch specific bits into tools/perf/arch/*/annotate/
     (Russell King)

   - Add an initial 'perf config' command, for now just with a --list
     option to show the contents of the configuration file in use, and
     a basic man page describing its format; commands for doing edits
     and detailed documentation are being reviewed and proof-read.
     (Taeung Song)
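
     E.g., dumping the current settings (output omitted here, as it
     depends on the configuration file in use):

       $ perf config --list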

   - Allow BPF scriptlets to specify arguments to be fetched using DWARF
     info, using a prologue generated at compile/build time (He Kuang,
     Wang Nan)

   - Allow attaching BPF scriptlets to module symbols (Wang Nan)

   - Allow attaching BPF scriptlets to userspace code using uprobe (Wang
     Nan)

   - BPF programs can now specify 'perf probe' tunables via their
     section names, separating key=val values using semicolons (Wang
     Nan)

     Testing some of these new BPF features:

        Use case: get callchains when receiving SSL packets, filter them in the
                  kernel, at an arbitrary place.

        # cat ssl.bpf.c
        #define SEC(NAME) __attribute__((section(NAME), used))

        struct pt_regs;

        SEC("func=__inet_lookup_established hnum")
        int func(struct pt_regs *ctx, int err, unsigned short port)
        {
                return err == 0 && port == 443;
        }

        char _license[] SEC("license") = "GPL";
        int  _version   SEC("version") = LINUX_VERSION_CODE;
        #
        # perf record -a -g -e ssl.bpf.c
        ^C[ perf record: Woken up 1 times to write data ]
        [ perf record: Captured and wrote 0.787 MB perf.data (3 samples) ]
        # perf script | head -30
        swapper     0 [000] 58783.268118: perf_bpf_probe:func: (ffffffff816a0f60) hnum=0x1bb
           8a0f61 __inet_lookup_established (/lib/modules/4.3.0+/build/vmlinux)
           896def ip_rcv_finish (/lib/modules/4.3.0+/build/vmlinux)
           8976c2 ip_rcv (/lib/modules/4.3.0+/build/vmlinux)
           855eba __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux)
           8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux)
           8572a8 process_backlog (/lib/modules/4.3.0+/build/vmlinux)
           856b11 net_rx_action (/lib/modules/4.3.0+/build/vmlinux)
           2a284b __do_softirq (/lib/modules/4.3.0+/build/vmlinux)
           2a2ba3 irq_exit (/lib/modules/4.3.0+/build/vmlinux)
           96b7a4 do_IRQ (/lib/modules/4.3.0+/build/vmlinux)
           969807 ret_from_intr (/lib/modules/4.3.0+/build/vmlinux)
           2dede5 cpu_startup_entry (/lib/modules/4.3.0+/build/vmlinux)
           95d5bc rest_init (/lib/modules/4.3.0+/build/vmlinux)
          1163ffa start_kernel ([kernel.vmlinux].init.text)
          11634d7 x86_64_start_reservations ([kernel.vmlinux].init.text)
          1163623 x86_64_start_kernel ([kernel.vmlinux].init.text)

        qemu-system-x86  9178 [003] 58785.792417: perf_bpf_probe:func: (ffffffff816a0f60) hnum=0x1bb
           8a0f61 __inet_lookup_established (/lib/modules/4.3.0+/build/vmlinux)
           896def ip_rcv_finish (/lib/modules/4.3.0+/build/vmlinux)
           8976c2 ip_rcv (/lib/modules/4.3.0+/build/vmlinux)
           855eba __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux)
           8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux)
           856660 netif_receive_skb_internal (/lib/modules/4.3.0+/build/vmlinux)
           8566ec netif_receive_skb_sk (/lib/modules/4.3.0+/build/vmlinux)
             430a br_handle_frame_finish ([bridge])
             48bc br_handle_frame ([bridge])
           855f44 __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux)
           8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux)
        #
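
        The section name can also carry the semicolon separated key=val
        tunables mentioned above; e.g., a hypothetical probe pinned to a
        module symbol (the key names here are assumed for illustration,
        see the 'perf probe' documentation for the exact spelling):

        SEC("func=br_handle_frame;module=bridge")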

   - Use 'perf probe's various options to list functions and to see
     what variables can be collected at any given point; experiment
     first collecting without a filter, then with one; use it together
     with 'perf trace' and 'perf top', with or without callchains; if
     it explodes, please tell us!

   - Introduce a new callchain mode: "folded", that lists one-line
     representations of all callchains for a given histogram entry,
     facilitating 'perf report' output processing by other tools, such
     as Brendan Gregg's flamegraph tools (Namhyung Kim)

     E.g:

        # perf report | grep -v ^# | head
           18.37%     0.00%  swapper  [kernel.kallsyms]   [k] cpu_startup_entry
                           |
                           ---cpu_startup_entry
                              |
                              |--12.07%--start_secondary
                              |
                               --6.30%--rest_init
                                         start_kernel
                                         x86_64_start_reservations
                                         x86_64_start_kernel
         #

     Becomes, in "folded" mode:

        # perf report -g folded | grep -v ^# | head -5
            18.37%     0.00%  swapper [kernel.kallsyms]   [k] cpu_startup_entry
          12.07% cpu_startup_entry;start_secondary
           6.30% cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel
            16.90%     0.00%  swapper [kernel.kallsyms]   [k] call_cpuidle
          11.23% call_cpuidle;cpu_startup_entry;start_secondary
           5.67% call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel
            16.90%     0.00%  swapper [kernel.kallsyms]   [k] cpuidle_enter
          11.23% cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary
           5.67% cpuidle_enter;call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel
            15.12%     0.00%  swapper [kernel.kallsyms]   [k] cpuidle_enter_state
         #

     The user can also select one of "count", "period" or "percent" as
     the first column.
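
      For example, assuming the value selector is passed as just
      another comma separated field of the -g option:

         # perf report -g folded,count | grep -v ^# | head -3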

  ... and lots of infrastructure enhancements, plus fixes and other
  changes, features I failed to list - see the shortlog and the git log
  for details"

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (271 commits)
  perf evlist: Add --trace-fields option to show trace fields
  perf record: Store data mmaps for dwarf unwind
  perf libdw: Check for mmaps also in MAP__VARIABLE tree
  perf unwind: Check for mmaps also in MAP__VARIABLE tree
  perf unwind: Use find_map function in access_dso_mem
  perf evlist: Remove perf_evlist__(enable|disable)_event functions
  perf evlist: Make perf_evlist__open() open evsels with their cpus and threads (like perf record does)
  perf report: Show random usage tip on the help line
  perf hists: Export a couple of hist functions
  perf diff: Use perf_hpp__register_sort_field interface
  perf tools: Add overhead/overhead_children keys defaults via string
  perf tools: Remove list entry from struct sort_entry
  perf tools: Include all tools/lib directory for tags/cscope/TAGS targets
  perf script: Align event name properly
  perf tools: Add missing headers in perf's MANIFEST
  perf tools: Do not show trace command if it's not compiled in
  perf report: Change default to use event group view
  perf top: Decay periods in callchains
  tools lib: Move bitmap.[ch] from tools/perf/ to tools/{lib,include}/
  tools lib: Sync tools/lib/find_bit.c with the kernel
  ...
Linus Torvalds, 9 years ago
commit 5cb52b5e16
100 files changed, 3155 insertions(+), 870 deletions(-)
  1. Documentation/trace/events-msr.txt (+37, -0)
  2. Documentation/trace/postprocess/decode_msr.py (+37, -0)
  3. arch/x86/include/asm/atomic.h (+0, -1)
  4. arch/x86/include/asm/atomic64_32.h (+0, -1)
  5. arch/x86/include/asm/intel_pt.h (+10, -0)
  6. arch/x86/include/asm/msr-trace.h (+57, -0)
  7. arch/x86/include/asm/msr.h (+31, -0)
  8. arch/x86/include/asm/uaccess.h (+9, -0)
  9. arch/x86/kernel/cpu/perf_event.c (+32, -4)
  10. arch/x86/kernel/cpu/perf_event.h (+9, -12)
  11. arch/x86/kernel/cpu/perf_event_amd.c (+1, -1)
  12. arch/x86/kernel/cpu/perf_event_intel.c (+110, -5)
  13. arch/x86/kernel/cpu/perf_event_intel_ds.c (+31, -8)
  14. arch/x86/kernel/cpu/perf_event_intel_lbr.c (+37, -5)
  15. arch/x86/kernel/cpu/perf_event_intel_pt.c (+9, -0)
  16. arch/x86/kernel/cpu/perf_event_intel_rapl.c (+6, -19)
  17. arch/x86/kernel/cpu/perf_event_intel_uncore.c (+17, -0)
  18. arch/x86/kernel/cpu/perf_event_intel_uncore.h (+3, -0)
  19. arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c (+1, -1)
  20. arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c (+629, -6)
  21. arch/x86/kernel/crash.c (+11, -0)
  22. arch/x86/lib/msr.c (+26, -0)
  23. include/linux/tracepoint-defs.h (+27, -0)
  24. include/linux/tracepoint.h (+1, -15)
  25. include/uapi/linux/perf_event.h (+6, -0)
  26. kernel/events/core.c (+109, -177)
  27. lib/atomic64_test.c (+4, -0)
  28. tools/Makefile (+1, -1)
  29. tools/build/Makefile (+1, -1)
  30. tools/build/Makefile.feature (+28, -16)
  31. tools/build/Makefile.include (+1, -1)
  32. tools/build/feature/Makefile (+47, -46)
  33. tools/include/linux/bitmap.h (+2, -0)
  34. tools/include/linux/string.h (+15, -0)
  35. tools/lib/bitmap.c (+0, -0)
  36. tools/lib/bpf/Makefile (+14, -0)
  37. tools/lib/bpf/bpf.c (+14, -0)
  38. tools/lib/bpf/bpf.h (+2, -0)
  39. tools/lib/bpf/libbpf.c (+339, -73)
  40. tools/lib/bpf/libbpf.h (+88, -0)
  41. tools/lib/find_bit.c (+84, -0)
  42. tools/lib/string.c (+89, -0)
  43. tools/lib/subcmd/Build (+7, -0)
  44. tools/lib/subcmd/Makefile (+48, -0)
  45. tools/lib/subcmd/exec-cmd.c (+209, -0)
  46. tools/lib/subcmd/exec-cmd.h (+16, -0)
  47. tools/lib/subcmd/help.c (+54, -125)
  48. tools/lib/subcmd/help.h (+9, -4)
  49. tools/lib/subcmd/pager.c (+14, -9)
  50. tools/lib/subcmd/pager.h (+9, -0)
  51. tools/lib/subcmd/parse-options.c (+183, -67)
  52. tools/lib/subcmd/parse-options.h (+13, -13)
  53. tools/lib/subcmd/run-command.c (+16, -8)
  54. tools/lib/subcmd/run-command.h (+7, -5)
  55. tools/lib/subcmd/sigchain.c (+2, -1)
  56. tools/lib/subcmd/sigchain.h (+3, -3)
  57. tools/lib/subcmd/subcmd-config.c (+11, -0)
  58. tools/lib/subcmd/subcmd-config.h (+14, -0)
  59. tools/lib/subcmd/subcmd-util.h (+91, -0)
  60. tools/lib/traceevent/event-parse.c (+70, -64)
  61. tools/lib/traceevent/event-parse.h (+4, -0)
  62. tools/lib/util/find_next_bit.c (+0, -89)
  63. tools/perf/Build (+7, -1)
  64. tools/perf/Documentation/perf-config.txt (+103, -0)
  65. tools/perf/Documentation/perf-evlist.txt (+3, -0)
  66. tools/perf/Documentation/perf-record.txt (+21, -3)
  67. tools/perf/Documentation/perf-report.txt (+37, -4)
  68. tools/perf/Documentation/perf-stat.txt (+34, -0)
  69. tools/perf/Documentation/perf-top.txt (+3, -0)
  70. tools/perf/Documentation/tips.txt (+14, -0)
  71. tools/perf/MANIFEST (+6, -1)
  72. tools/perf/Makefile.perf (+30, -14)
  73. tools/perf/arch/x86/include/arch-tests.h (+4, -4)
  74. tools/perf/arch/x86/tests/insn-x86.c (+1, -1)
  75. tools/perf/arch/x86/tests/intel-cqm.c (+2, -2)
  76. tools/perf/arch/x86/tests/perf-time-to-tsc.c (+1, -2)
  77. tools/perf/arch/x86/tests/rdpmc.c (+1, -1)
  78. tools/perf/arch/x86/util/Build (+1, -0)
  79. tools/perf/arch/x86/util/intel-bts.c (+2, -2)
  80. tools/perf/arch/x86/util/intel-pt.c (+3, -3)
  81. tools/perf/bench/futex-hash.c (+1, -1)
  82. tools/perf/bench/futex-lock-pi.c (+1, -1)
  83. tools/perf/bench/futex-requeue.c (+1, -1)
  84. tools/perf/bench/futex-wake-parallel.c (+1, -1)
  85. tools/perf/bench/futex-wake.c (+1, -1)
  86. tools/perf/bench/mem-functions.c (+1, -1)
  87. tools/perf/bench/numa.c (+1, -1)
  88. tools/perf/bench/sched-messaging.c (+1, -1)
  89. tools/perf/bench/sched-pipe.c (+1, -1)
  90. tools/perf/builtin-annotate.c (+23, -21)
  91. tools/perf/builtin-bench.c (+1, -1)
  92. tools/perf/builtin-buildid-cache.c (+1, -1)
  93. tools/perf/builtin-buildid-list.c (+1, -1)
  94. tools/perf/builtin-config.c (+66, -0)
  95. tools/perf/builtin-data.c (+1, -1)
  96. tools/perf/builtin-diff.c (+7, -8)
  97. tools/perf/builtin-evlist.c (+11, -2)
  98. tools/perf/builtin-help.c (+5, -5)
  99. tools/perf/builtin-inject.c (+1, -1)
  100. tools/perf/builtin-kmem.c (+1, -1)

+ 37 - 0
Documentation/trace/events-msr.txt

@@ -0,0 +1,37 @@
+
+The x86 kernel supports tracing most MSR (Model Specific Register) accesses.
+To see the definition of the MSRs on Intel systems please see the SDM
+at http://www.intel.com/sdm (Volume 3)
+
+Available trace points:
+
+/sys/kernel/debug/tracing/events/msr/
+
+Trace MSR reads
+
+read_msr
+
+msr: MSR number
+val: Value read
+failed: 1 if the access failed, otherwise 0
+
+
+Trace MSR writes
+
+write_msr
+
+msr: MSR number
+val: Value written
+failed: 1 if the access failed, otherwise 0
+
+
+Trace RDPMC in kernel
+
+rdpmc
+
+The trace data can be post processed with the postprocess/decode_msr.py script
+
+cat /sys/kernel/debug/tracing/trace | decode_msr.py /usr/src/linux/include/asm/msr-index.h
+
+to add symbolic MSR names.
+
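
A quick way to exercise these tracepoints is the standard ftrace enable
knob (a sketch, not part of the patch itself):

   # echo 1 > /sys/kernel/debug/tracing/events/msr/enable
   # cat /sys/kernel/debug/tracing/trace | decode_msr.py msr-index.h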

+ 37 - 0
Documentation/trace/postprocess/decode_msr.py

@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# add symbolic names to read_msr / write_msr in trace
+# decode_msr msr-index.h < trace
+import sys
+import re
+
+msrs = dict()
+
+with open(sys.argv[1] if len(sys.argv) > 1 else "msr-index.h", "r") as f:
+	for j in f:
+		m = re.match(r'#define (MSR_\w+)\s+(0x[0-9a-fA-F]+)', j)
+		if m:
+			msrs[int(m.group(2), 16)] = m.group(1)
+
+extra_ranges = (
+	( "MSR_LASTBRANCH_%d_FROM_IP", 0x680, 0x69F ),
+	( "MSR_LASTBRANCH_%d_TO_IP", 0x6C0, 0x6DF ),
+	( "LBR_INFO_%d", 0xdc0, 0xddf ),
+)
+
+for j in sys.stdin:
+	m = re.search(r'(read|write)_msr:\s+([0-9a-f]+)', j)
+	if m:
+		r = None
+		num = int(m.group(2), 16)
+		if num in msrs:
+			r = msrs[num]
+		else:
+			for er in extra_ranges:
+				if er[1] <= num <= er[2]:
+					r = er[0] % (num - er[1],)
+					break
+		if r:
+			j = j.replace(" " + m.group(2), " " + r + "(" + m.group(2) + ")")
+	print j,
+
+

+ 0 - 1
arch/x86/include/asm/atomic.h

@@ -3,7 +3,6 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
-#include <asm/processor.h>
 #include <asm/alternative.h>
 #include <asm/cmpxchg.h>
 #include <asm/rmwcc.h>

+ 0 - 1
arch/x86/include/asm/atomic64_32.h

@@ -3,7 +3,6 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
-#include <asm/processor.h>
 //#include <asm/cmpxchg.h>
 
 /* An 64bit atomic type */

+ 10 - 0
arch/x86/include/asm/intel_pt.h

@@ -0,0 +1,10 @@
+#ifndef _ASM_X86_INTEL_PT_H
+#define _ASM_X86_INTEL_PT_H
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+void cpu_emergency_stop_pt(void);
+#else
+static inline void cpu_emergency_stop_pt(void) {}
+#endif
+
+#endif /* _ASM_X86_INTEL_PT_H */

+ 57 - 0
arch/x86/include/asm/msr-trace.h

@@ -0,0 +1,57 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM msr
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE msr-trace
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/
+
+#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MSR_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracing for x86 model specific registers. Directly maps to the
+ * RDMSR/WRMSR instructions.
+ */
+
+DECLARE_EVENT_CLASS(msr_trace_class,
+	    TP_PROTO(unsigned msr, u64 val, int failed),
+	    TP_ARGS(msr, val, failed),
+	    TP_STRUCT__entry(
+		    __field(	unsigned,	msr )
+		    __field(    u64,		val )
+		    __field(    int,		failed )
+	    ),
+	    TP_fast_assign(
+		    __entry->msr = msr;
+		    __entry->val = val;
+		    __entry->failed = failed;
+	    ),
+	    TP_printk("%x, value %llx%s",
+		      __entry->msr,
+		      __entry->val,
+		      __entry->failed ? " #GP" : "")
+);
+
+DEFINE_EVENT(msr_trace_class, read_msr,
+	     TP_PROTO(unsigned msr, u64 val, int failed),
+	     TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, write_msr,
+	     TP_PROTO(unsigned msr, u64 val, int failed),
+	     TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, rdpmc,
+	     TP_PROTO(unsigned msr, u64 val, int failed),
+	     TP_ARGS(msr, val, failed)
+);
+
+#endif /* _TRACE_MSR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

+ 31 - 0
arch/x86/include/asm/msr.h

@@ -57,11 +57,34 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
 #define EAX_EDX_RET(val, low, high)	"=A" (val)
 #endif
 
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Be very careful with includes. This header is prone to include loops.
+ */
+#include <asm/atomic.h>
+#include <linux/tracepoint-defs.h>
+
+extern struct tracepoint __tracepoint_read_msr;
+extern struct tracepoint __tracepoint_write_msr;
+extern struct tracepoint __tracepoint_rdpmc;
+#define msr_tracepoint_active(t) static_key_false(&(t).key)
+extern void do_trace_write_msr(unsigned msr, u64 val, int failed);
+extern void do_trace_read_msr(unsigned msr, u64 val, int failed);
+extern void do_trace_rdpmc(unsigned msr, u64 val, int failed);
+#else
+#define msr_tracepoint_active(t) false
+static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {}
+#endif
+
 static inline unsigned long long native_read_msr(unsigned int msr)
 {
 	DECLARE_ARGS(val, low, high);
 
 	asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+	if (msr_tracepoint_active(__tracepoint_read_msr))
+		do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0);
 	return EAX_EDX_VAL(val, low, high);
 }
 
@@ -78,6 +101,8 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
 		     _ASM_EXTABLE(2b, 3b)
 		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
 		     : "c" (msr), [fault] "i" (-EIO));
+	if (msr_tracepoint_active(__tracepoint_read_msr))
+		do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
 	return EAX_EDX_VAL(val, low, high);
 }
 
@@ -85,6 +110,8 @@ static inline void native_write_msr(unsigned int msr,
 				    unsigned low, unsigned high)
 {
 	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+	if (msr_tracepoint_active(__tracepoint_write_msr))
+		do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
 }
 
 /* Can be uninlined because referenced by paravirt */
@@ -102,6 +129,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
 		     : "c" (msr), "0" (low), "d" (high),
 		       [fault] "i" (-EIO)
 		     : "memory");
+	if (msr_tracepoint_active(__tracepoint_write_msr))
+		do_trace_write_msr(msr, ((u64)high << 32 | low), err);
 	return err;
 }
 
@@ -160,6 +189,8 @@ static inline unsigned long long native_read_pmc(int counter)
 	DECLARE_ARGS(val, low, high);
 
 	asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+	if (msr_tracepoint_active(__tracepoint_rdpmc))
+		do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
 	return EAX_EDX_VAL(val, low, high);
 }
 

+ 9 - 0
arch/x86/include/asm/uaccess.h

@@ -745,5 +745,14 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
 #undef __copy_from_user_overflow
 #undef __copy_to_user_overflow
 
+/*
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
+ *
+ * Caller must use pagefault_enable/disable, or run in interrupt context,
+ * and also do an access_ok() check
+ */
+#define __copy_from_user_nmi __copy_from_user_inatomic
+
 #endif /* _ASM_X86_UACCESS_H */
 

+ 32 - 4
arch/x86/kernel/cpu/perf_event.c

@@ -482,6 +482,9 @@ int x86_pmu_hw_config(struct perf_event *event)
 			/* Support for IP fixup */
 			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
 				precise++;
+
+			if (x86_pmu.pebs_prec_dist)
+				precise++;
 		}
 
 		if (event->attr.precise_ip > precise)
@@ -1531,6 +1534,7 @@ static void __init filter_events(struct attribute **attrs)
 {
 	struct device_attribute *d;
 	struct perf_pmu_events_attr *pmu_attr;
+	int offset = 0;
 	int i, j;
 
 	for (i = 0; attrs[i]; i++) {
@@ -1539,7 +1543,7 @@ static void __init filter_events(struct attribute **attrs)
 		/* str trumps id */
 		if (pmu_attr->event_str)
 			continue;
-		if (x86_pmu.event_map(i))
+		if (x86_pmu.event_map(i + offset))
 			continue;
 
 		for (j = i; attrs[j]; j++)
@@ -1547,6 +1551,14 @@
			attrs[j] = attrs[j + 1];

		/* Check the shifted attr. */
		i--;
+
+		/*
+		 * event_map() is index based, the attrs array is organized
+		 * by increasing event index. If we shift the events, then
+		 * we need to compensate for the event_map(), otherwise
+		 * we are looking up the wrong event in the map
+		 */
+		offset++;
 	}
 }
 
@@ -2250,12 +2262,19 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	ss_base = get_segment_base(regs->ss);
 
 	fp = compat_ptr(ss_base + regs->bp);
+	pagefault_disable();
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
 
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+		if (!access_ok(VERIFY_READ, fp, 8))
+			break;
+
+		bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+		if (bytes != 0)
+			break;
+		bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
 		if (bytes != 0)
 			break;
 
@@ -2265,6 +2284,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		perf_callchain_store(entry, cs_base + frame.return_address);
 		fp = compat_ptr(ss_base + frame.next_frame);
 	}
+	pagefault_enable();
 	return 1;
 }
 #else
@@ -2302,12 +2322,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	if (perf_callchain_user32(regs, entry))
 		return;
 
+	pagefault_disable();
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
 
-		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+		if (!access_ok(VERIFY_READ, fp, 16))
+			break;
+
+		bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8);
+		if (bytes != 0)
+			break;
+		bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8);
 		if (bytes != 0)
 			break;
 
@@ -2315,8 +2342,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 			break;
 
 		perf_callchain_store(entry, frame.return_address);
-		fp = frame.next_frame;
+		fp = (void __user *)frame.next_frame;
 	}
+	pagefault_enable();
 }
 
 /*

+ 9 - 12
arch/x86/kernel/cpu/perf_event.h

@@ -14,17 +14,7 @@
 
 #include <linux/perf_event.h>
 
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) 						\
-do {									\
-	unsigned int _msr = (msr);					\
-	u64 _val = (val);						\
-	trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr),		\
-			(unsigned long long)(_val));			\
-	native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32));	\
-} while (0)
-#endif
+/* To enable MSR tracing please use the generic trace points. */
 
 /*
  *          |   NHM/WSM    |      SNB     |
@@ -318,6 +308,10 @@ struct cpu_hw_events {
 #define INTEL_UEVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
 
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n)	\
+	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
 /* Like UEVENT_CONSTRAINT, but match flags too */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
@@ -589,7 +583,8 @@ struct x86_pmu {
 			bts_active	:1,
 			pebs		:1,
 			pebs_active	:1,
-			pebs_broken	:1;
+			pebs_broken	:1,
+			pebs_prec_dist	:1;
 	int		pebs_record_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
 	struct event_constraint *pebs_constraints;
@@ -907,6 +902,8 @@ void intel_pmu_lbr_init_hsw(void);
 
 void intel_pmu_lbr_init_skl(void);
 
+void intel_pmu_lbr_init_knl(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);

+ 1 - 1
arch/x86/kernel/cpu/perf_event_amd.c

@@ -18,7 +18,7 @@ static __initconst const u64 amd_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+		[ C(RESULT_ACCESS) ] = 0,
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_PREFETCH) ] = {

+ 110 - 5
arch/x86/kernel/cpu/perf_event_intel.c

@@ -185,6 +185,14 @@ struct event_constraint intel_skl_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+	INTEL_UEVENT_EXTRA_REG(0x01b7,
+			       MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x02b7,
+			       MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+	EVENT_EXTRA_END
+};
+
 static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -255,7 +263,7 @@ struct event_constraint intel_bdw_event_constraints[] = {
 	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
 	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
 	INTEL_UEVENT_CONSTRAINT(0x148, 0x4),	/* L1D_PEND_MISS.PENDING */
-	INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4),	/* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+	INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4),	/* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
 	EVENT_CONSTRAINT_END
 };
 
@@ -1457,6 +1465,42 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
+#define KNL_OT_L2_HITE		BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF		BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL	BIT_ULL(21)
+#define KNL_MCDRAM_FAR		BIT_ULL(22)
+#define KNL_DDR_LOCAL		BIT_ULL(23)
+#define KNL_DDR_FAR		BIT_ULL(24)
+#define KNL_DRAM_ANY		(KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+				    KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ		SLM_DMND_READ
+#define KNL_L2_WRITE		SLM_DMND_WRITE
+#define KNL_L2_PREFETCH		SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS		SLM_LLC_ACCESS
+#define KNL_L2_MISS		(KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+				   KNL_DRAM_ANY | SNB_SNP_ANY | \
+						  SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+	[C(LL)] = {
+		[C(OP_READ)] = {
+			[C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+			[C(RESULT_MISS)]   = 0,
+		},
+		[C(OP_WRITE)] = {
+			[C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+			[C(RESULT_MISS)]   = KNL_L2_WRITE | KNL_L2_MISS,
+		},
+		[C(OP_PREFETCH)] = {
+			[C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+			[C(RESULT_MISS)]   = KNL_L2_PREFETCH | KNL_L2_MISS,
+		},
+	},
+};
+
 /*
  * Use from PMIs where the LBRs are already disabled.
  */
@@ -2475,6 +2519,44 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
 	}
 }
 
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+	if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+		/*
+		 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+		 * (0x003c) so that we can use it with PEBS.
+		 *
+		 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+		 * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+		 * (0x01c0), which is a PEBS capable event, to get the same
+		 * count.
+		 *
+		 * The PREC_DIST event has special support to minimize sample
+		 * shadowing effects. One drawback is that it can be
+		 * only programmed on counter 1, but that seems like an
+		 * acceptable trade off.
+		 */
+		u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+		alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+		event->hw.config = alt_config;
+	}
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+	if (event->attr.precise_ip < 3)
+		return intel_pebs_aliases_snb(event);
+	return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+	if (event->attr.precise_ip < 3)
+		return intel_pebs_aliases_core2(event);
+	return intel_pebs_aliases_precdist(event);
+}
+
 static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
 {
 	unsigned long flags = x86_pmu.free_running_flags;
@@ -3332,6 +3414,7 @@ __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_gen_event_constraints;
 		x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
 		pr_cont("Atom events, ");
 		break;
 
@@ -3431,7 +3514,8 @@ __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_ivb_event_constraints;
 		x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
-		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+		x86_pmu.pebs_prec_dist = true;
 		if (boot_cpu_data.x86_model == 62)
 			x86_pmu.extra_regs = intel_snbep_extra_regs;
 		else
@@ -3464,7 +3548,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_hsw_event_constraints;
 		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_snbep_extra_regs;
-		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+		x86_pmu.pebs_prec_dist = true;
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3499,7 +3584,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_bdw_event_constraints;
 		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_snbep_extra_regs;
-		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+		x86_pmu.pebs_prec_dist = true;
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3511,6 +3597,24 @@ __init int intel_pmu_init(void)
 		pr_cont("Broadwell events, ");
 		break;
 
+	case 87: /* Knights Landing Xeon Phi */
+		memcpy(hw_cache_event_ids,
+		       slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs,
+		       knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		intel_pmu_lbr_init_knl();
+
+		x86_pmu.event_constraints = intel_slm_event_constraints;
+		x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_knl_extra_regs;
+
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+		pr_cont("Knights Landing events, ");
+		break;
+
 	case 78: /* 14nm Skylake Mobile */
 	case 94: /* 14nm Skylake Desktop */
 		x86_pmu.late_ack = true;
@@ -3521,7 +3625,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_skl_event_constraints;
 		x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_skl_extra_regs;
-		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+		x86_pmu.pebs_prec_dist = true;
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;

+ 31 - 8
arch/x86/kernel/cpu/perf_event_intel_ds.c

@@ -620,6 +620,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
 	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
 	EVENT_CONSTRAINT_END
 };
 
@@ -686,6 +688,8 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	/* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
 	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
 	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
 	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
@@ -700,6 +704,8 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
 	INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	/* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
 	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
 	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
 	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
@@ -718,9 +724,10 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
 
 struct event_constraint intel_skl_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
-	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	/* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2),
+	/* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
 	INTEL_PLD_CONSTRAINT(0x1cd, 0xf),		      /* MEM_TRANS_RETIRED.* */
 	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
 	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
@@ -1101,6 +1108,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit)
 	void *at;
 	u64 pebs_status;
 
+	/*
+	 * fmt0 does not have a status bitfield (does not use
+	 * perf_record_nhm format)
+	 */
+	if (x86_pmu.intel_cap.pebs_format < 1)
+		return base;
+
 	if (base == NULL)
 		return NULL;
 
@@ -1186,7 +1200,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (!event->attr.precise_ip)
 		return;
 
-	n = (top - at) / x86_pmu.pebs_record_size;
+	n = top - at;
 	if (n <= 0)
 		return;
 
@@ -1230,12 +1244,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		pebs_status = p->status & cpuc->pebs_enabled;
 		pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
 
+		/*
+		 * On some CPUs the PEBS status can be zero when PEBS is
+		 * racing with clearing of GLOBAL_STATUS.
+		 *
+		 * Normally we would drop that record, but in the
+		 * case when there is only a single active PEBS event
+		 * we can assume it's for that event.
+		 */
+		if (!pebs_status && cpuc->pebs_enabled &&
+			!(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+			pebs_status = cpuc->pebs_enabled;
+
 		bit = find_first_bit((unsigned long *)&pebs_status,
 					x86_pmu.max_pebs_events);
-		if (WARN(bit >= x86_pmu.max_pebs_events,
-			 "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx",
-			 (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled,
-			 *(unsigned long long *)cpuc->active_mask))
+		if (bit >= x86_pmu.max_pebs_events)
 			continue;
 
 		/*

+ 37 - 5
arch/x86/kernel/cpu/perf_event_intel_lbr.c

@@ -42,6 +42,13 @@ static enum {
 #define LBR_FAR_BIT		8 /* do not capture far branches */
 #define LBR_CALL_STACK_BIT	9 /* enable call stack */
 
+/*
+ * Following bit only exists in Linux; we mask it out before writing it to
+ * the actual MSR. But it helps the constraint perf code to understand
+ * that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT	       63 /* don't read LBR_INFO. */
+
 #define LBR_KERNEL	(1 << LBR_KERNEL_BIT)
 #define LBR_USER	(1 << LBR_USER_BIT)
 #define LBR_JCC		(1 << LBR_JCC_BIT)
@@ -52,6 +59,7 @@ static enum {
 #define LBR_IND_JMP	(1 << LBR_IND_JMP_BIT)
 #define LBR_FAR		(1 << LBR_FAR_BIT)
 #define LBR_CALL_STACK	(1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO	(1ULL << LBR_NO_INFO_BIT)
 
 #define LBR_PLM (LBR_KERNEL | LBR_USER)
 
@@ -152,8 +160,8 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	 * did not change.
 	 */
 	if (cpuc->lbr_sel)
-		lbr_select = cpuc->lbr_sel->config;
-	if (!pmi)
+		lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+	if (!pmi && cpuc->lbr_sel)
 		wrmsrl(MSR_LBR_SELECT, lbr_select);
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
@@ -422,6 +430,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  */
 static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
+	bool need_info = false;
 	unsigned long mask = x86_pmu.lbr_nr - 1;
 	int lbr_format = x86_pmu.intel_cap.lbr_format;
 	u64 tos = intel_pmu_lbr_tos();
@@ -429,8 +438,11 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	int out = 0;
 	int num = x86_pmu.lbr_nr;
 
-	if (cpuc->lbr_sel->config & LBR_CALL_STACK)
-		num = tos;
+	if (cpuc->lbr_sel) {
+		need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+		if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+			num = tos;
+	}
 
 	for (i = 0; i < num; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
@@ -442,7 +454,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
 		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
 
-		if (lbr_format == LBR_FORMAT_INFO) {
+		if (lbr_format == LBR_FORMAT_INFO && need_info) {
 			u64 info;
 
 			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
@@ -590,6 +602,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 		if (v != LBR_IGN)
 			mask |= v;
 	}
+
 	reg = &event->hw.branch_reg;
 	reg->idx = EXTRA_REG_LBR;
 
@@ -600,6 +613,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	 */
 	reg->config = mask ^ x86_pmu.lbr_sel_mask;
 
+	if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+	    (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+	    (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+		reg->config |= LBR_NO_INFO;
+
 	return 0;
 }
 
@@ -1028,3 +1046,17 @@ void __init intel_pmu_lbr_init_atom(void)
 	 */
 	pr_cont("8-deep LBR, ");
 }
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+	x86_pmu.lbr_nr	   = 8;
+	x86_pmu.lbr_tos    = MSR_LBR_TOS;
+	x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to     = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = snb_lbr_sel_map;
+
+	pr_cont("8-deep LBR, ");
+}

+ 9 - 0
arch/x86/kernel/cpu/perf_event_intel_pt.c

@@ -27,6 +27,7 @@
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 #include <asm/io.h>
+#include <asm/intel_pt.h>
 
 #include "perf_event.h"
 #include "intel_pt.h"
@@ -1122,6 +1123,14 @@ static int pt_event_init(struct perf_event *event)
 	return 0;
 }
 
+void cpu_emergency_stop_pt(void)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+	if (pt->handle.event)
+		pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
 static __init int pt_init(void)
 {
 	int ret, cpu, prior_warn = 0;

+ 6 - 19
arch/x86/kernel/cpu/perf_event_intel_rapl.c

@@ -63,7 +63,7 @@
 #define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */
 
 #define NR_RAPL_DOMAINS         0x4
-static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
 	"pp0-core",
 	"package",
 	"dram",
@@ -109,11 +109,11 @@ static struct kobj_attribute format_attr_##_var =		\
 
 #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
 
-#define RAPL_EVENT_ATTR_STR(_name, v, str)				\
-static struct perf_pmu_events_attr event_attr_##v = {			\
-	.attr		= __ATTR(_name, 0444, rapl_sysfs_show, NULL),	\
-	.id		= 0,						\
-	.event_str	= str,						\
+#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
+static struct perf_pmu_events_attr event_attr_##v = {				\
+	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
+	.id		= 0,							\
+	.event_str	= str,							\
 };
 
 struct rapl_pmu {
@@ -405,19 +405,6 @@ static struct attribute_group rapl_pmu_attr_group = {
 	.attrs = rapl_pmu_attrs,
 };
 
-static ssize_t rapl_sysfs_show(struct device *dev,
-			       struct device_attribute *attr,
-			       char *page)
-{
-	struct perf_pmu_events_attr *pmu_attr = \
-		container_of(attr, struct perf_pmu_events_attr, attr);
-
-	if (pmu_attr->event_str)
-		return sprintf(page, "%s", pmu_attr->event_str);
-
-	return 0;
-}
-
 RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
 RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
 RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");

+ 17 - 0
arch/x86/kernel/cpu/perf_event_intel_uncore.c

@@ -884,6 +884,15 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 	 * each box has a different function id.
 	 */
 	pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+	/* Knights Landing uses a common PCI device ID for multiple instances of
+	 * an uncore PMU device type. There is only one entry per device type in
+	 * the knl_uncore_pci_ids table in spite of multiple devices present for
+	 * some device types. Hence PCI device idx would be 0 for all devices.
+	 * So increment pmu pointer to point to an unused array element.
+	 */
+	if (boot_cpu_data.x86_model == 87)
+		while (pmu->func_id >= 0)
+			pmu++;
 	if (pmu->func_id < 0)
 		pmu->func_id = pdev->devfn;
 	else
@@ -966,6 +975,7 @@ static int __init uncore_pci_init(void)
 	case 63: /* Haswell-EP */
 		ret = hswep_uncore_pci_init();
 		break;
+	case 79: /* BDX-EP */
 	case 86: /* BDX-DE */
 		ret = bdx_uncore_pci_init();
 		break;
@@ -982,6 +992,9 @@ static int __init uncore_pci_init(void)
 	case 61: /* Broadwell */
 		ret = bdw_uncore_pci_init();
 		break;
+	case 87: /* Knights Landing */
+		ret = knl_uncore_pci_init();
+		break;
 	default:
 		return 0;
 	}
@@ -1287,9 +1300,13 @@ static int __init uncore_cpu_init(void)
 	case 63: /* Haswell-EP */
 		hswep_uncore_cpu_init();
 		break;
+	case 79: /* BDX-EP */
 	case 86: /* BDX-DE */
 		bdx_uncore_cpu_init();
 		break;
+	case 87: /* Knights Landing */
+		knl_uncore_cpu_init();
+		break;
 	default:
 		return 0;
 	}

+ 3 - 0
arch/x86/kernel/cpu/perf_event_intel_uncore.h

@@ -338,6 +338,7 @@ int hsw_uncore_pci_init(void);
 int bdw_uncore_pci_init(void);
 void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
+int snb_pci2phy_map_init(int devid);
 
 /* perf_event_intel_uncore_snbep.c */
 int snbep_uncore_pci_init(void);
@@ -348,6 +349,8 @@ int hswep_uncore_pci_init(void);
 void hswep_uncore_cpu_init(void);
 int bdx_uncore_pci_init(void);
 void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
 
 /* perf_event_intel_uncore_nhmex.c */
 void nhmex_uncore_cpu_init(void);

+ 1 - 1
arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c

@@ -417,7 +417,7 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
 	}
 }
 
-static int snb_pci2phy_map_init(int devid)
+int snb_pci2phy_map_init(int devid)
 {
 	struct pci_dev *dev = NULL;
 	struct pci2phy_map *map;

+ 629 - 6
arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c

@@ -209,31 +209,98 @@
 #define HSWEP_PCU_MSR_PMON_BOX_CTL		0x710
 #define HSWEP_PCU_MSR_PMON_BOX_FILTER		0x715
 
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+					(SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+						SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET			0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR		(1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+					(SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+					 KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID		0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE	(7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP		(0xfffffe2aULL << 32)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW		0x400
+#define KNL_UCLK_MSR_PMON_CTL0			0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL		0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW	0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL	0x454
+#define KNL_PMON_FIXED_CTL_EN			0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW		0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0		0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL		0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW	0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL	0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW		0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0		0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL		0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW		0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL		0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL		0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK		(SNBEP_PMON_RAW_EVENT_MASK | \
+						 KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK		0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR		(1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK		0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK	\
+				(KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+				 KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_CBO_PMON_CTL_TID_EN | \
+				 SNBEP_PMON_CTL_EV_SEL_EXT | \
+				 SNBEP_PMON_CTL_INVERT | \
+				 KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
 
 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
 DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
 DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
 DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
 DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
 DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
 DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
 DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
 DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
 DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
 DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
 DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
 DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
 DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
 DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
 DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
 DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
 DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
 DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
 DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
 DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
 DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
 DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
@@ -315,8 +382,9 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe
 static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
 {
 	struct pci_dev *pdev = box->pci_dev;
+	int box_ctl = uncore_pci_box_ctl(box);
 
-	pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
+	pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
 }
 
 static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
@@ -1728,6 +1796,419 @@ int ivbep_uncore_pci_init(void)
 }
 /* end of IvyTown uncore support */
 
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_tid_en.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh5.attr,
+	NULL,
+};
+
+static struct attribute_group knl_uncore_ubox_format_group = {
+	.name = "format",
+	.attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+	.name			= "ubox",
+	.num_counters		= 2,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.fixed_ctr_bits		= 48,
+	.perf_ctr		= HSWEP_U_MSR_PMON_CTR0,
+	.event_ctl		= HSWEP_U_MSR_PMON_CTL0,
+	.event_mask		= KNL_U_MSR_PMON_RAW_EVENT_MASK,
+	.fixed_ctr		= HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+	.fixed_ctl		= HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+	.ops			= &snbep_uncore_msr_ops,
+	.format_group		= &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_qor.attr,
+	&format_attr_edge.attr,
+	&format_attr_tid_en.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_filter_tid4.attr,
+	&format_attr_filter_link3.attr,
+	&format_attr_filter_state4.attr,
+	&format_attr_filter_local.attr,
+	&format_attr_filter_all_op.attr,
+	&format_attr_filter_nnm.attr,
+	&format_attr_filter_opc3.attr,
+	&format_attr_filter_nc.attr,
+	&format_attr_filter_isoc.attr,
+	NULL,
+};
+
+static struct attribute_group knl_uncore_cha_format_group = {
+	.name = "format",
+	.attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+	EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+	EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+	u64 mask = 0;
+
+	if (fields & 0x1)
+		mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+	if (fields & 0x2)
+		mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+	if (fields & 0x4)
+		mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+	return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+			     struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct extra_reg *er;
+	int idx = 0;
+
+	for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+		if (er->event != (event->hw.config & er->config_mask))
+			continue;
+		idx |= er->idx;
+	}
+
+	if (idx) {
+		reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+			    KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+		reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+		reg1->idx = idx;
+	}
+	return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+				    struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+	.init_box		= snbep_uncore_msr_init_box,
+	.disable_box		= snbep_uncore_msr_disable_box,
+	.enable_box		= snbep_uncore_msr_enable_box,
+	.disable_event		= snbep_uncore_msr_disable_event,
+	.enable_event		= hswep_cbox_enable_event,
+	.read_counter		= uncore_msr_read_counter,
+	.hw_config		= knl_cha_hw_config,
+	.get_constraint		= knl_cha_get_constraint,
+	.put_constraint		= snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+	.name			= "cha",
+	.num_counters		= 4,
+	.num_boxes		= 38,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= HSWEP_C0_MSR_PMON_CTL0,
+	.perf_ctr		= HSWEP_C0_MSR_PMON_CTR0,
+	.event_mask		= KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= HSWEP_C0_MSR_PMON_BOX_CTL,
+	.msr_offset		= KNL_CHA_MSR_OFFSET,
+	.num_shared_regs	= 1,
+	.constraints		= knl_uncore_cha_constraints,
+	.ops			= &knl_uncore_cha_ops,
+	.format_group		= &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+	&format_attr_event2.attr,
+	&format_attr_use_occ_ctr.attr,
+	&format_attr_occ_sel.attr,
+	&format_attr_edge.attr,
+	&format_attr_tid_en.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh6.attr,
+	&format_attr_occ_invert.attr,
+	&format_attr_occ_edge_det.attr,
+	NULL,
+};
+
+static struct attribute_group knl_uncore_pcu_format_group = {
+	.name = "format",
+	.attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+	.name			= "pcu",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= HSWEP_PCU_MSR_PMON_CTR0,
+	.event_ctl		= HSWEP_PCU_MSR_PMON_CTL0,
+	.event_mask		= KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= HSWEP_PCU_MSR_PMON_BOX_CTL,
+	.ops			= &snbep_uncore_msr_ops,
+	.format_group		= &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+	&knl_uncore_ubox,
+	&knl_uncore_cha,
+	&knl_uncore_pcu,
+	NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+	uncore_msr_uncores = knl_msr_uncores;
+}
+
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int box_ctl = uncore_pci_box_ctl(box);
+
+	pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+					struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+							== UNCORE_FIXED_EVENT)
+		pci_write_config_dword(pdev, hwc->config_base,
+				       hwc->config | KNL_PMON_FIXED_CTL_EN);
+	else
+		pci_write_config_dword(pdev, hwc->config_base,
+				       hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+	.init_box	= snbep_uncore_pci_init_box,
+	.disable_box	= snbep_uncore_pci_disable_box,
+	.enable_box	= knl_uncore_imc_enable_box,
+	.read_counter	= snbep_uncore_pci_read_counter,
+	.enable_event	= knl_uncore_imc_enable_event,
+	.disable_event	= snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+	.name			= "imc_uclk",
+	.num_counters		= 4,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.fixed_ctr_bits		= 48,
+	.perf_ctr		= KNL_UCLK_MSR_PMON_CTR0_LOW,
+	.event_ctl		= KNL_UCLK_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_PMON_RAW_EVENT_MASK,
+	.fixed_ctr		= KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+	.fixed_ctl		= KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+	.box_ctl		= KNL_UCLK_MSR_PMON_BOX_CTL,
+	.ops			= &knl_uncore_imc_ops,
+	.format_group		= &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+	.name			= "imc",
+	.num_counters		= 4,
+	.num_boxes		= 6,
+	.perf_ctr_bits		= 48,
+	.fixed_ctr_bits		= 48,
+	.perf_ctr		= KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+	.event_ctl		= KNL_MC0_CH0_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_PMON_RAW_EVENT_MASK,
+	.fixed_ctr		= KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+	.fixed_ctl		= KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+	.box_ctl		= KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+	.ops			= &knl_uncore_imc_ops,
+	.format_group		= &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+	.name			= "edc_uclk",
+	.num_counters		= 4,
+	.num_boxes		= 8,
+	.perf_ctr_bits		= 48,
+	.fixed_ctr_bits		= 48,
+	.perf_ctr		= KNL_UCLK_MSR_PMON_CTR0_LOW,
+	.event_ctl		= KNL_UCLK_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_PMON_RAW_EVENT_MASK,
+	.fixed_ctr		= KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+	.fixed_ctl		= KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+	.box_ctl		= KNL_UCLK_MSR_PMON_BOX_CTL,
+	.ops			= &knl_uncore_imc_ops,
+	.format_group		= &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+	.name			= "edc_eclk",
+	.num_counters		= 4,
+	.num_boxes		= 8,
+	.perf_ctr_bits		= 48,
+	.fixed_ctr_bits		= 48,
+	.perf_ctr		= KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+	.event_ctl		= KNL_EDC0_ECLK_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_PMON_RAW_EVENT_MASK,
+	.fixed_ctr		= KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+	.fixed_ctl		= KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+	.box_ctl		= KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+	.ops			= &knl_uncore_imc_ops,
+	.format_group		= &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+	.name		= "m2pcie",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	.constraints	= knl_uncore_m2pcie_constraints,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_qor.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	NULL,
+};
+
+static struct attribute_group knl_uncore_irp_format_group = {
+	.name = "format",
+	.attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+	.name			= "irp",
+	.num_counters		= 2,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCI_PMON_CTR0,
+	.event_ctl		= SNBEP_PCI_PMON_CTL0,
+	.event_mask		= KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+	.box_ctl		= KNL_IRP_PCI_PMON_BOX_CTL,
+	.ops			= &snbep_uncore_pci_ops,
+	.format_group		= &knl_uncore_irp_format_group,
+};
+
+enum {
+	KNL_PCI_UNCORE_MC_UCLK,
+	KNL_PCI_UNCORE_MC_DCLK,
+	KNL_PCI_UNCORE_EDC_UCLK,
+	KNL_PCI_UNCORE_EDC_ECLK,
+	KNL_PCI_UNCORE_M2PCIE,
+	KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+	[KNL_PCI_UNCORE_MC_UCLK]	= &knl_uncore_imc_uclk,
+	[KNL_PCI_UNCORE_MC_DCLK]	= &knl_uncore_imc_dclk,
+	[KNL_PCI_UNCORE_EDC_UCLK]	= &knl_uncore_edc_uclk,
+	[KNL_PCI_UNCORE_EDC_ECLK]	= &knl_uncore_edc_eclk,
+	[KNL_PCI_UNCORE_M2PCIE]		= &knl_uncore_m2pcie,
+	[KNL_PCI_UNCORE_IRP]		= &knl_uncore_irp,
+	NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. Prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ *	PCI Device ID	Uncore PMU Devices
+ *	----------------------------------
+ *	0x7841		MC0 UClk, MC1 UClk
+ *	0x7843		MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ *			MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ *	0x7833		EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ *			EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ *	0x7835		EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ *			EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ *	0x7817		M2PCIe
+ *	0x7814		IRP
+ */
+
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+	{ /* MC UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+	},
+	{ /* MC DClk Channel */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+	},
+	{ /* EDC UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+	},
+	{ /* EDC EClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+	},
+	{ /* M2PCIe */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+	},
+	{ /* IRP */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+	},
+	{ /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+	.name		= "knl_uncore",
+	.id_table	= knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+	int ret;
+
+	/* All KNL PCI based PMON units are on the same PCI bus except IRP */
+	ret = snb_pci2phy_map_init(0x7814); /* IRP */
+	if (ret)
+		return ret;
+	ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+	if (ret)
+		return ret;
+	uncore_pci_uncores = knl_pci_uncores;
+	uncore_pci_driver = &knl_uncore_pci_driver;
+	return 0;
+}
+
+/* end of KNL uncore support */
+
 /* Haswell-EP uncore support */
 static struct attribute *hswep_uncore_ubox_formats_attr[] = {
 	&format_attr_event.attr,
@@ -2338,7 +2819,7 @@ int hswep_uncore_pci_init(void)
 }
 /* end of Haswell-EP uncore support */
 
-/* BDX-DE uncore support */
+/* BDX uncore support */
 
 static struct intel_uncore_type bdx_uncore_ubox = {
 	.name			= "ubox",
@@ -2360,13 +2841,14 @@ static struct event_constraint bdx_uncore_cbox_constraints[] = {
 	UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
 	UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
 	UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
 	EVENT_CONSTRAINT_END
 };
 
 static struct intel_uncore_type bdx_uncore_cbox = {
 	.name			= "cbox",
 	.num_counters		= 4,
-	.num_boxes		= 8,
+	.num_boxes		= 24,
 	.perf_ctr_bits		= 48,
 	.event_ctl		= HSWEP_C0_MSR_PMON_CTL0,
 	.perf_ctr		= HSWEP_C0_MSR_PMON_CTR0,
@@ -2379,9 +2861,24 @@ static struct intel_uncore_type bdx_uncore_cbox = {
 	.format_group		= &hswep_uncore_cbox_format_group,
 };
 
+static struct intel_uncore_type bdx_uncore_sbox = {
+	.name			= "sbox",
+	.num_counters		= 4,
+	.num_boxes		= 4,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= HSWEP_S0_MSR_PMON_CTL0,
+	.perf_ctr		= HSWEP_S0_MSR_PMON_CTR0,
+	.event_mask		= HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= HSWEP_S0_MSR_PMON_BOX_CTL,
+	.msr_offset		= HSWEP_SBOX_MSR_OFFSET,
+	.ops			= &hswep_uncore_sbox_msr_ops,
+	.format_group		= &hswep_uncore_sbox_format_group,
+};
+
 static struct intel_uncore_type *bdx_msr_uncores[] = {
 	&bdx_uncore_ubox,
 	&bdx_uncore_cbox,
+	&bdx_uncore_sbox,
 	&hswep_uncore_pcu,
 	NULL,
 };
@@ -2396,7 +2893,7 @@ void bdx_uncore_cpu_init(void)
 static struct intel_uncore_type bdx_uncore_ha = {
 	.name		= "ha",
 	.num_counters   = 4,
-	.num_boxes	= 1,
+	.num_boxes	= 2,
 	.perf_ctr_bits	= 48,
 	SNBEP_UNCORE_PCI_COMMON_INIT(),
 };
@@ -2404,7 +2901,7 @@ static struct intel_uncore_type bdx_uncore_ha = {
 static struct intel_uncore_type bdx_uncore_imc = {
 	.name		= "imc",
 	.num_counters   = 5,
-	.num_boxes	= 2,
+	.num_boxes	= 8,
 	.perf_ctr_bits	= 48,
 	.fixed_ctr_bits	= 48,
 	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
@@ -2424,6 +2921,19 @@ static struct intel_uncore_type bdx_uncore_irp = {
 	.format_group		= &snbep_uncore_format_group,
 };
 
+static struct intel_uncore_type bdx_uncore_qpi = {
+	.name			= "qpi",
+	.num_counters		= 4,
+	.num_boxes		= 3,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCI_PMON_CTR0,
+	.event_ctl		= SNBEP_PCI_PMON_CTL0,
+	.event_mask		= SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &snbep_uncore_qpi_ops,
+	.format_group		= &snbep_uncore_qpi_format_group,
+};
 
 static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
 	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
@@ -2432,6 +2942,8 @@ static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
 	UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
 	UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
 	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
 	UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
 	EVENT_CONSTRAINT_END
 };
@@ -2445,18 +2957,65 @@ static struct intel_uncore_type bdx_uncore_r2pcie = {
 	SNBEP_UNCORE_PCI_COMMON_INIT(),
 };
 
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+	UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+	UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+	UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+	UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+	UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+	.name		= "r3qpi",
+	.num_counters   = 3,
+	.num_boxes	= 3,
+	.perf_ctr_bits	= 48,
+	.constraints	= bdx_uncore_r3qpi_constraints,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
 enum {
 	BDX_PCI_UNCORE_HA,
 	BDX_PCI_UNCORE_IMC,
 	BDX_PCI_UNCORE_IRP,
+	BDX_PCI_UNCORE_QPI,
 	BDX_PCI_UNCORE_R2PCIE,
+	BDX_PCI_UNCORE_R3QPI,
 };
 
 static struct intel_uncore_type *bdx_pci_uncores[] = {
 	[BDX_PCI_UNCORE_HA]	= &bdx_uncore_ha,
 	[BDX_PCI_UNCORE_IMC]	= &bdx_uncore_imc,
 	[BDX_PCI_UNCORE_IRP]	= &bdx_uncore_irp,
+	[BDX_PCI_UNCORE_QPI]	= &bdx_uncore_qpi,
 	[BDX_PCI_UNCORE_R2PCIE]	= &bdx_uncore_r2pcie,
+	[BDX_PCI_UNCORE_R3QPI]	= &bdx_uncore_r3qpi,
 	NULL,
 };
 
@@ -2465,6 +3024,10 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
 		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
 	},
+	{ /* Home Agent 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+	},
 	{ /* MC0 Channel 0 */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
 		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
@@ -2473,14 +3036,74 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
 		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
 	},
+	{ /* MC0 Channel 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+	},
+	{ /* MC0 Channel 3 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+	},
+	{ /* MC1 Channel 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+	},
+	{ /* MC1 Channel 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+	},
+	{ /* MC1 Channel 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+	},
+	{ /* MC1 Channel 3 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+	},
 	{ /* IRP */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
 		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
 	},
+	{ /* QPI0 Port 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+	},
+	{ /* QPI0 Port 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+	},
+	{ /* QPI1 Port 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+	},
 	{ /* R2PCIe */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
 		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
 	},
+	{ /* R3QPI0 Link 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+	},
+	{ /* R3QPI0 Link 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+	},
+	{ /* R3QPI1 Link 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+	},
+	{ /* QPI Port 0 filter  */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
+	},
+	{ /* QPI Port 1 filter  */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
+	},
+	{ /* QPI Port 2 filter  */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
+	},
 	{ /* end: all zeroes */ }
 };
 
@@ -2500,4 +3123,4 @@ int bdx_uncore_pci_init(void)
 	return 0;
 }
 
-/* end of BDX-DE uncore support */
+/* end of BDX uncore support */
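
The comment block above carries the real story of the KNL hunk: a single PCI device ID covers every instance of a PMU type, so the per-instance index has to travel in driver_data. Below is a minimal sketch of that pack/unpack convention, assuming a type-in-upper-bits, index-in-low-byte layout (the EX_* helper names are hypothetical; the in-tree macro is UNCORE_PCI_DEV_DATA). On BDX, where each IMC channel keeps its own device ID, the table simply assigns a distinct index per entry.

    #include <stdio.h>

    /* Hypothetical helpers mirroring what UNCORE_PCI_DEV_DATA appears to do. */
    #define EX_PCI_DEV_DATA(type, idx)	(((type) << 8) | (idx))
    #define EX_PCI_DEV_TYPE(data)		((data) >> 8)
    #define EX_PCI_DEV_IDX(data)		((data) & 0xff)

    int main(void)
    {
    	/* e.g. two IMC channels: same uncore type, distinct index */
    	unsigned long ch0 = EX_PCI_DEV_DATA(1 /* IMC */, 0);
    	unsigned long ch1 = EX_PCI_DEV_DATA(1 /* IMC */, 1);

    	printf("type=%lu idx=%lu\n", EX_PCI_DEV_TYPE(ch0), EX_PCI_DEV_IDX(ch0));
    	printf("type=%lu idx=%lu\n", EX_PCI_DEV_TYPE(ch1), EX_PCI_DEV_IDX(ch1));
    	return 0;
    }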

+ 11 - 0
arch/x86/kernel/crash.c

@@ -35,6 +35,7 @@
 #include <asm/cpu.h>
 #include <asm/reboot.h>
 #include <asm/virtext.h>
+#include <asm/intel_pt.h>
 
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
@@ -125,6 +126,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 	cpu_emergency_vmxoff();
 	cpu_emergency_svm_disable();
 
+	/*
+	 * Disable Intel PT to stop its logging
+	 */
+	cpu_emergency_stop_pt();
+
 	disable_local_APIC();
 }
 
@@ -169,6 +175,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 	cpu_emergency_vmxoff();
 	cpu_emergency_svm_disable();
 
+	/*
+	 * Disable Intel PT to stop its logging
+	 */
+	cpu_emergency_stop_pt();
+
 #ifdef CONFIG_X86_IO_APIC
 	/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
 	ioapic_zap_locks();
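
Both crash paths above now stop Intel PT before the local APIC is disabled, so a tracing CPU cannot keep writing into its output buffers while kdump takes over. At the hardware level that boils down to clearing TraceEn in IA32_RTIT_CTL on each CPU; the userspace sketch below illustrates the same bit flip via the msr driver (illustration only — the kernel helper runs per-CPU from the NMI/crash path):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define IA32_RTIT_CTL		0x570
    #define RTIT_CTL_TRACEEN	(1ULL << 0)

    int main(void)
    {
    	uint64_t ctl;
    	int fd = open("/dev/cpu/0/msr", O_RDWR);	/* needs the msr driver, root */

    	if (fd < 0 || pread(fd, &ctl, 8, IA32_RTIT_CTL) != 8)
    		return perror("rdmsr"), 1;

    	ctl &= ~RTIT_CTL_TRACEEN;			/* stop packet generation */
    	if (pwrite(fd, &ctl, 8, IA32_RTIT_CTL) != 8)
    		return perror("wrmsr"), 1;

    	close(fd);
    	return 0;
    }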

+ 26 - 0
arch/x86/lib/msr.c

@@ -1,6 +1,8 @@
 #include <linux/module.h>
 #include <linux/preempt.h>
 #include <asm/msr.h>
+#define CREATE_TRACE_POINTS
+#include <asm/msr-trace.h>
 
 struct msr *msrs_alloc(void)
 {
@@ -108,3 +110,27 @@ int msr_clear_bit(u32 msr, u8 bit)
 {
 	return __flip_bit(msr, bit, false);
 }
+
+#ifdef CONFIG_TRACEPOINTS
+void do_trace_write_msr(unsigned msr, u64 val, int failed)
+{
+	trace_write_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_write_msr);
+EXPORT_TRACEPOINT_SYMBOL(write_msr);
+
+void do_trace_read_msr(unsigned msr, u64 val, int failed)
+{
+	trace_read_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_read_msr);
+EXPORT_TRACEPOINT_SYMBOL(read_msr);
+
+void do_trace_rdpmc(unsigned counter, u64 val, int failed)
+{
+	trace_rdpmc(counter, val, failed);
+}
+EXPORT_SYMBOL(do_trace_rdpmc);
+EXPORT_TRACEPOINT_SYMBOL(rdpmc);
+
+#endif
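
With the tracepoints exported above, MSR traffic becomes observable through the regular trace event machinery. A small sketch that arms the new write_msr event, assuming tracefs is mounted at /sys/kernel/debug/tracing and the events are grouped under events/msr/ (per the TRACE_SYSTEM in asm/msr-trace.h):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
    	const char *en =
    		"/sys/kernel/debug/tracing/events/msr/write_msr/enable";
    	int fd = open(en, O_WRONLY);

    	if (fd < 0)
    		return perror(en), 1;
    	if (write(fd, "1", 1) != 1)	/* arm the tracepoint */
    		return perror("write"), 1;
    	close(fd);

    	/* subsequent MSR writes appear in .../trace_pipe */
    	return 0;
    }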

+ 27 - 0
include/linux/tracepoint-defs.h

@@ -0,0 +1,27 @@
+#ifndef TRACEPOINT_DEFS_H
+#define TRACEPOINT_DEFS_H 1
+
+/*
+ * This file can be included directly by headers that only want to access
+ * tracepoint->key to guard out-of-line trace calls. Otherwise
+ * linux/tracepoint.h should be used.
+ */
+
+#include <linux/atomic.h>
+#include <linux/static_key.h>
+
+struct tracepoint_func {
+	void *func;
+	void *data;
+	int prio;
+};
+
+struct tracepoint {
+	const char *name;		/* Tracepoint name */
+	struct static_key key;
+	void (*regfunc)(void);
+	void (*unregfunc)(void);
+	struct tracepoint_func __rcu *funcs;
+};
+
+#endif
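
The point of the new header is to let hot-path code test a tracepoint's static key without pulling in all of linux/tracepoint.h. A kernel-context sketch of the pattern (not a standalone program; the wrapper name is illustrative, but the MSR accessors added in this series use exactly this shape):

    #include <linux/tracepoint-defs.h>

    /* Exported by arch/x86/lib/msr.c in this series. */
    extern struct tracepoint __tracepoint_write_msr;
    extern void do_trace_write_msr(unsigned msr, u64 val, int failed);

    static inline void example_wrmsr_traced(unsigned msr, u64 val)
    {
    	/* ...the actual WRMSR would happen here... */

    	/* Only take the out-of-line call when the tracepoint is armed. */
    	if (static_key_false(&__tracepoint_write_msr.key))
    		do_trace_write_msr(msr, val, 0);
    }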

+ 1 - 15
include/linux/tracepoint.h

@@ -17,26 +17,12 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/rcupdate.h>
-#include <linux/static_key.h>
+#include <linux/tracepoint-defs.h>
 
 struct module;
 struct tracepoint;
 struct notifier_block;
 
-struct tracepoint_func {
-	void *func;
-	void *data;
-	int prio;
-};
-
-struct tracepoint {
-	const char *name;		/* Tracepoint name */
-	struct static_key key;
-	void (*regfunc)(void);
-	void (*unregfunc)(void);
-	struct tracepoint_func __rcu *funcs;
-};
-
 struct trace_enum_map {
 	const char		*system;
 	const char		*enum_string;

+ 6 - 0
include/uapi/linux/perf_event.h

@@ -171,6 +171,9 @@ enum perf_branch_sample_type_shift {
 	PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT	= 12, /* indirect jumps */
 	PERF_SAMPLE_BRANCH_CALL_SHIFT		= 13, /* direct call */
 
+	PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT	= 14, /* no flags */
+	PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT	= 15, /* no cycles */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -192,6 +195,9 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_IND_JUMP	= 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT,
 	PERF_SAMPLE_BRANCH_CALL		= 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT,
 
+	PERF_SAMPLE_BRANCH_NO_FLAGS	= 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
+	PERF_SAMPLE_BRANCH_NO_CYCLES	= 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
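
The two new bits let a consumer ask the kernel to skip collecting branch flags and cycle counts in each LBR entry, which is what enables the cheaper LBR format. A minimal user-level sketch, assuming a kernel with this patch applied; the event choice and period are illustrative:

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
    	struct perf_event_attr attr;
    	int fd;

    	memset(&attr, 0, sizeof(attr));
    	attr.size = sizeof(attr);
    	attr.type = PERF_TYPE_HARDWARE;
    	attr.config = PERF_COUNT_HW_CPU_CYCLES;
    	attr.sample_period = 100000;		/* illustrative */
    	attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
    	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
    				  PERF_SAMPLE_BRANCH_NO_FLAGS |
    				  PERF_SAMPLE_BRANCH_NO_CYCLES;

    	/* pid=0, cpu=-1: this thread, any CPU */
    	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    	return fd < 0;
    }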
 

+ 109 - 177
kernel/events/core.c

@@ -126,6 +126,37 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
 	return data.ret;
 }
 
+static void event_function_call(struct perf_event *event,
+				int (*active)(void *),
+				void (*inactive)(void *),
+				void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		cpu_function_call(event->cpu, active, data);
+		return;
+	}
+
+again:
+	if (!task_function_call(task, active, data))
+		return;
+
+	raw_spin_lock_irq(&ctx->lock);
+	if (ctx->is_active) {
+		/*
+		 * Reload the task pointer, it might have been changed by
+		 * a concurrent perf_event_context_sched_out().
+		 */
+		task = ctx->task;
+		raw_spin_unlock_irq(&ctx->lock);
+		goto again;
+	}
+	inactive(data);
+	raw_spin_unlock_irq(&ctx->lock);
+}
+
 #define EVENT_OWNER_KERNEL ((void *) -1)
 
 static bool is_kernel_event(struct perf_event *event)
@@ -1629,6 +1660,17 @@ struct remove_event {
 	bool detach_group;
 };
 
+static void ___perf_remove_from_context(void *info)
+{
+	struct remove_event *re = info;
+	struct perf_event *event = re->event;
+	struct perf_event_context *ctx = event->ctx;
+
+	if (re->detach_group)
+		perf_group_detach(event);
+	list_del_event(event, ctx);
+}
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -1656,7 +1698,6 @@ static int __perf_remove_from_context(void *info)
 	return 0;
 }
 
-
 /*
  * Remove the event from a task's (or a CPU's) list of events.
  *
@@ -1673,7 +1714,6 @@ static int __perf_remove_from_context(void *info)
 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
 	struct remove_event re = {
 		.event = event,
 		.detach_group = detach_group,
@@ -1681,44 +1721,8 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
 
 	lockdep_assert_held(&ctx->mutex);
 
-	if (!task) {
-		/*
-		 * Per cpu events are removed via an smp call. The removal can
-		 * fail if the CPU is currently offline, but in that case we
-		 * already called __perf_remove_from_context from
-		 * perf_event_exit_cpu.
-		 */
-		cpu_function_call(event->cpu, __perf_remove_from_context, &re);
-		return;
-	}
-
-retry:
-	if (!task_function_call(task, __perf_remove_from_context, &re))
-		return;
-
-	raw_spin_lock_irq(&ctx->lock);
-	/*
-	 * If we failed to find a running task, but find the context active now
-	 * that we've acquired the ctx->lock, retry.
-	 */
-	if (ctx->is_active) {
-		raw_spin_unlock_irq(&ctx->lock);
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		goto retry;
-	}
-
-	/*
-	 * Since the task isn't running, its safe to remove the event, us
-	 * holding the ctx->lock ensures the task won't get scheduled in.
-	 */
-	if (detach_group)
-		perf_group_detach(event);
-	list_del_event(event, ctx);
-	raw_spin_unlock_irq(&ctx->lock);
+	event_function_call(event, __perf_remove_from_context,
+			    ___perf_remove_from_context, &re);
 }
 
 /*
@@ -1762,6 +1766,20 @@ int __perf_event_disable(void *info)
 	return 0;
 }
 
+void ___perf_event_disable(void *info)
+{
+	struct perf_event *event = info;
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		update_group_times(event);
+		event->state = PERF_EVENT_STATE_OFF;
+	}
+}
+
 /*
  * Disable a event.
  *
@@ -1778,43 +1796,16 @@ int __perf_event_disable(void *info)
 static void _perf_event_disable(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		/*
-		 * Disable the event on the cpu that it's on
-		 */
-		cpu_function_call(event->cpu, __perf_event_disable, event);
-		return;
-	}
-
-retry:
-	if (!task_function_call(task, __perf_event_disable, event))
-		return;
 
 	raw_spin_lock_irq(&ctx->lock);
-	/*
-	 * If the event is still active, we need to retry the cross-call.
-	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE) {
+	if (event->state <= PERF_EVENT_STATE_OFF) {
 		raw_spin_unlock_irq(&ctx->lock);
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		goto retry;
-	}
-
-	/*
-	 * Since we have the lock this context can't be scheduled
-	 * in, so we can change the state safely.
-	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE) {
-		update_group_times(event);
-		event->state = PERF_EVENT_STATE_OFF;
+		return;
 	}
 	raw_spin_unlock_irq(&ctx->lock);
+
+	event_function_call(event, __perf_event_disable,
+			    ___perf_event_disable, event);
 }
 
 /*
@@ -2067,6 +2058,18 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 }
 
+static void ___perf_install_in_context(void *info)
+{
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
+
+	/*
+	 * Since the task isn't running, it's safe to add the event; holding
+	 * the ctx->lock ensures the task won't get scheduled in.
+	 */
+	add_event_to_ctx(event, ctx);
+}
+
 /*
  * Cross CPU call to install and enable a performance event
  *
@@ -2143,48 +2146,14 @@ perf_install_in_context(struct perf_event_context *ctx,
 			struct perf_event *event,
 			int cpu)
 {
-	struct task_struct *task = ctx->task;
-
 	lockdep_assert_held(&ctx->mutex);
 
 	event->ctx = ctx;
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
-	if (!task) {
-		/*
-		 * Per cpu events are installed via an smp call and
-		 * the install is always successful.
-		 */
-		cpu_function_call(cpu, __perf_install_in_context, event);
-		return;
-	}
-
-retry:
-	if (!task_function_call(task, __perf_install_in_context, event))
-		return;
-
-	raw_spin_lock_irq(&ctx->lock);
-	/*
-	 * If we failed to find a running task, but find the context active now
-	 * that we've acquired the ctx->lock, retry.
-	 */
-	if (ctx->is_active) {
-		raw_spin_unlock_irq(&ctx->lock);
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		goto retry;
-	}
-
-	/*
-	 * Since the task isn't running, its safe to add the event, us holding
-	 * the ctx->lock ensures the task won't get scheduled in.
-	 */
-	add_event_to_ctx(event, ctx);
-	raw_spin_unlock_irq(&ctx->lock);
+	event_function_call(event, __perf_install_in_context,
+			    ___perf_install_in_context, event);
 }
 
 /*
@@ -2287,6 +2256,11 @@ unlock:
 	return 0;
 }
 
+void ___perf_event_enable(void *info)
+{
+	__perf_event_mark_enabled((struct perf_event *)info);
+}
+
 /*
  * Enable a event.
  *
@@ -2299,58 +2273,26 @@ unlock:
 static void _perf_event_enable(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
 
-	if (!task) {
-		/*
-		 * Enable the event on the cpu that it's on
-		 */
-		cpu_function_call(event->cpu, __perf_event_enable, event);
+	raw_spin_lock_irq(&ctx->lock);
+	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
 
-	raw_spin_lock_irq(&ctx->lock);
-	if (event->state >= PERF_EVENT_STATE_INACTIVE)
-		goto out;
-
 	/*
 	 * If the event is in error state, clear that first.
-	 * That way, if we see the event in error state below, we
-	 * know that it has gone back into error state, as distinct
-	 * from the task having been scheduled away before the
-	 * cross-call arrived.
+	 *
+	 * That way, if we see the event in error state below, we know that it
+	 * has gone back into error state, as distinct from the task having
+	 * been scheduled away before the cross-call arrived.
 	 */
 	if (event->state == PERF_EVENT_STATE_ERROR)
 		event->state = PERF_EVENT_STATE_OFF;
-
-retry:
-	if (!ctx->is_active) {
-		__perf_event_mark_enabled(event);
-		goto out;
-	}
-
 	raw_spin_unlock_irq(&ctx->lock);
 
-	if (!task_function_call(task, __perf_event_enable, event))
-		return;
-
-	raw_spin_lock_irq(&ctx->lock);
-
-	/*
-	 * If the context is active and the event is still off,
-	 * we need to retry the cross-call.
-	 */
-	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
-		/*
-		 * task could have been flipped by a concurrent
-		 * perf_event_context_sched_out()
-		 */
-		task = ctx->task;
-		goto retry;
-	}
-
-out:
-	raw_spin_unlock_irq(&ctx->lock);
+	event_function_call(event, __perf_event_enable,
+			    ___perf_event_enable, event);
 }
 
 /*
@@ -4149,6 +4091,22 @@ struct period_event {
 	u64 value;
 };
 
+static void ___perf_event_period(void *info)
+{
+	struct period_event *pe = info;
+	struct perf_event *event = pe->event;
+	u64 value = pe->value;
+
+	if (event->attr.freq) {
+		event->attr.sample_freq = value;
+	} else {
+		event->attr.sample_period = value;
+		event->hw.sample_period = value;
+	}
+
+	local64_set(&event->hw.period_left, 0);
+}
+
 static int __perf_event_period(void *info)
 {
 	struct period_event *pe = info;
@@ -4185,8 +4143,6 @@ static int __perf_event_period(void *info)
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
 	struct period_event pe = { .event = event, };
-	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task;
 	u64 value;
 
 	if (!is_sampling_event(event))
@@ -4201,34 +4157,10 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
 	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
 		return -EINVAL;
 
-	task = ctx->task;
 	pe.value = value;
 
-	if (!task) {
-		cpu_function_call(event->cpu, __perf_event_period, &pe);
-		return 0;
-	}
-
-retry:
-	if (!task_function_call(task, __perf_event_period, &pe))
-		return 0;
-
-	raw_spin_lock_irq(&ctx->lock);
-	if (ctx->is_active) {
-		raw_spin_unlock_irq(&ctx->lock);
-		task = ctx->task;
-		goto retry;
-	}
-
-	if (event->attr.freq) {
-		event->attr.sample_freq = value;
-	} else {
-		event->attr.sample_period = value;
-		event->hw.sample_period = value;
-	}
-
-	local64_set(&event->hw.period_left, 0);
-	raw_spin_unlock_irq(&ctx->lock);
+	event_function_call(event, __perf_event_period,
+			    ___perf_event_period, &pe);
 
 	return 0;
 }
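
The refactoring above replaces four hand-rolled copies of the same IPI-or-lock retry loop with one helper: 'active' runs on the event's CPU via cross-call while the context is scheduled in, and 'inactive' runs under ctx->lock once it is not. A kernel-context sketch of how a new operation would plug in (the 'frob' handlers are hypothetical):

    /* Cross-call handler: the context is active, we run on its CPU. */
    static int __perf_event_frob(void *info)
    {
    	struct perf_event *event = info;

    	/* ...manipulate the live event... */
    	return 0;
    }

    /* Fallback: ctx->lock is held and the context is inactive. */
    static void ___perf_event_frob(void *info)
    {
    	struct perf_event *event = info;

    	/* ...same change; the task can't be scheduled in under the lock... */
    }

    static void perf_event_frob(struct perf_event *event)
    {
    	event_function_call(event, __perf_event_frob,
    			    ___perf_event_frob, event);
    }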

+ 4 - 0
lib/atomic64_test.c

@@ -16,6 +16,10 @@
 #include <linux/kernel.h>
 #include <linux/atomic.h>
 
+#ifdef CONFIG_X86
+#include <asm/processor.h>	/* for boot_cpu_has below */
+#endif
+
 #define TEST(bit, op, c_op, val)				\
 do {								\
 	atomic##bit##_set(&v, v0);				\

+ 1 - 1
tools/Makefile

@@ -96,7 +96,7 @@ cgroup_install firewire_install hv_install lguest_install perf_install usb_insta
 	$(call descend,$(@:_install=),install)
 
 selftests_install:
-	$(call descend,testing/$(@:_clean=),install)
+	$(call descend,testing/$(@:_install=),install)
 
 turbostat_install x86_energy_perf_policy_install:
 	$(call descend,power/x86/$(@:_install=),install)

+ 1 - 1
tools/build/Makefile

@@ -25,7 +25,7 @@ export Q srctree CC LD
 MAKEFLAGS := --no-print-directory
 build     := -f $(srctree)/tools/build/Makefile.build dir=. obj
 
-all: fixdep
+all: $(OUTPUT)fixdep
 
 clean:
 	$(call QUIET_CLEAN, fixdep)

+ 28 - 16
tools/build/Makefile.feature

@@ -7,7 +7,7 @@ endif
 
 feature_check = $(eval $(feature_check_code))
 define feature_check_code
-  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)
+  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) $(OUTPUT_FEATURES)test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)
 endef
 
 feature_set = $(eval $(feature_set_code))
@@ -101,7 +101,6 @@ ifeq ($(feature-all), 1)
   #
   $(foreach feat,$(FEATURE_TESTS),$(call feature_set,$(feat)))
 else
-  $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS)" LDFLAGS=$(LDFLAGS) -i -j -C $(feature_dir) $(addsuffix .bin,$(FEATURE_TESTS)) >/dev/null 2>&1)
   $(foreach feat,$(FEATURE_TESTS),$(call feature_check,$(feat)))
 endif
 
@@ -123,13 +122,31 @@ define feature_print_text_code
     MSG = $(shell printf '...%30s: %s' $(1) $(2))
 endef
 
+#
+# Generates a feature value assignment for a name, like:
+#   $(call feature_assign,dwarf) == feature-dwarf=1
+#
+feature_assign = feature-$(1)=$(feature-$(1))
+
 FEATURE_DUMP_FILENAME = $(OUTPUT)FEATURE-DUMP$(FEATURE_USER)
-FEATURE_DUMP := $(foreach feat,$(FEATURE_DISPLAY),feature-$(feat)($(feature-$(feat))))
-FEATURE_DUMP_FILE := $(shell touch $(FEATURE_DUMP_FILENAME); cat $(FEATURE_DUMP_FILENAME))
+FEATURE_DUMP := $(shell touch $(FEATURE_DUMP_FILENAME); cat $(FEATURE_DUMP_FILENAME))
 
-ifeq ($(dwarf-post-unwind),1)
-  FEATURE_DUMP += dwarf-post-unwind($(dwarf-post-unwind-text))
-endif
+feature_dump_check = $(eval $(feature_dump_check_code))
+define feature_dump_check_code
+  ifeq ($(findstring $(1),$(FEATURE_DUMP)),)
+    $(2) := 1
+  endif
+endef
+
+#
+# First check if any test from FEATURE_DISPLAY changed,
+# and set feature_display := 1 if it did
+$(foreach feat,$(FEATURE_DISPLAY),$(call feature_dump_check,$(call feature_assign,$(feat)),feature_display))
+
+#
+# Now also check if any other test changed,
+# so we force FEATURE-DUMP generation
+$(foreach feat,$(FEATURE_TESTS),$(call feature_dump_check,$(call feature_assign,$(feat)),feature_dump_changed))
 
 # The $(feature_display) controls the default detection message
 # output. It's set if:
@@ -138,13 +155,13 @@ endif
 # - one of the $(FEATURE_DISPLAY) is not detected
 # - VF is enabled
 
-ifneq ("$(FEATURE_DUMP)","$(FEATURE_DUMP_FILE)")
-  $(shell echo "$(FEATURE_DUMP)" > $(FEATURE_DUMP_FILENAME))
-  feature_display := 1
+ifeq ($(feature_dump_changed),1)
+  $(shell rm -f $(FEATURE_DUMP_FILENAME))
+  $(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME)))
 endif
 
 feature_display_check = $(eval $(feature_check_display_code))
-define feature_display_check_code
+define feature_check_display_code
   ifneq ($(feature-$(1)), 1)
     feature_display := 1
   endif
@@ -161,11 +178,6 @@ ifeq ($(feature_display),1)
   $(info )
   $(info Auto-detecting system features:)
   $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),))
-
-  ifeq ($(dwarf-post-unwind),1)
-    $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text))
-  endif
-
   ifneq ($(feature_verbose),1)
     $(info )
   endif

+ 1 - 1
tools/build/Makefile.include

@@ -4,7 +4,7 @@ ifdef CROSS_COMPILE
 fixdep:
 else
 fixdep:
-	$(Q)$(MAKE) -C $(srctree)/tools/build fixdep
+	$(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep
 endif
 
 .PHONY: fixdep

+ 47 - 46
tools/build/feature/Makefile

@@ -1,4 +1,3 @@
-
 FILES=					\
 	test-all.bin			\
 	test-backtrace.bin		\
@@ -38,38 +37,40 @@ FILES=					\
 	test-bpf.bin			\
 	test-get_cpuid.bin
 
+FILES := $(addprefix $(OUTPUT),$(FILES))
+
 CC := $(CROSS_COMPILE)gcc -MD
 PKG_CONFIG := $(CROSS_COMPILE)pkg-config
 
 all: $(FILES)
 
-__BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $(OUTPUT)$@ $(patsubst %.bin,%.c,$@) $(LDFLAGS)
-  BUILD = $(__BUILD) > $(OUTPUT)$(@:.bin=.make.output) 2>&1
+__BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS)
+  BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1
 
 ###############################
 
-test-all.bin:
+$(OUTPUT)test-all.bin:
 	$(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma
 
-test-hello.bin:
+$(OUTPUT)test-hello.bin:
 	$(BUILD)
 
-test-pthread-attr-setaffinity-np.bin:
+$(OUTPUT)test-pthread-attr-setaffinity-np.bin:
 	$(BUILD) -D_GNU_SOURCE -lpthread
 
-test-stackprotector-all.bin:
+$(OUTPUT)test-stackprotector-all.bin:
 	$(BUILD) -fstack-protector-all
 
-test-fortify-source.bin:
+$(OUTPUT)test-fortify-source.bin:
 	$(BUILD) -O2 -D_FORTIFY_SOURCE=2
 
-test-bionic.bin:
+$(OUTPUT)test-bionic.bin:
 	$(BUILD)
 
-test-libelf.bin:
+$(OUTPUT)test-libelf.bin:
 	$(BUILD) -lelf
 
-test-glibc.bin:
+$(OUTPUT)test-glibc.bin:
 	$(BUILD)
 
 DWARFLIBS := -ldw
@@ -77,37 +78,37 @@ ifeq ($(findstring -static,${LDFLAGS}),-static)
 DWARFLIBS += -lelf -lebl -lz -llzma -lbz2
 endif
 
-test-dwarf.bin:
+$(OUTPUT)test-dwarf.bin:
 	$(BUILD) $(DWARFLIBS)
 
-test-libelf-mmap.bin:
+$(OUTPUT)test-libelf-mmap.bin:
 	$(BUILD) -lelf
 
-test-libelf-getphdrnum.bin:
+$(OUTPUT)test-libelf-getphdrnum.bin:
 	$(BUILD) -lelf
 
-test-libnuma.bin:
+$(OUTPUT)test-libnuma.bin:
 	$(BUILD) -lnuma
 
-test-numa_num_possible_cpus.bin:
+$(OUTPUT)test-numa_num_possible_cpus.bin:
 	$(BUILD) -lnuma
 
-test-libunwind.bin:
+$(OUTPUT)test-libunwind.bin:
 	$(BUILD) -lelf
 
-test-libunwind-debug-frame.bin:
+$(OUTPUT)test-libunwind-debug-frame.bin:
 	$(BUILD) -lelf
 
-test-libaudit.bin:
+$(OUTPUT)test-libaudit.bin:
 	$(BUILD) -laudit
 
-test-libslang.bin:
+$(OUTPUT)test-libslang.bin:
 	$(BUILD) -I/usr/include/slang -lslang
 
-test-gtk2.bin:
+$(OUTPUT)test-gtk2.bin:
 	$(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
 
-test-gtk2-infobar.bin:
+$(OUTPUT)test-gtk2-infobar.bin:
 	$(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
 
 grep-libs  = $(filter -l%,$(1))
@@ -119,63 +120,63 @@ PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
 PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
 FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 
-test-libperl.bin:
+$(OUTPUT)test-libperl.bin:
 	$(BUILD) $(FLAGS_PERL_EMBED)
 
-test-libpython.bin:
+$(OUTPUT)test-libpython.bin:
 	$(BUILD)
 
-test-libpython-version.bin:
+$(OUTPUT)test-libpython-version.bin:
 	$(BUILD)
 
-test-libbfd.bin:
+$(OUTPUT)test-libbfd.bin:
 	$(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl
 
-test-liberty.bin:
-	$(CC) $(CFLAGS) -Wall -Werror -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty
+$(OUTPUT)test-liberty.bin:
+	$(CC) $(CFLAGS) -Wall -Werror -o $@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty
 
-test-liberty-z.bin:
-	$(CC) $(CFLAGS) -Wall -Werror -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty -lz
+$(OUTPUT)test-liberty-z.bin:
+	$(CC) $(CFLAGS) -Wall -Werror -o $@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty -lz
 
-test-cplus-demangle.bin:
+$(OUTPUT)test-cplus-demangle.bin:
 	$(BUILD) -liberty
 
-test-backtrace.bin:
+$(OUTPUT)test-backtrace.bin:
 	$(BUILD)
 
-test-timerfd.bin:
+$(OUTPUT)test-timerfd.bin:
 	$(BUILD)
 
-test-libdw-dwarf-unwind.bin:
+$(OUTPUT)test-libdw-dwarf-unwind.bin:
 	$(BUILD) # -ldw provided by $(FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind)
 
-test-libbabeltrace.bin:
+$(OUTPUT)test-libbabeltrace.bin:
 	$(BUILD) # -lbabeltrace provided by $(FEATURE_CHECK_LDFLAGS-libbabeltrace)
 
-test-sync-compare-and-swap.bin:
+$(OUTPUT)test-sync-compare-and-swap.bin:
 	$(BUILD)
 
-test-compile-32.bin:
-	$(CC) -m32 -o $(OUTPUT)$@ test-compile.c
+$(OUTPUT)test-compile-32.bin:
+	$(CC) -m32 -o $@ test-compile.c
 
-test-compile-x32.bin:
-	$(CC) -mx32 -o $(OUTPUT)$@ test-compile.c
+$(OUTPUT)test-compile-x32.bin:
+	$(CC) -mx32 -o $@ test-compile.c
 
-test-zlib.bin:
+$(OUTPUT)test-zlib.bin:
 	$(BUILD) -lz
 
-test-lzma.bin:
+$(OUTPUT)test-lzma.bin:
 	$(BUILD) -llzma
 
-test-get_cpuid.bin:
+$(OUTPUT)test-get_cpuid.bin:
 	$(BUILD)
 
-test-bpf.bin:
+$(OUTPUT)test-bpf.bin:
 	$(BUILD)
 
--include *.d
+-include $(OUTPUT)*.d
 
 ###############################
 
 clean:
-	rm -f $(FILES) *.d $(FILES:.bin=.make.output)
+	rm -f $(FILES) $(OUTPUT)*.d $(FILES:.bin=.make.output)

+ 2 - 0
tools/perf/util/include/linux/bitmap.h → tools/include/linux/bitmap.h

@@ -11,6 +11,8 @@ int __bitmap_weight(const unsigned long *bitmap, int bits);
 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, int bits);
 
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
+
 #define BITMAP_LAST_WORD_MASK(nbits)					\
 (									\
 	((nbits) % BITS_PER_LONG) ?					\

+ 15 - 0
tools/include/linux/string.h

@@ -0,0 +1,15 @@
+#ifndef _TOOLS_LINUX_STRING_H_
+#define _TOOLS_LINUX_STRING_H_
+
+
+#include <linux/types.h>	/* for size_t */
+
+void *memdup(const void *src, size_t len);
+
+int strtobool(const char *s, bool *res);
+
+#ifndef __UCLIBC__
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+#endif
+
+#endif /* _TOOLS_LINUX_STRING_H_ */
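
A usage sketch for the new tools string helpers, assuming the usual tools include path (-I tools/include) and that the corresponding implementations are linked in:

    #include <linux/string.h>	/* tools/include/linux/string.h */
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
    	bool on;
    	char *copy;

    	if (strtobool("y", &on) == 0)	/* accepts 1/0/y/Y/n/N */
    		printf("on=%d\n", on);

    	copy = memdup("perf", 5);	/* 5 bytes: include the NUL */
    	if (copy) {
    		printf("%s\n", copy);
    		free(copy);
    	}
    	return 0;
    }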

+ 0 - 0
tools/perf/util/bitmap.c → tools/lib/bitmap.c


+ 14 - 0
tools/lib/bpf/Makefile

@@ -71,7 +71,21 @@ FEATURE_DISPLAY = libelf bpf
 INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/arch/$(ARCH)/include/uapi -I$(srctree)/include/uapi
 FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES)
 
+check_feat := 1
+NON_CHECK_FEAT_TARGETS := clean TAGS tags cscope help
+ifdef MAKECMDGOALS
+ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),)
+  check_feat := 0
+endif
+endif
+
+ifeq ($(check_feat),1)
+ifeq ($(FEATURES_DUMP),)
 include $(srctree)/tools/build/Makefile.feature
+else
+include $(FEATURES_DUMP)
+endif
+endif
 
 export prefix libdir src obj
 

+ 14 - 0
tools/lib/bpf/bpf.c

@@ -83,3 +83,17 @@ int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns,
 	log_buf[0] = 0;
 	return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 }
+
+int bpf_map_update_elem(int fd, void *key, void *value,
+			u64 flags)
+{
+	union bpf_attr attr;
+
+	bzero(&attr, sizeof(attr));
+	attr.map_fd = fd;
+	attr.key = ptr_to_u64(key);
+	attr.value = ptr_to_u64(value);
+	attr.flags = flags;
+
+	return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+}

+ 2 - 0
tools/lib/bpf/bpf.h

@@ -20,4 +20,6 @@ int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns,
 		     u32 kern_version, char *log_buf,
 		     size_t log_buf_sz);
 
+int bpf_map_update_elem(int fd, void *key, void *value,
+			u64 flags);
 #endif
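
A short sketch of the new wrapper in use, updating one element of an existing map; the fd would come from bpf_create_map() or a loaded object, and flags 0 corresponds to BPF_ANY (create-or-update):

    #include "bpf.h"	/* tools/lib/bpf */

    /* Update slot 0 of an array map. */
    static int set_counter(int map_fd)
    {
    	int key = 0;
    	long long value = 42;

    	return bpf_map_update_elem(map_fd, &key, &value, 0);
    }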

+ 339 - 73
tools/lib/bpf/libbpf.c

@@ -152,29 +152,36 @@ struct bpf_program {
 	} *reloc_desc;
 	int nr_reloc;
 
-	int fd;
+	struct {
+		int nr;
+		int *fds;
+	} instances;
+	bpf_program_prep_t preprocessor;
 
 	struct bpf_object *obj;
 	void *priv;
 	bpf_program_clear_priv_t clear_priv;
 };
 
+struct bpf_map {
+	int fd;
+	char *name;
+	struct bpf_map_def def;
+	void *priv;
+	bpf_map_clear_priv_t clear_priv;
+};
+
 static LIST_HEAD(bpf_objects_list);
 
 struct bpf_object {
 	char license[64];
 	u32 kern_version;
-	void *maps_buf;
-	size_t maps_buf_sz;
 
 	struct bpf_program *programs;
 	size_t nr_programs;
-	int *map_fds;
-	/*
-	 * This field is required because maps_buf will be freed and
-	 * maps_buf_sz will be set to 0 after loaded.
-	 */
-	size_t nr_map_fds;
+	struct bpf_map *maps;
+	size_t nr_maps;
+
 	bool loaded;
 
 	/*
@@ -188,6 +195,7 @@ struct bpf_object {
 		Elf *elf;
 		GElf_Ehdr ehdr;
 		Elf_Data *symbols;
+		size_t strtabidx;
 		struct {
 			GElf_Shdr shdr;
 			Elf_Data *data;
@@ -206,10 +214,25 @@ struct bpf_object {
 
 static void bpf_program__unload(struct bpf_program *prog)
 {
+	int i;
+
 	if (!prog)
 		return;
 
-	zclose(prog->fd);
+	/*
+	 * If the object is opened but the program was never loaded,
+	 * it is possible that prog->instances.nr == -1.
+	 */
+	if (prog->instances.nr > 0) {
+		for (i = 0; i < prog->instances.nr; i++)
+			zclose(prog->instances.fds[i]);
+	} else if (prog->instances.nr != -1) {
+		pr_warning("Internal error: instances.nr is %d\n",
+			   prog->instances.nr);
+	}
+
+	prog->instances.nr = -1;
+	zfree(&prog->instances.fds);
 }
 
 static void bpf_program__exit(struct bpf_program *prog)
@@ -260,7 +283,8 @@ bpf_program__init(void *data, size_t size, char *name, int idx,
 	memcpy(prog->insns, data,
 	       prog->insns_cnt * sizeof(struct bpf_insn));
 	prog->idx = idx;
-	prog->fd = -1;
+	prog->instances.fds = NULL;
+	prog->instances.nr = -1;
 
 	return 0;
 errout:
@@ -469,21 +493,77 @@ static int
 bpf_object__init_maps(struct bpf_object *obj, void *data,
 		      size_t size)
 {
-	if (size == 0) {
+	size_t nr_maps;
+	int i;
+
+	nr_maps = size / sizeof(struct bpf_map_def);
+	if (!data || !nr_maps) {
 		pr_debug("%s doesn't need map definition\n",
 			 obj->path);
 		return 0;
 	}
 
-	obj->maps_buf = malloc(size);
-	if (!obj->maps_buf) {
-		pr_warning("malloc maps failed: %s\n", obj->path);
+	pr_debug("maps in %s: %zd bytes\n", obj->path, size);
+
+	obj->maps = calloc(nr_maps, sizeof(obj->maps[0]));
+	if (!obj->maps) {
+		pr_warning("alloc maps for object failed\n");
 		return -ENOMEM;
 	}
+	obj->nr_maps = nr_maps;
 
-	obj->maps_buf_sz = size;
-	memcpy(obj->maps_buf, data, size);
-	pr_debug("maps in %s: %ld bytes\n", obj->path, (long)size);
+	for (i = 0; i < nr_maps; i++) {
+		struct bpf_map_def *def = &obj->maps[i].def;
+
+		/*
+		 * Fill all fds with -1 so we won't close an incorrect
+		 * fd (fd=0 is stdin) on failure (zclose won't close
+		 * a negative fd).
+		 */
+		obj->maps[i].fd = -1;
+
+		/* Save map definition into obj->maps */
+		*def = ((struct bpf_map_def *)data)[i];
+	}
+	return 0;
+}
+
+static int
+bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx)
+{
+	int i;
+	Elf_Data *symbols = obj->efile.symbols;
+
+	if (!symbols || maps_shndx < 0)
+		return -EINVAL;
+
+	for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
+		GElf_Sym sym;
+		size_t map_idx;
+		const char *map_name;
+
+		if (!gelf_getsym(symbols, i, &sym))
+			continue;
+		if (sym.st_shndx != maps_shndx)
+			continue;
+
+		map_name = elf_strptr(obj->efile.elf,
+				      obj->efile.strtabidx,
+				      sym.st_name);
+		map_idx = sym.st_value / sizeof(struct bpf_map_def);
+		if (map_idx >= obj->nr_maps) {
+			pr_warning("index of map \"%s\" is buggy: %zu > %zu\n",
+				   map_name, map_idx, obj->nr_maps);
+			continue;
+		}
+		obj->maps[map_idx].name = strdup(map_name);
+		if (!obj->maps[map_idx].name) {
+			pr_warning("failed to alloc map name\n");
+			return -ENOMEM;
+		}
+		pr_debug("map %zu is \"%s\"\n", map_idx,
+			 obj->maps[map_idx].name);
+	}
 	return 0;
 }
 
@@ -492,7 +572,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 	Elf *elf = obj->efile.elf;
 	GElf_Ehdr *ep = &obj->efile.ehdr;
 	Elf_Scn *scn = NULL;
-	int idx = 0, err = 0;
+	int idx = 0, err = 0, maps_shndx = -1;
 
 	/* Elf is corrupted/truncated, avoid calling elf_strptr. */
 	if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL)) {
@@ -542,16 +622,19 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 			err = bpf_object__init_kversion(obj,
 							data->d_buf,
 							data->d_size);
-		else if (strcmp(name, "maps") == 0)
+		else if (strcmp(name, "maps") == 0) {
 			err = bpf_object__init_maps(obj, data->d_buf,
 						    data->d_size);
-		else if (sh.sh_type == SHT_SYMTAB) {
+			maps_shndx = idx;
+		} else if (sh.sh_type == SHT_SYMTAB) {
 			if (obj->efile.symbols) {
 				pr_warning("bpf: multiple SYMTAB in %s\n",
 					   obj->path);
 				err = -LIBBPF_ERRNO__FORMAT;
-			} else
+			} else {
 				obj->efile.symbols = data;
+				obj->efile.strtabidx = sh.sh_link;
+			}
 		} else if ((sh.sh_type == SHT_PROGBITS) &&
 			   (sh.sh_flags & SHF_EXECINSTR) &&
 			   (data->d_size > 0)) {
@@ -586,6 +669,13 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 		if (err)
 			goto out;
 	}
+
+	if (!obj->efile.strtabidx || obj->efile.strtabidx >= idx) {
+		pr_warning("Corrupted ELF file: index of strtab invalid\n");
+		return LIBBPF_ERRNO__FORMAT;
+	}
+	if (maps_shndx >= 0)
+		err = bpf_object__init_maps_name(obj, maps_shndx);
 out:
 	return err;
 }
@@ -668,37 +758,15 @@ static int
 bpf_object__create_maps(struct bpf_object *obj)
 {
 	unsigned int i;
-	size_t nr_maps;
-	int *pfd;
-
-	nr_maps = obj->maps_buf_sz / sizeof(struct bpf_map_def);
-	if (!obj->maps_buf || !nr_maps) {
-		pr_debug("don't need create maps for %s\n",
-			 obj->path);
-		return 0;
-	}
-
-	obj->map_fds = malloc(sizeof(int) * nr_maps);
-	if (!obj->map_fds) {
-		pr_warning("realloc perf_bpf_map_fds failed\n");
-		return -ENOMEM;
-	}
-	obj->nr_map_fds = nr_maps;
 
-	/* fill all fd with -1 */
-	memset(obj->map_fds, -1, sizeof(int) * nr_maps);
-
-	pfd = obj->map_fds;
-	for (i = 0; i < nr_maps; i++) {
-		struct bpf_map_def def;
+	for (i = 0; i < obj->nr_maps; i++) {
+		struct bpf_map_def *def = &obj->maps[i].def;
+		int *pfd = &obj->maps[i].fd;
 
-		def = *(struct bpf_map_def *)(obj->maps_buf +
-				i * sizeof(struct bpf_map_def));
-
-		*pfd = bpf_create_map(def.type,
-				      def.key_size,
-				      def.value_size,
-				      def.max_entries);
+		*pfd = bpf_create_map(def->type,
+				      def->key_size,
+				      def->value_size,
+				      def->max_entries);
 		if (*pfd < 0) {
 			size_t j;
 			int err = *pfd;
@@ -706,22 +774,17 @@ bpf_object__create_maps(struct bpf_object *obj)
 			pr_warning("failed to create map: %s\n",
 				   strerror(errno));
 			for (j = 0; j < i; j++)
-				zclose(obj->map_fds[j]);
-			obj->nr_map_fds = 0;
-			zfree(&obj->map_fds);
+				zclose(obj->maps[j].fd);
 			return err;
 		}
 		pr_debug("create map: fd=%d\n", *pfd);
-		pfd++;
 	}
 
-	zfree(&obj->maps_buf);
-	obj->maps_buf_sz = 0;
 	return 0;
 }
 
 static int
-bpf_program__relocate(struct bpf_program *prog, int *map_fds)
+bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj)
 {
 	int i;
 
@@ -741,7 +804,7 @@ bpf_program__relocate(struct bpf_program *prog, int *map_fds)
 			return -LIBBPF_ERRNO__RELOC;
 		}
 		insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
-		insns[insn_idx].imm = map_fds[map_idx];
+		insns[insn_idx].imm = obj->maps[map_idx].fd;
 	}
 
 	zfree(&prog->reloc_desc);
@@ -760,7 +823,7 @@ bpf_object__relocate(struct bpf_object *obj)
 	for (i = 0; i < obj->nr_programs; i++) {
 		prog = &obj->programs[i];
 
-		err = bpf_program__relocate(prog, obj->map_fds);
+		err = bpf_program__relocate(prog, obj);
 		if (err) {
 			pr_warning("failed to relocate '%s'\n",
 				   prog->section_name);
@@ -784,8 +847,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 		Elf_Data *data = obj->efile.reloc[i].data;
 		int idx = shdr->sh_info;
 		struct bpf_program *prog;
-		size_t nr_maps = obj->maps_buf_sz /
-				 sizeof(struct bpf_map_def);
+		size_t nr_maps = obj->nr_maps;
 
 		if (shdr->sh_type != SHT_REL) {
 			pr_warning("internal error at %d\n", __LINE__);
@@ -860,13 +922,73 @@ static int
 bpf_program__load(struct bpf_program *prog,
 		  char *license, u32 kern_version)
 {
-	int err, fd;
+	int err = 0, fd, i;
 
-	err = load_program(prog->insns, prog->insns_cnt,
-			   license, kern_version, &fd);
-	if (!err)
-		prog->fd = fd;
+	if (prog->instances.nr < 0 || !prog->instances.fds) {
+		if (prog->preprocessor) {
+			pr_warning("Internal error: can't load program '%s'\n",
+				   prog->section_name);
+			return -LIBBPF_ERRNO__INTERNAL;
+		}
+
+		prog->instances.fds = malloc(sizeof(int));
+		if (!prog->instances.fds) {
+			pr_warning("Not enough memory for BPF fds\n");
+			return -ENOMEM;
+		}
+		prog->instances.nr = 1;
+		prog->instances.fds[0] = -1;
+	}
+
+	if (!prog->preprocessor) {
+		if (prog->instances.nr != 1) {
+			pr_warning("Program '%s' is inconsistent: nr(%d) != 1\n",
+				   prog->section_name, prog->instances.nr);
+		}
+		err = load_program(prog->insns, prog->insns_cnt,
+				   license, kern_version, &fd);
+		if (!err)
+			prog->instances.fds[0] = fd;
+		goto out;
+	}
+
+	for (i = 0; i < prog->instances.nr; i++) {
+		struct bpf_prog_prep_result result;
+		bpf_program_prep_t preprocessor = prog->preprocessor;
+
+		bzero(&result, sizeof(result));
+		err = preprocessor(prog, i, prog->insns,
+				   prog->insns_cnt, &result);
+		if (err) {
+			pr_warning("Preprocessing the %dth instance of program '%s' failed\n",
+				   i, prog->section_name);
+			goto out;
+		}
+
+		if (!result.new_insn_ptr || !result.new_insn_cnt) {
+			pr_debug("Skip loading the %dth instance of program '%s'\n",
+				 i, prog->section_name);
+			prog->instances.fds[i] = -1;
+			if (result.pfd)
+				*result.pfd = -1;
+			continue;
+		}
+
+		err = load_program(result.new_insn_ptr,
+				   result.new_insn_cnt,
+				   license, kern_version, &fd);
+
+		if (err) {
+			pr_warning("Loading the %dth instance of program '%s' failed\n",
+					i, prog->section_name);
+			goto out;
+		}
 
+		if (result.pfd)
+			*result.pfd = fd;
+		prog->instances.fds[i] = fd;
+	}
+out:
 	if (err)
 		pr_warning("failed to load program '%s'\n",
 			   prog->section_name);
@@ -970,10 +1092,8 @@ int bpf_object__unload(struct bpf_object *obj)
 	if (!obj)
 		return -EINVAL;
 
-	for (i = 0; i < obj->nr_map_fds; i++)
-		zclose(obj->map_fds[i]);
-	zfree(&obj->map_fds);
-	obj->nr_map_fds = 0;
+	for (i = 0; i < obj->nr_maps; i++)
+		zclose(obj->maps[i].fd);
 
 	for (i = 0; i < obj->nr_programs; i++)
 		bpf_program__unload(&obj->programs[i]);
@@ -1016,7 +1136,16 @@ void bpf_object__close(struct bpf_object *obj)
 	bpf_object__elf_finish(obj);
 	bpf_object__unload(obj);
 
-	zfree(&obj->maps_buf);
+	for (i = 0; i < obj->nr_maps; i++) {
+		zfree(&obj->maps[i].name);
+		if (obj->maps[i].clear_priv)
+			obj->maps[i].clear_priv(&obj->maps[i],
+						obj->maps[i].priv);
+		obj->maps[i].priv = NULL;
+		obj->maps[i].clear_priv = NULL;
+	}
+	zfree(&obj->maps);
+	obj->nr_maps = 0;
 
 	if (obj->programs && obj->nr_programs) {
 		for (i = 0; i < obj->nr_programs; i++)
@@ -1121,5 +1250,142 @@ const char *bpf_program__title(struct bpf_program *prog, bool needs_copy)
 
 int bpf_program__fd(struct bpf_program *prog)
 {
-	return prog->fd;
+	return bpf_program__nth_fd(prog, 0);
+}
+
+int bpf_program__set_prep(struct bpf_program *prog, int nr_instances,
+			  bpf_program_prep_t prep)
+{
+	int *instances_fds;
+
+	if (nr_instances <= 0 || !prep)
+		return -EINVAL;
+
+	if (prog->instances.nr > 0 || prog->instances.fds) {
+		pr_warning("Can't set pre-processor after loading\n");
+		return -EINVAL;
+	}
+
+	instances_fds = malloc(sizeof(int) * nr_instances);
+	if (!instances_fds) {
+		pr_warning("alloc memory failed for fds\n");
+		return -ENOMEM;
+	}
+
+	/* fill all fd with -1 */
+	memset(instances_fds, -1, sizeof(int) * nr_instances);
+
+	prog->instances.nr = nr_instances;
+	prog->instances.fds = instances_fds;
+	prog->preprocessor = prep;
+	return 0;
+}
+
+int bpf_program__nth_fd(struct bpf_program *prog, int n)
+{
+	int fd;
+
+	if (n >= prog->instances.nr || n < 0) {
+		pr_warning("Can't get the %dth fd from program %s: only %d instances\n",
+			   n, prog->section_name, prog->instances.nr);
+		return -EINVAL;
+	}
+
+	fd = prog->instances.fds[n];
+	if (fd < 0) {
+		pr_warning("%dth instance of program '%s' is invalid\n",
+			   n, prog->section_name);
+		return -ENOENT;
+	}
+
+	return fd;
+}
+
+int bpf_map__get_fd(struct bpf_map *map)
+{
+	if (!map)
+		return -EINVAL;
+
+	return map->fd;
+}
+
+int bpf_map__get_def(struct bpf_map *map, struct bpf_map_def *pdef)
+{
+	if (!map || !pdef)
+		return -EINVAL;
+
+	*pdef = map->def;
+	return 0;
+}
+
+const char *bpf_map__get_name(struct bpf_map *map)
+{
+	if (!map)
+		return NULL;
+	return map->name;
+}
+
+int bpf_map__set_private(struct bpf_map *map, void *priv,
+			 bpf_map_clear_priv_t clear_priv)
+{
+	if (!map)
+		return -EINVAL;
+
+	if (map->priv) {
+		if (map->clear_priv)
+			map->clear_priv(map, map->priv);
+	}
+
+	map->priv = priv;
+	map->clear_priv = clear_priv;
+	return 0;
+}
+
+int bpf_map__get_private(struct bpf_map *map, void **ppriv)
+{
+	if (!map)
+		return -EINVAL;
+
+	if (ppriv)
+		*ppriv = map->priv;
+	return 0;
+}
+
+struct bpf_map *
+bpf_map__next(struct bpf_map *prev, struct bpf_object *obj)
+{
+	size_t idx;
+	struct bpf_map *s, *e;
+
+	if (!obj || !obj->maps)
+		return NULL;
+
+	s = obj->maps;
+	e = obj->maps + obj->nr_maps;
+
+	if (prev == NULL)
+		return s;
+
+	if ((prev < s) || (prev >= e)) {
+		pr_warning("error in %s: map handler doesn't belong to object\n",
+			   __func__);
+		return NULL;
+	}
+
+	idx = (prev - obj->maps) + 1;
+	if (idx >= obj->nr_maps)
+		return NULL;
+	return &obj->maps[idx];
+}
+
+struct bpf_map *
+bpf_object__get_map_by_name(struct bpf_object *obj, const char *name)
+{
+	struct bpf_map *pos;
+
+	bpf_map__for_each(pos, obj) {
+		if (pos->name && !strcmp(pos->name, name))
+			return pos;
+	}
+	return NULL;
 }
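
With maps now first-class objects, callers can enumerate them by name and definition instead of juggling a bare fd array. A sketch built only from the accessors added in this hunk:

    #include "libbpf.h"	/* tools/lib/bpf */
    #include <stdio.h>

    static void dump_maps(struct bpf_object *obj)
    {
    	struct bpf_map *map;
    	struct bpf_map_def def;

    	bpf_map__for_each(map, obj) {
    		if (bpf_map__get_def(map, &def))
    			continue;
    		printf("map %s: fd=%d key=%u value=%u max=%u\n",
    		       bpf_map__get_name(map), bpf_map__get_fd(map),
    		       def.key_size, def.value_size, def.max_entries);
    	}
    }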

+ 88 - 0
tools/lib/bpf/libbpf.h

@@ -88,6 +88,70 @@ const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
 
 int bpf_program__fd(struct bpf_program *prog);
 
+struct bpf_insn;
+
+/*
+ * Libbpf allows callers to adjust BPF programs before being loaded
+ * into the kernel. One program in an object file can be transformed
+ * into multiple variants to be attached to different code.
+ *
+ * bpf_program_prep_t, bpf_program__set_prep and bpf_program__nth_fd
+ * are APIs for this purpose.
+ *
+ * - bpf_program_prep_t:
+ *   It defines a 'preprocessor', which is a caller-defined function
+ *   passed to libbpf through bpf_program__set_prep(), and will be
+ *   called before the program is loaded. The preprocessor should
+ *   adjust the program once for each instance, according to the
+ *   index passed to it.
+ *
+ * - bpf_program__set_prep:
+ *   Attaches a preprocessor to a BPF program. The number of instances
+ *   that should be created is also passed through this function.
+ *
+ * - bpf_program__nth_fd:
+ *   After the program is loaded, get the resulting fd from the BPF
+ *   program for each instance.
+ *
+ * If bpf_program__set_prep() is not used, the program would be loaded
+ * without adjustment during bpf_object__load(). The program has only
+ * one instance. In this case bpf_program__fd(prog) is equal to
+ * bpf_program__nth_fd(prog, 0).
+ */
+
+struct bpf_prog_prep_result {
+	/*
+	 * If not NULL, load new instruction array.
+	 * If set to NULL, don't load this instance.
+	 */
+	struct bpf_insn *new_insn_ptr;
+	int new_insn_cnt;
+
+	/* If not NULL, result fd is set to it */
+	int *pfd;
+};
+
+/*
+ * Parameters of bpf_program_prep_t:
+ *  - prog:	The bpf_program being loaded.
+ *  - n:	Index of instance being generated.
+ *  - insns:	BPF instructions array.
+ *  - insns_cnt: Number of instructions in insns.
+ *  - res:	Output parameter, result of transformation.
+ *
+ * Return value:
+ *  - Zero: pre-processing succeeded.
+ *  - Non-zero: pre-processing failed, stop loading.
+ */
+typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n,
+				  struct bpf_insn *insns, int insns_cnt,
+				  struct bpf_prog_prep_result *res);
+
+int bpf_program__set_prep(struct bpf_program *prog, int nr_instance,
+			  bpf_program_prep_t prep);
+
+int bpf_program__nth_fd(struct bpf_program *prog, int n);
+
 /*
  * We don't need __attribute__((packed)) now since it is
  * unnecessary for 'bpf_map_def' because they are all aligned.
@@ -101,4 +165,28 @@ struct bpf_map_def {
 	unsigned int max_entries;
 };
 
+/*
+ * There is another 'struct bpf_map' in include/linux/bpf.h. However,
+ * it is not a uapi header, so there is no need to worry about a name clash.
+ */
+struct bpf_map;
+struct bpf_map *
+bpf_object__get_map_by_name(struct bpf_object *obj, const char *name);
+
+struct bpf_map *
+bpf_map__next(struct bpf_map *map, struct bpf_object *obj);
+#define bpf_map__for_each(pos, obj)		\
+	for ((pos) = bpf_map__next(NULL, (obj));	\
+	     (pos) != NULL;				\
+	     (pos) = bpf_map__next((pos), (obj)))
+
+int bpf_map__get_fd(struct bpf_map *map);
+int bpf_map__get_def(struct bpf_map *map, struct bpf_map_def *pdef);
+const char *bpf_map__get_name(struct bpf_map *map);
+
+typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
+int bpf_map__set_private(struct bpf_map *map, void *priv,
+			 bpf_map_clear_priv_t clear_priv);
+int bpf_map__get_private(struct bpf_map *map, void **ppriv);
+
 #endif
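
To make the preprocessor flow concrete, here is a hedged sketch of a
caller that loads one program as two instances; the scratch buffer, the
MAX_INSNS bound and the actual per-instance patching are assumptions of
this sketch, not something the API prescribes:

	#include <string.h>
	#include <linux/bpf.h>		/* struct bpf_insn */
	#include "bpf/libbpf.h"

	#define MAX_INSNS 4096		/* assumed bound for the sketch */

	static struct bpf_insn insns_buf[MAX_INSNS];

	/* Called once per instance 'n': hand back a private copy. */
	static int prep(struct bpf_program *prog, int n,
			struct bpf_insn *insns, int insns_cnt,
			struct bpf_prog_prep_result *res)
	{
		memcpy(insns_buf, insns, sizeof(insns[0]) * insns_cnt);
		/* ... patch insns_buf differently for each 'n' ... */
		res->new_insn_ptr = insns_buf;
		res->new_insn_cnt = insns_cnt;
		res->pfd = NULL;	/* no extra place to store the fd */
		return 0;
	}

With bpf_program__set_prep(prog, 2, prep) called before
bpf_object__load(obj), bpf_program__nth_fd(prog, 0) and
bpf_program__nth_fd(prog, 1) return the fds of the two loaded variants.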

+ 84 - 0
tools/lib/find_bit.c

@@ -0,0 +1,84 @@
+/* bit search implementation
+ *
+ * Copied from lib/find_bit.c to tools/lib/find_bit.c
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
+ * (Inspired by David Howell's find_next_bit implementation)
+ *
+ * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
+ * size and improve performance, 2015.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/bitops.h>
+#include <linux/bitmap.h>
+#include <linux/kernel.h>
+
+#if !defined(find_next_bit)
+
+/*
+ * This is a common helper function for find_next_bit and
+ * find_next_zero_bit.  The difference is the "invert" argument, which
+ * is XORed with each fetched word before searching it for one bits.
+ */
+static unsigned long _find_next_bit(const unsigned long *addr,
+		unsigned long nbits, unsigned long start, unsigned long invert)
+{
+	unsigned long tmp;
+
+	if (!nbits || start >= nbits)
+		return nbits;
+
+	tmp = addr[start / BITS_PER_LONG] ^ invert;
+
+	/* Handle 1st word. */
+	tmp &= BITMAP_FIRST_WORD_MASK(start);
+	start = round_down(start, BITS_PER_LONG);
+
+	while (!tmp) {
+		start += BITS_PER_LONG;
+		if (start >= nbits)
+			return nbits;
+
+		tmp = addr[start / BITS_PER_LONG] ^ invert;
+	}
+
+	return min(start + __ffs(tmp), nbits);
+}
+#endif
+
+#ifndef find_next_bit
+/*
+ * Find the next set bit in a memory region.
+ */
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+			    unsigned long offset)
+{
+	return _find_next_bit(addr, size, offset, 0UL);
+}
+#endif
+
+#ifndef find_first_bit
+/*
+ * Find the first set bit in a memory region.
+ */
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+	unsigned long idx;
+
+	for (idx = 0; idx * BITS_PER_LONG < size; idx++) {
+		if (addr[idx])
+			return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
+	}
+
+	return size;
+}
+#endif
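
Both helpers return the index of the bit they find, or 'size' when no
bit qualifies, so the usual scan loop needs no special casing. A small
sketch of the pattern (equivalent to the kernel's for_each_set_bit()):

	#include <linux/bitops.h>

	/* Invoke 'cb' for every set bit in 'bitmap' of 'nbits' bits. */
	static void visit_set_bits(const unsigned long *bitmap,
				   unsigned long nbits,
				   void (*cb)(unsigned long bit))
	{
		unsigned long bit;

		for (bit = find_first_bit(bitmap, nbits);
		     bit < nbits;
		     bit = find_next_bit(bitmap, nbits, bit + 1))
			cb(bit);
	}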

+ 89 - 0
tools/lib/string.c

@@ -0,0 +1,89 @@
+/*
+ *  linux/tools/lib/string.c
+ *
+ *  Copied from linux/lib/string.c, where it is:
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  More specifically, the first copied function was strtobool, which
+ *  was introduced by:
+ *
+ *  d0f1fed29e6e ("Add a strtobool function matching semantics of existing in kernel equivalents")
+ *  Author: Jonathan Cameron <jic23@cam.ac.uk>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <linux/string.h>
+#include <linux/compiler.h>
+
+/**
+ * memdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ */
+void *memdup(const void *src, size_t len)
+{
+	void *p = malloc(len);
+
+	if (p)
+		memcpy(p, src, len);
+
+	return p;
+}
+
+/**
+ * strtobool - convert common user inputs into boolean values
+ * @s: input string
+ * @res: result
+ *
+ * This routine returns 0 iff the first character is one of 'Yy1Nn0'.
+ * Otherwise it will return -EINVAL.  Value pointed to by res is
+ * updated upon finding a match.
+ */
+int strtobool(const char *s, bool *res)
+{
+	switch (s[0]) {
+	case 'y':
+	case 'Y':
+	case '1':
+		*res = true;
+		break;
+	case 'n':
+	case 'N':
+	case '0':
+		*res = false;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * strlcpy - Copy a C-string into a sized buffer
+ * @dest: Where to copy the string to
+ * @src: Where to copy the string from
+ * @size: size of destination buffer
+ *
+ * Compatible with *BSD: the result is always a valid
+ * NUL-terminated string that fits in the buffer (unless,
+ * of course, the buffer size is zero). It does not pad
+ * out the result like strncpy() does.
+ *
+ * If libc has strlcpy() then that version will override this
+ * implementation:
+ */
+size_t __weak strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t ret = strlen(src);
+
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+		memcpy(dest, src, len);
+		dest[len] = '\0';
+	}
+	return ret;
+}
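
Since strtobool() updates '*res' only on a match, a caller can fall
back to a default on unrecognized input. A hedged sketch (parse_flag()
is an invented helper, not part of the patch):

	#include <stdbool.h>
	#include <linux/string.h>

	static bool parse_flag(const char *value, bool def)
	{
		bool res;

		if (strtobool(value, &res))	/* -EINVAL on bad input */
			return def;
		return res;
	}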

+ 7 - 0
tools/lib/subcmd/Build

@@ -0,0 +1,7 @@
+libsubcmd-y += exec-cmd.o
+libsubcmd-y += help.o
+libsubcmd-y += pager.o
+libsubcmd-y += parse-options.o
+libsubcmd-y += run-command.o
+libsubcmd-y += sigchain.o
+libsubcmd-y += subcmd-config.o

+ 48 - 0
tools/lib/subcmd/Makefile

@@ -0,0 +1,48 @@
+include ../../scripts/Makefile.include
+include ../../perf/config/utilities.mak		# QUIET_CLEAN
+
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(shell pwd)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
+endif
+
+CC = $(CROSS_COMPILE)gcc
+AR = $(CROSS_COMPILE)ar
+RM = rm -f
+
+MAKEFLAGS += --no-print-directory
+
+LIBFILE = $(OUTPUT)libsubcmd.a
+
+CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
+CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC
+CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
+
+CFLAGS += -I$(srctree)/tools/include/
+CFLAGS += -I$(srctree)/include/uapi
+CFLAGS += -I$(srctree)/include
+
+SUBCMD_IN := $(OUTPUT)libsubcmd-in.o
+
+all:
+
+export srctree OUTPUT CC LD CFLAGS V
+include $(srctree)/tools/build/Makefile.include
+
+all: fixdep $(LIBFILE)
+
+$(SUBCMD_IN): FORCE
+	@$(MAKE) $(build)=libsubcmd
+
+$(LIBFILE): $(SUBCMD_IN)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(SUBCMD_IN)
+
+clean:
+	$(call QUIET_CLEAN, libsubcmd) $(RM) $(LIBFILE); \
+	find $(if $(OUTPUT),$(OUTPUT),.) -name \*.o -or -name \*.o.cmd -or -name \*.o.d | xargs $(RM)
+
+FORCE:
+
+.PHONY: clean FORCE

+ 209 - 0
tools/lib/subcmd/exec-cmd.c

@@ -0,0 +1,209 @@
+#include <linux/compiler.h>
+#include <linux/string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "subcmd-util.h"
+#include "exec-cmd.h"
+#include "subcmd-config.h"
+
+#define MAX_ARGS	32
+#define PATH_MAX	4096
+
+static const char *argv_exec_path;
+static const char *argv0_path;
+
+void exec_cmd_init(const char *exec_name, const char *prefix,
+		   const char *exec_path, const char *exec_path_env)
+{
+	subcmd_config.exec_name		= exec_name;
+	subcmd_config.prefix		= prefix;
+	subcmd_config.exec_path		= exec_path;
+	subcmd_config.exec_path_env	= exec_path_env;
+}
+
+#define is_dir_sep(c) ((c) == '/')
+
+static int is_absolute_path(const char *path)
+{
+	return path[0] == '/';
+}
+
+static const char *get_pwd_cwd(void)
+{
+	static char cwd[PATH_MAX + 1];
+	char *pwd;
+	struct stat cwd_stat, pwd_stat;
+	if (getcwd(cwd, PATH_MAX) == NULL)
+		return NULL;
+	pwd = getenv("PWD");
+	if (pwd && strcmp(pwd, cwd)) {
+		stat(cwd, &cwd_stat);
+		if (!stat(pwd, &pwd_stat) &&
+		    pwd_stat.st_dev == cwd_stat.st_dev &&
+		    pwd_stat.st_ino == cwd_stat.st_ino) {
+			strlcpy(cwd, pwd, PATH_MAX);
+		}
+	}
+	return cwd;
+}
+
+static const char *make_nonrelative_path(const char *path)
+{
+	static char buf[PATH_MAX + 1];
+
+	if (is_absolute_path(path)) {
+		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	} else {
+		const char *cwd = get_pwd_cwd();
+		if (!cwd)
+			die("Cannot determine the current working directory");
+		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	}
+	return buf;
+}
+
+char *system_path(const char *path)
+{
+	char *buf = NULL;
+
+	if (is_absolute_path(path))
+		return strdup(path);
+
+	astrcatf(&buf, "%s/%s", subcmd_config.prefix, path);
+
+	return buf;
+}
+
+const char *extract_argv0_path(const char *argv0)
+{
+	const char *slash;
+
+	if (!argv0 || !*argv0)
+		return NULL;
+	slash = argv0 + strlen(argv0);
+
+	while (argv0 <= slash && !is_dir_sep(*slash))
+		slash--;
+
+	if (slash >= argv0) {
+		argv0_path = strndup(argv0, slash - argv0);
+		return argv0_path ? slash + 1 : NULL;
+	}
+
+	return argv0;
+}
+
+void set_argv_exec_path(const char *exec_path)
+{
+	argv_exec_path = exec_path;
+	/*
+	 * Propagate this setting to external programs.
+	 */
+	setenv(subcmd_config.exec_path_env, exec_path, 1);
+}
+
+
+/* Returns the highest-priority location to look for subprograms. */
+char *get_argv_exec_path(void)
+{
+	char *env;
+
+	if (argv_exec_path)
+		return strdup(argv_exec_path);
+
+	env = getenv(subcmd_config.exec_path_env);
+	if (env && *env)
+		return strdup(env);
+
+	return system_path(subcmd_config.exec_path);
+}
+
+static void add_path(char **out, const char *path)
+{
+	if (path && *path) {
+		if (is_absolute_path(path))
+			astrcat(out, path);
+		else
+			astrcat(out, make_nonrelative_path(path));
+
+		astrcat(out, ":");
+	}
+}
+
+void setup_path(void)
+{
+	const char *old_path = getenv("PATH");
+	char *new_path = NULL;
+	char *tmp = get_argv_exec_path();
+
+	add_path(&new_path, tmp);
+	add_path(&new_path, argv0_path);
+	free(tmp);
+
+	if (old_path)
+		astrcat(&new_path, old_path);
+	else
+		astrcat(&new_path, "/usr/local/bin:/usr/bin:/bin");
+
+	setenv("PATH", new_path, 1);
+
+	free(new_path);
+}
+
+static const char **prepare_exec_cmd(const char **argv)
+{
+	int argc;
+	const char **nargv;
+
+	for (argc = 0; argv[argc]; argc++)
+		; /* just counting */
+	nargv = malloc(sizeof(*nargv) * (argc + 2));
+
+	nargv[0] = subcmd_config.exec_name;
+	for (argc = 0; argv[argc]; argc++)
+		nargv[argc + 1] = argv[argc];
+	nargv[argc + 1] = NULL;
+	return nargv;
+}
+
+int execv_cmd(const char **argv) {
+	const char **nargv = prepare_exec_cmd(argv);
+
+	/* execvp() can only ever return if it fails */
+	execvp(subcmd_config.exec_name, (char **)nargv);
+
+	free(nargv);
+	return -1;
+}
+
+
+int execl_cmd(const char *cmd,...)
+{
+	int argc;
+	const char *argv[MAX_ARGS + 1];
+	const char *arg;
+	va_list param;
+
+	va_start(param, cmd);
+	argv[0] = cmd;
+	argc = 1;
+	while (argc < MAX_ARGS) {
+		arg = argv[argc++] = va_arg(param, char *);
+		if (!arg)
+			break;
+	}
+	va_end(param);
+	if (MAX_ARGS <= argc) {
+		fprintf(stderr, " Error: too many args to run %s\n", cmd);
+		return -1;
+	}
+
+	argv[argc] = NULL;
+	return execv_cmd(argv);
+}

+ 16 - 0
tools/lib/subcmd/exec-cmd.h

@@ -0,0 +1,16 @@
+#ifndef __SUBCMD_EXEC_CMD_H
+#define __SUBCMD_EXEC_CMD_H
+
+extern void exec_cmd_init(const char *exec_name, const char *prefix,
+			  const char *exec_path, const char *exec_path_env);
+
+extern void set_argv_exec_path(const char *exec_path);
+extern const char *extract_argv0_path(const char *path);
+extern void setup_path(void);
+extern int execv_cmd(const char **argv); /* NULL terminated */
+extern int execl_cmd(const char *cmd, ...);
+/* get_argv_exec_path and system_path return malloc'd string, caller must free it */
+extern char *get_argv_exec_path(void);
+extern char *system_path(const char *path);
+
+#endif /* __SUBCMD_EXEC_CMD_H */
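
Because the library is no longer perf-specific, a tool must describe
itself before using any of these helpers. A hedged initialization
sketch for a hypothetical tool 'foo' (all four strings are invented;
perf passes its own values):

	#include <stdio.h>
	#include <stdlib.h>
	#include "subcmd/exec-cmd.h"

	int main(int argc, const char **argv)
	{
		char *path;

		exec_cmd_init("foo", "/usr/local", "libexec/foo-core",
			      "FOO_EXEC_PATH");
		extract_argv0_path(argv[0]);
		setup_path();	/* prepend exec path and argv0 dir to $PATH */

		path = get_argv_exec_path();
		printf("sub-commands are looked up in %s\n", path);
		free(path);	/* returned string is malloc'd */
		return 0;
	}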

+ 54 - 125
tools/perf/util/help.c → tools/lib/subcmd/help.c

@@ -1,9 +1,15 @@
-#include "cache.h"
-#include "../builtin.h"
-#include "exec_cmd.h"
-#include "levenshtein.h"
-#include "help.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include <termios.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+#include "subcmd-util.h"
+#include "help.h"
+#include "exec-cmd.h"
 
 void add_cmdname(struct cmdnames *cmds, const char *name, size_t len)
 {
@@ -17,7 +23,7 @@ void add_cmdname(struct cmdnames *cmds, const char *name, size_t len)
 	cmds->names[cmds->cnt++] = ent;
 }
 
-static void clean_cmdnames(struct cmdnames *cmds)
+void clean_cmdnames(struct cmdnames *cmds)
 {
 	unsigned int i;
 
@@ -28,14 +34,14 @@ static void clean_cmdnames(struct cmdnames *cmds)
 	cmds->alloc = 0;
 }
 
-static int cmdname_compare(const void *a_, const void *b_)
+int cmdname_compare(const void *a_, const void *b_)
 {
 	struct cmdname *a = *(struct cmdname **)a_;
 	struct cmdname *b = *(struct cmdname **)b_;
 	return strcmp(a->name, b->name);
 }
 
-static void uniq(struct cmdnames *cmds)
+void uniq(struct cmdnames *cmds)
 {
 	unsigned int i, j;
 
@@ -71,6 +77,28 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
 	cmds->cnt = cj;
 }
 
+static void get_term_dimensions(struct winsize *ws)
+{
+	char *s = getenv("LINES");
+
+	if (s != NULL) {
+		ws->ws_row = atoi(s);
+		s = getenv("COLUMNS");
+		if (s != NULL) {
+			ws->ws_col = atoi(s);
+			if (ws->ws_row && ws->ws_col)
+				return;
+		}
+	}
+#ifdef TIOCGWINSZ
+	if (ioctl(1, TIOCGWINSZ, ws) == 0 &&
+	    ws->ws_row && ws->ws_col)
+		return;
+#endif
+	ws->ws_row = 25;
+	ws->ws_col = 80;
+}
+
 static void pretty_print_string_list(struct cmdnames *cmds, int longest)
 {
 	int cols = 1, rows;
@@ -114,6 +142,14 @@ static int is_executable(const char *name)
 	return st.st_mode & S_IXUSR;
 }
 
+static int has_extension(const char *filename, const char *ext)
+{
+	size_t len = strlen(filename);
+	size_t extlen = strlen(ext);
+
+	return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
+}
+
 static void list_commands_in_dir(struct cmdnames *cmds,
 					 const char *path,
 					 const char *prefix)
@@ -121,8 +157,7 @@ static void list_commands_in_dir(struct cmdnames *cmds,
 	int prefix_len;
 	DIR *dir = opendir(path);
 	struct dirent *de;
-	struct strbuf buf = STRBUF_INIT;
-	int len;
+	char *buf = NULL;
 
 	if (!dir)
 		return;
@@ -130,8 +165,7 @@ static void list_commands_in_dir(struct cmdnames *cmds,
 		prefix = "perf-";
 	prefix_len = strlen(prefix);
 
-	strbuf_addf(&buf, "%s/", path);
-	len = buf.len;
+	astrcatf(&buf, "%s/", path);
 
 	while ((de = readdir(dir)) != NULL) {
 		int entlen;
@@ -139,9 +173,8 @@ static void list_commands_in_dir(struct cmdnames *cmds,
 		if (prefixcmp(de->d_name, prefix))
 			continue;
 
-		strbuf_setlen(&buf, len);
-		strbuf_addstr(&buf, de->d_name);
-		if (!is_executable(buf.buf))
+		astrcat(&buf, de->d_name);
+		if (!is_executable(buf))
 			continue;
 
 		entlen = strlen(de->d_name) - prefix_len;
@@ -151,7 +184,7 @@ static void list_commands_in_dir(struct cmdnames *cmds,
 		add_cmdname(cmds, de->d_name + prefix_len, entlen);
 	}
 	closedir(dir);
-	strbuf_release(&buf);
+	free(buf);
 }
 
 void load_command_list(const char *prefix,
@@ -159,7 +192,7 @@ void load_command_list(const char *prefix,
 		struct cmdnames *other_cmds)
 {
 	const char *env_path = getenv("PATH");
-	const char *exec_path = perf_exec_path();
+	char *exec_path = get_argv_exec_path();
 
 	if (exec_path) {
 		list_commands_in_dir(main_cmds, exec_path, prefix);
@@ -172,7 +205,7 @@ void load_command_list(const char *prefix,
 		char *paths, *path, *colon;
 		path = paths = strdup(env_path);
 		while (1) {
-			if ((colon = strchr(path, PATH_SEP)))
+			if ((colon = strchr(path, ':')))
 				*colon = 0;
 			if (!exec_path || strcmp(path, exec_path))
 				list_commands_in_dir(other_cmds, path, prefix);
@@ -187,6 +220,7 @@ void load_command_list(const char *prefix,
 		      sizeof(*other_cmds->names), cmdname_compare);
 		uniq(other_cmds);
 	}
+	free(exec_path);
 	exclude_cmds(other_cmds, main_cmds);
 }
 
@@ -203,13 +237,14 @@ void list_commands(const char *title, struct cmdnames *main_cmds,
 			longest = other_cmds->names[i]->len;
 
 	if (main_cmds->cnt) {
-		const char *exec_path = perf_exec_path();
+		char *exec_path = get_argv_exec_path();
 		printf("available %s in '%s'\n", title, exec_path);
 		printf("----------------");
 		mput_char('-', strlen(title) + strlen(exec_path));
 		putchar('\n');
 		pretty_print_string_list(main_cmds, longest);
 		putchar('\n');
+		free(exec_path);
 	}
 
 	if (other_cmds->cnt) {
@@ -231,109 +266,3 @@ int is_in_cmdlist(struct cmdnames *c, const char *s)
 			return 1;
 	return 0;
 }
-
-static int autocorrect;
-static struct cmdnames aliases;
-
-static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
-{
-	if (!strcmp(var, "help.autocorrect"))
-		autocorrect = perf_config_int(var,value);
-	/* Also use aliases for command lookup */
-	if (!prefixcmp(var, "alias."))
-		add_cmdname(&aliases, var + 6, strlen(var + 6));
-
-	return perf_default_config(var, value, cb);
-}
-
-static int levenshtein_compare(const void *p1, const void *p2)
-{
-	const struct cmdname *const *c1 = p1, *const *c2 = p2;
-	const char *s1 = (*c1)->name, *s2 = (*c2)->name;
-	int l1 = (*c1)->len;
-	int l2 = (*c2)->len;
-	return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
-}
-
-static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
-{
-	unsigned int i;
-
-	ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
-
-	for (i = 0; i < old->cnt; i++)
-		cmds->names[cmds->cnt++] = old->names[i];
-	zfree(&old->names);
-	old->cnt = 0;
-}
-
-const char *help_unknown_cmd(const char *cmd)
-{
-	unsigned int i, n = 0, best_similarity = 0;
-	struct cmdnames main_cmds, other_cmds;
-
-	memset(&main_cmds, 0, sizeof(main_cmds));
-	memset(&other_cmds, 0, sizeof(main_cmds));
-	memset(&aliases, 0, sizeof(aliases));
-
-	perf_config(perf_unknown_cmd_config, NULL);
-
-	load_command_list("perf-", &main_cmds, &other_cmds);
-
-	add_cmd_list(&main_cmds, &aliases);
-	add_cmd_list(&main_cmds, &other_cmds);
-	qsort(main_cmds.names, main_cmds.cnt,
-	      sizeof(main_cmds.names), cmdname_compare);
-	uniq(&main_cmds);
-
-	if (main_cmds.cnt) {
-		/* This reuses cmdname->len for similarity index */
-		for (i = 0; i < main_cmds.cnt; ++i)
-			main_cmds.names[i]->len =
-				levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
-
-		qsort(main_cmds.names, main_cmds.cnt,
-		      sizeof(*main_cmds.names), levenshtein_compare);
-
-		best_similarity = main_cmds.names[0]->len;
-		n = 1;
-		while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
-			++n;
-	}
-
-	if (autocorrect && n == 1) {
-		const char *assumed = main_cmds.names[0]->name;
-
-		main_cmds.names[0] = NULL;
-		clean_cmdnames(&main_cmds);
-		fprintf(stderr, "WARNING: You called a perf program named '%s', "
-			"which does not exist.\n"
-			"Continuing under the assumption that you meant '%s'\n",
-			cmd, assumed);
-		if (autocorrect > 0) {
-			fprintf(stderr, "in %0.1f seconds automatically...\n",
-				(float)autocorrect/10.0);
-			poll(NULL, 0, autocorrect * 100);
-		}
-		return assumed;
-	}
-
-	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
-
-	if (main_cmds.cnt && best_similarity < 6) {
-		fprintf(stderr, "\nDid you mean %s?\n",
-			n < 2 ? "this": "one of these");
-
-		for (i = 0; i < n; i++)
-			fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
-	}
-
-	exit(1);
-}
-
-int cmd_version(int argc __maybe_unused, const char **argv __maybe_unused,
-		const char *prefix __maybe_unused)
-{
-	printf("perf version %s\n", perf_version_string);
-	return 0;
-}

+ 9 - 4
tools/perf/util/help.h → tools/lib/subcmd/help.h

@@ -1,12 +1,14 @@
-#ifndef __PERF_HELP_H
-#define __PERF_HELP_H
+#ifndef __SUBCMD_HELP_H
+#define __SUBCMD_HELP_H
+
+#include <sys/types.h>
 
 struct cmdnames {
 	size_t alloc;
 	size_t cnt;
 	struct cmdname {
 		size_t len; /* also used for similarity index in help.c */
-		char name[FLEX_ARRAY];
+		char name[];
 	} **names;
 };
 
@@ -20,10 +22,13 @@ void load_command_list(const char *prefix,
 		struct cmdnames *main_cmds,
 		struct cmdnames *other_cmds);
 void add_cmdname(struct cmdnames *cmds, const char *name, size_t len);
+void clean_cmdnames(struct cmdnames *cmds);
+int cmdname_compare(const void *a, const void *b);
+void uniq(struct cmdnames *cmds);
 /* Here we require that excludes is a sorted list. */
 void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
 int is_in_cmdlist(struct cmdnames *c, const char *s);
 void list_commands(const char *title, struct cmdnames *main_cmds,
 		   struct cmdnames *other_cmds);
 
-#endif /* __PERF_HELP_H */
+#endif /* __SUBCMD_HELP_H */

+ 14 - 9
tools/perf/util/pager.c → tools/lib/subcmd/pager.c

@@ -1,6 +1,12 @@
-#include "cache.h"
+#include <sys/select.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include "pager.h"
 #include "run-command.h"
 #include "sigchain.h"
+#include "subcmd-config.h"
 
 /*
  * This is split up from the rest of git so that we can do
@@ -9,6 +15,11 @@
 
 static int spawned_pager;
 
+void pager_init(const char *pager_env)
+{
+	subcmd_config.pager_env = pager_env;
+}
+
 static void pager_preexec(void)
 {
 	/*
@@ -46,7 +57,7 @@ static void wait_for_pager_signal(int signo)
 
 void setup_pager(void)
 {
-	const char *pager = getenv("PERF_PAGER");
+	const char *pager = getenv(subcmd_config.pager_env);
 
 	if (!isatty(1))
 		return;
@@ -85,11 +96,5 @@ void setup_pager(void)
 
 int pager_in_use(void)
 {
-	const char *env;
-
-	if (spawned_pager)
-		return 1;
-
-	env = getenv("PERF_PAGER_IN_USE");
-	return env ? perf_config_bool("PERF_PAGER_IN_USE", env) : 0;
+	return spawned_pager;
 }

+ 9 - 0
tools/lib/subcmd/pager.h

@@ -0,0 +1,9 @@
+#ifndef __SUBCMD_PAGER_H
+#define __SUBCMD_PAGER_H
+
+extern void pager_init(const char *pager_env);
+
+extern void setup_pager(void);
+extern int pager_in_use(void);
+
+#endif /* __SUBCMD_PAGER_H */

+ 183 - 67
tools/perf/util/parse-options.c → tools/lib/subcmd/parse-options.c

@@ -1,37 +1,66 @@
-#include "util.h"
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <ctype.h>
+#include "subcmd-util.h"
 #include "parse-options.h"
-#include "cache.h"
-#include "header.h"
-#include <linux/string.h>
+#include "subcmd-config.h"
+#include "pager.h"
 
 #define OPT_SHORT 1
 #define OPT_UNSET 2
 
-static struct strbuf error_buf = STRBUF_INIT;
+char *error_buf;
 
 static int opterror(const struct option *opt, const char *reason, int flags)
 {
 	if (flags & OPT_SHORT)
-		return error("switch `%c' %s", opt->short_name, reason);
-	if (flags & OPT_UNSET)
-		return error("option `no-%s' %s", opt->long_name, reason);
-	return error("option `%s' %s", opt->long_name, reason);
+		fprintf(stderr, " Error: switch `%c' %s", opt->short_name, reason);
+	else if (flags & OPT_UNSET)
+		fprintf(stderr, " Error: option `no-%s' %s", opt->long_name, reason);
+	else
+		fprintf(stderr, " Error: option `%s' %s", opt->long_name, reason);
+
+	return -1;
+}
+
+static const char *skip_prefix(const char *str, const char *prefix)
+{
+	size_t len = strlen(prefix);
+	return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+static void optwarning(const struct option *opt, const char *reason, int flags)
+{
+	if (flags & OPT_SHORT)
+		fprintf(stderr, " Warning: switch `%c' %s", opt->short_name, reason);
+	else if (flags & OPT_UNSET)
+		fprintf(stderr, " Warning: option `no-%s' %s", opt->long_name, reason);
+	else
+		fprintf(stderr, " Warning: option `%s' %s", opt->long_name, reason);
 }
 
 static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
 		   int flags, const char **arg)
 {
+	const char *res;
+
 	if (p->opt) {
-		*arg = p->opt;
+		res = p->opt;
 		p->opt = NULL;
 	} else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 ||
 		    **(p->argv + 1) == '-')) {
-		*arg = (const char *)opt->defval;
+		res = (const char *)opt->defval;
 	} else if (p->argc > 1) {
 		p->argc--;
-		*arg = *++p->argv;
+		res = *++p->argv;
 	} else
 		return opterror(opt, "requires a value", flags);
+	if (arg)
+		*arg = res;
 	return 0;
 }
 
@@ -55,11 +84,11 @@ static int get_value(struct parse_opt_ctx_t *p,
 
 			if (((flags & OPT_SHORT) && p->excl_opt->short_name) ||
 			    p->excl_opt->long_name == NULL) {
-				scnprintf(msg, sizeof(msg), "cannot be used with switch `%c'",
-					  p->excl_opt->short_name);
+				snprintf(msg, sizeof(msg), "cannot be used with switch `%c'",
+					 p->excl_opt->short_name);
 			} else {
-				scnprintf(msg, sizeof(msg), "cannot be used with %s",
-					  p->excl_opt->long_name);
+				snprintf(msg, sizeof(msg), "cannot be used with %s",
+					 p->excl_opt->long_name);
 			}
 			opterror(opt, msg, flags);
 			return -3;
@@ -91,6 +120,64 @@ static int get_value(struct parse_opt_ctx_t *p,
 		}
 	}
 
+	if (opt->flags & PARSE_OPT_NOBUILD) {
+		char reason[128];
+		bool noarg = false;
+
+		err = snprintf(reason, sizeof(reason),
+				opt->flags & PARSE_OPT_CANSKIP ?
+					"is being ignored because %s " :
+					"is not available because %s",
+				opt->build_opt);
+		reason[sizeof(reason) - 1] = '\0';
+
+		if (err < 0)
+			strncpy(reason, opt->flags & PARSE_OPT_CANSKIP ?
+					"is being ignored" :
+					"is not available",
+					sizeof(reason));
+
+		if (!(opt->flags & PARSE_OPT_CANSKIP))
+			return opterror(opt, reason, flags);
+
+		err = 0;
+		if (unset)
+			noarg = true;
+		if (opt->flags & PARSE_OPT_NOARG)
+			noarg = true;
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			noarg = true;
+
+		switch (opt->type) {
+		case OPTION_BOOLEAN:
+		case OPTION_INCR:
+		case OPTION_BIT:
+		case OPTION_SET_UINT:
+		case OPTION_SET_PTR:
+		case OPTION_END:
+		case OPTION_ARGUMENT:
+		case OPTION_GROUP:
+			noarg = true;
+			break;
+		case OPTION_CALLBACK:
+		case OPTION_STRING:
+		case OPTION_INTEGER:
+		case OPTION_UINTEGER:
+		case OPTION_LONG:
+		case OPTION_U64:
+		default:
+			break;
+		}
+
+		if (!noarg)
+			err = get_arg(p, opt, flags, NULL);
+		if (err)
+			return err;
+
+		optwarning(opt, reason, flags);
+		return 0;
+	}
+
 	switch (opt->type) {
 	case OPTION_BIT:
 		if (unset)
@@ -327,14 +414,16 @@ match:
 		return get_value(p, options, flags);
 	}
 
-	if (ambiguous_option)
-		return error("Ambiguous option: %s "
-			"(could be --%s%s or --%s%s)",
-			arg,
-			(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
-			ambiguous_option->long_name,
-			(abbrev_flags & OPT_UNSET) ?  "no-" : "",
-			abbrev_option->long_name);
+	if (ambiguous_option) {
+		 fprintf(stderr,
+			 " Error: Ambiguous option: %s (could be --%s%s or --%s%s)",
+			 arg,
+			 (ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+			 ambiguous_option->long_name,
+			 (abbrev_flags & OPT_UNSET) ?  "no-" : "",
+			 abbrev_option->long_name);
+		 return -1;
+	}
 	if (abbrev_option)
 		return get_value(p, abbrev_option, abbrev_flags);
 	return -2;
@@ -346,7 +435,7 @@ static void check_typos(const char *arg, const struct option *options)
 		return;
 
 	if (!prefixcmp(arg, "no-")) {
-		error ("did you mean `--%s` (with two dashes ?)", arg);
+		fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg);
 		exit(129);
 	}
 
@@ -354,14 +443,14 @@ static void check_typos(const char *arg, const struct option *options)
 		if (!options->long_name)
 			continue;
 		if (!prefixcmp(options->long_name, arg)) {
-			error ("did you mean `--%s` (with two dashes ?)", arg);
+			fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg);
 			exit(129);
 		}
 	}
 }
 
-void parse_options_start(struct parse_opt_ctx_t *ctx,
-			 int argc, const char **argv, int flags)
+static void parse_options_start(struct parse_opt_ctx_t *ctx,
+				int argc, const char **argv, int flags)
 {
 	memset(ctx, 0, sizeof(*ctx));
 	ctx->argc = argc - 1;
@@ -378,9 +467,9 @@ static int usage_with_options_internal(const char * const *,
 				       const struct option *, int,
 				       struct parse_opt_ctx_t *);
 
-int parse_options_step(struct parse_opt_ctx_t *ctx,
-		       const struct option *options,
-		       const char * const usagestr[])
+static int parse_options_step(struct parse_opt_ctx_t *ctx,
+			      const struct option *options,
+			      const char * const usagestr[])
 {
 	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
 	int excl_short_opt = 1;
@@ -489,7 +578,7 @@ exclusive:
 	return PARSE_OPT_HELP;
 }
 
-int parse_options_end(struct parse_opt_ctx_t *ctx)
+static int parse_options_end(struct parse_opt_ctx_t *ctx)
 {
 	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
 	ctx->out[ctx->cpidx + ctx->argc] = NULL;
@@ -501,22 +590,20 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o
 {
 	struct parse_opt_ctx_t ctx;
 
-	perf_env__set_cmdline(&perf_env, argc, argv);
-
 	/* build usage string if it's not provided */
 	if (subcommands && !usagestr[0]) {
-		struct strbuf buf = STRBUF_INIT;
+		char *buf = NULL;
+
+		astrcatf(&buf, "%s %s [<options>] {", subcmd_config.exec_name, argv[0]);
 
-		strbuf_addf(&buf, "perf %s [<options>] {", argv[0]);
 		for (int i = 0; subcommands[i]; i++) {
 			if (i)
-				strbuf_addstr(&buf, "|");
-			strbuf_addstr(&buf, subcommands[i]);
+				astrcat(&buf, "|");
+			astrcat(&buf, subcommands[i]);
 		}
-		strbuf_addstr(&buf, "}");
+		astrcat(&buf, "}");
 
-		usagestr[0] = strdup(buf.buf);
-		strbuf_release(&buf);
+		usagestr[0] = buf;
 	}
 
 	parse_options_start(&ctx, argc, argv, flags);
@@ -541,13 +628,11 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o
 		putchar('\n');
 		exit(130);
 	default: /* PARSE_OPT_UNKNOWN */
-		if (ctx.argv[0][1] == '-') {
-			strbuf_addf(&error_buf, "unknown option `%s'",
-				    ctx.argv[0] + 2);
-		} else {
-			strbuf_addf(&error_buf, "unknown switch `%c'",
-				    *ctx.opt);
-		}
+		if (ctx.argv[0][1] == '-')
+			astrcatf(&error_buf, "unknown option `%s'",
+				 ctx.argv[0] + 2);
+		else
+			astrcatf(&error_buf, "unknown switch `%c'", *ctx.opt);
 		usage_with_options(usagestr, options);
 	}
 
@@ -647,6 +732,10 @@ static void print_option_help(const struct option *opts, int full)
 		pad = USAGE_OPTS_WIDTH;
 	}
 	fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+	if (opts->flags & PARSE_OPT_NOBUILD)
+		fprintf(stderr, "%*s(not built-in because %s)\n",
+			USAGE_OPTS_WIDTH + USAGE_GAP, "",
+			opts->build_opt);
 }
 
 static int option__cmp(const void *va, const void *vb)
@@ -672,16 +761,18 @@ static int option__cmp(const void *va, const void *vb)
 
 static struct option *options__order(const struct option *opts)
 {
-	int nr_opts = 0;
+	int nr_opts = 0, len;
 	const struct option *o = opts;
 	struct option *ordered;
 
 	for (o = opts; o->type != OPTION_END; o++)
 		++nr_opts;
 
-	ordered = memdup(opts, sizeof(*o) * (nr_opts + 1));
-	if (ordered == NULL)
+	len = sizeof(*o) * (nr_opts + 1);
+	ordered = malloc(len);
+	if (!ordered)
 		goto out;
+	memcpy(ordered, opts, len);
 
 	qsort(ordered, nr_opts, sizeof(*o), option__cmp);
 out:
@@ -719,9 +810,9 @@ static bool option__in_argv(const struct option *opt, const struct parse_opt_ctx
 	return false;
 }
 
-int usage_with_options_internal(const char * const *usagestr,
-				const struct option *opts, int full,
-				struct parse_opt_ctx_t *ctx)
+static int usage_with_options_internal(const char * const *usagestr,
+				       const struct option *opts, int full,
+				       struct parse_opt_ctx_t *ctx)
 {
 	struct option *ordered;
 
@@ -730,9 +821,9 @@ int usage_with_options_internal(const char * const *usagestr,
 
 	setup_pager();
 
-	if (strbuf_avail(&error_buf)) {
-		fprintf(stderr, "  Error: %s\n", error_buf.buf);
-		strbuf_release(&error_buf);
+	if (error_buf) {
+		fprintf(stderr, "  Error: %s\n", error_buf);
+		zfree(&error_buf);
 	}
 
 	fprintf(stderr, "\n Usage: %s\n", *usagestr++);
@@ -768,7 +859,6 @@ int usage_with_options_internal(const char * const *usagestr,
 void usage_with_options(const char * const *usagestr,
 			const struct option *opts)
 {
-	exit_browser(false);
 	usage_with_options_internal(usagestr, opts, 0, NULL);
 	exit(129);
 }
@@ -777,13 +867,15 @@ void usage_with_options_msg(const char * const *usagestr,
 			    const struct option *opts, const char *fmt, ...)
 {
 	va_list ap;
-
-	exit_browser(false);
+	char *tmp = error_buf;
 
 	va_start(ap, fmt);
-	strbuf_addv(&error_buf, fmt, ap);
+	if (vasprintf(&error_buf, fmt, ap) == -1)
+		die("vasprintf failed");
 	va_end(ap);
 
+	free(tmp);
+
 	usage_with_options_internal(usagestr, opts, 0, NULL);
 	exit(129);
 }
@@ -853,15 +945,39 @@ int parse_opt_verbosity_cb(const struct option *opt,
 	return 0;
 }
 
-void set_option_flag(struct option *opts, int shortopt, const char *longopt,
-		     int flag)
+static struct option *
+find_option(struct option *opts, int shortopt, const char *longopt)
 {
 	for (; opts->type != OPTION_END; opts++) {
 		if ((shortopt && opts->short_name == shortopt) ||
 		    (opts->long_name && longopt &&
-		     !strcmp(opts->long_name, longopt))) {
-			opts->flags |= flag;
-			break;
-		}
+		     !strcmp(opts->long_name, longopt)))
+			return opts;
 	}
+	return NULL;
+}
+
+void set_option_flag(struct option *opts, int shortopt, const char *longopt,
+		     int flag)
+{
+	struct option *opt = find_option(opts, shortopt, longopt);
+
+	if (opt)
+		opt->flags |= flag;
+	return;
+}
+
+void set_option_nobuild(struct option *opts, int shortopt,
+			const char *longopt,
+			const char *build_opt,
+			bool can_skip)
+{
+	struct option *opt = find_option(opts, shortopt, longopt);
+
+	if (!opt)
+		return;
+
+	opt->flags |= PARSE_OPT_NOBUILD;
+	opt->flags |= can_skip ? PARSE_OPT_CANSKIP : 0;
+	opt->build_opt = build_opt;
 }
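
To show how the new PARSE_OPT_NOBUILD machinery is meant to be driven:
instead of compiling an option out of the table, the tool marks it, so
the parser still recognizes it and emits the warning or error paths
above. A hedged sketch (the option table and the HAVE_LIBBPF_SUPPORT
guard are assumptions mirroring perf's intended use):

	#include <stdbool.h>
	#include "subcmd/parse-options.h"

	static const char *clang_path;

	static struct option record_options[] = {
		OPT_STRING(0, "clang-path", &clang_path, "path",
			   "clang binary for compiling BPF scriptlets"),
		OPT_END()
	};

	static void fixup_record_options(void)
	{
	#ifndef HAVE_LIBBPF_SUPPORT
		/* keep --clang-path parseable; warn and skip its argument */
		set_option_nobuild(record_options, 0, "clang-path",
				   "NO_LIBBPF=1", true);
	#endif
	}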

+ 13 - 13
tools/perf/util/parse-options.h → tools/lib/subcmd/parse-options.h

@@ -1,8 +1,8 @@
-#ifndef __PERF_PARSE_OPTIONS_H
-#define __PERF_PARSE_OPTIONS_H
+#ifndef __SUBCMD_PARSE_OPTIONS_H
+#define __SUBCMD_PARSE_OPTIONS_H
 
-#include <linux/kernel.h>
 #include <stdbool.h>
+#include <stdint.h>
 
 enum parse_opt_type {
 	/* special types */
@@ -41,6 +41,8 @@ enum parse_opt_option_flags {
 	PARSE_OPT_DISABLED = 32,
 	PARSE_OPT_EXCLUSIVE = 64,
 	PARSE_OPT_NOEMPTY  = 128,
+	PARSE_OPT_NOBUILD  = 256,
+	PARSE_OPT_CANSKIP  = 512,
 };
 
 struct option;
@@ -96,6 +98,7 @@ struct option {
 	void *value;
 	const char *argh;
 	const char *help;
+	const char *build_opt;
 
 	int flags;
 	parse_opt_cb *callback;
@@ -149,6 +152,9 @@ struct option {
 /* parse_options() will filter out the processed options and leave the
  * non-option argments in argv[].
  * Returns the number of arguments left in argv[].
+ *
+ * NOTE: parse_options() and parse_options_subcommand() may call exit() in the
+ * case of an error (or for 'special' options like --list-cmds or --list-opts).
  */
 extern int parse_options(int argc, const char **argv,
                          const struct option *options,
@@ -195,15 +201,6 @@ extern int parse_options_usage(const char * const *usagestr,
 			       const char *optstr,
 			       bool short_opt);
 
-extern void parse_options_start(struct parse_opt_ctx_t *ctx,
-				int argc, const char **argv, int flags);
-
-extern int parse_options_step(struct parse_opt_ctx_t *ctx,
-			      const struct option *options,
-			      const char * const usagestr[]);
-
-extern int parse_options_end(struct parse_opt_ctx_t *ctx);
-
 
 /*----- some often used options -----*/
 extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
@@ -226,4 +223,7 @@ extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
 extern const char *parse_options_fix_filename(const char *prefix, const char *file);
 
 void set_option_flag(struct option *opts, int sopt, const char *lopt, int flag);
-#endif /* __PERF_PARSE_OPTIONS_H */
+void set_option_nobuild(struct option *opts, int shortopt, const char *longopt,
+			const char *build_opt, bool can_skip);
+
+#endif /* __SUBCMD_PARSE_OPTIONS_H */

+ 16 - 8
tools/perf/util/run-command.c → tools/lib/subcmd/run-command.c

@@ -1,7 +1,15 @@
-#include "cache.h"
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/wait.h>
+#include "subcmd-util.h"
 #include "run-command.h"
-#include "exec_cmd.h"
-#include "debug.h"
+#include "exec-cmd.h"
+
+#define STRERR_BUFSIZE 128
 
 static inline void close_pair(int fd[2])
 {
@@ -112,8 +120,8 @@ int start_command(struct child_process *cmd)
 		}
 		if (cmd->preexec_cb)
 			cmd->preexec_cb();
-		if (cmd->perf_cmd) {
-			execv_perf_cmd(cmd->argv);
+		if (cmd->exec_cmd) {
+			execv_cmd(cmd->argv);
 		} else {
 			execvp(cmd->argv[0], (char *const*) cmd->argv);
 		}
@@ -164,8 +172,8 @@ static int wait_or_whine(pid_t pid)
 		if (waiting < 0) {
 			if (errno == EINTR)
 				continue;
-			error("waitpid failed (%s)",
-			      strerror_r(errno, sbuf, sizeof(sbuf)));
+			fprintf(stderr, " Error: waitpid failed (%s)",
+				strerror_r(errno, sbuf, sizeof(sbuf)));
 			return -ERR_RUN_COMMAND_WAITPID;
 		}
 		if (waiting != pid)
@@ -207,7 +215,7 @@ static void prepare_run_command_v_opt(struct child_process *cmd,
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->argv = argv;
 	cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
-	cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
+	cmd->exec_cmd = opt & RUN_EXEC_CMD ? 1 : 0;
 	cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
 }
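
With the rename, launching a tool-relative sub-command goes through the
generic flag. A one-function hedged sketch (assumes exec_cmd_init() has
already been called):

	#include "subcmd/run-command.h"

	/* Run 'argv' via execv_cmd(), i.e. as '<exec_name> argv...'. */
	static int launch(const char **argv)
	{
		return run_command_v_opt(argv, RUN_EXEC_CMD);
	}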
 

+ 7 - 5
tools/perf/util/run-command.h → tools/lib/subcmd/run-command.h

@@ -1,5 +1,7 @@
-#ifndef __PERF_RUN_COMMAND_H
-#define __PERF_RUN_COMMAND_H
+#ifndef __SUBCMD_RUN_COMMAND_H
+#define __SUBCMD_RUN_COMMAND_H
+
+#include <unistd.h>
 
 enum {
 	ERR_RUN_COMMAND_FORK = 10000,
@@ -41,7 +43,7 @@ struct child_process {
 	unsigned no_stdin:1;
 	unsigned no_stdout:1;
 	unsigned no_stderr:1;
-	unsigned perf_cmd:1; /* if this is to be perf sub-command */
+	unsigned exec_cmd:1; /* if this is to be an external sub-command */
 	unsigned stdout_to_stderr:1;
 	void (*preexec_cb)(void);
 };
@@ -51,8 +53,8 @@ int finish_command(struct child_process *);
 int run_command(struct child_process *);
 
 #define RUN_COMMAND_NO_STDIN 1
-#define RUN_PERF_CMD	     2	/*If this is to be perf sub-command */
+#define RUN_EXEC_CMD	     2	/* If this is to be an external sub-command */
 #define RUN_COMMAND_STDOUT_TO_STDERR 4
 int run_command_v_opt(const char **argv, int opt);
 
-#endif /* __PERF_RUN_COMMAND_H */
+#endif /* __SUBCMD_RUN_COMMAND_H */

+ 2 - 1
tools/perf/util/sigchain.c → tools/lib/subcmd/sigchain.c

@@ -1,5 +1,6 @@
+#include <signal.h>
+#include "subcmd-util.h"
 #include "sigchain.h"
-#include "cache.h"
 
 #define SIGCHAIN_MAX_SIGNALS 32
 

+ 3 - 3
tools/perf/util/sigchain.h → tools/lib/subcmd/sigchain.h

@@ -1,5 +1,5 @@
-#ifndef __PERF_SIGCHAIN_H
-#define __PERF_SIGCHAIN_H
+#ifndef __SUBCMD_SIGCHAIN_H
+#define __SUBCMD_SIGCHAIN_H
 
 typedef void (*sigchain_fun)(int);
 
@@ -7,4 +7,4 @@ int sigchain_pop(int sig);
 
 void sigchain_push_common(sigchain_fun f);
 
-#endif /* __PERF_SIGCHAIN_H */
+#endif /* __SUBCMD_SIGCHAIN_H */

+ 11 - 0
tools/lib/subcmd/subcmd-config.c

@@ -0,0 +1,11 @@
+#include "subcmd-config.h"
+
+#define UNDEFINED "SUBCMD_HAS_NOT_BEEN_INITIALIZED"
+
+struct subcmd_config subcmd_config = {
+	.exec_name	= UNDEFINED,
+	.prefix		= UNDEFINED,
+	.exec_path	= UNDEFINED,
+	.exec_path_env	= UNDEFINED,
+	.pager_env	= UNDEFINED,
+};

+ 14 - 0
tools/lib/subcmd/subcmd-config.h

@@ -0,0 +1,14 @@
+#ifndef __PERF_SUBCMD_CONFIG_H
+#define __PERF_SUBCMD_CONFIG_H
+
+struct subcmd_config {
+	const char *exec_name;
+	const char *prefix;
+	const char *exec_path;
+	const char *exec_path_env;
+	const char *pager_env;
+};
+
+extern struct subcmd_config subcmd_config;
+
+#endif /* __PERF_SUBCMD_CONFIG_H */

+ 91 - 0
tools/lib/subcmd/subcmd-util.h

@@ -0,0 +1,91 @@
+#ifndef __SUBCMD_UTIL_H
+#define __SUBCMD_UTIL_H
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define NORETURN __attribute__((__noreturn__))
+
+static inline void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, " %s%s\n", prefix, msg);
+}
+
+static NORETURN inline void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	report(" Fatal: ", err, params);
+	exit(128);
+	va_end(params);
+}
+
+#define zfree(ptr) ({ free(*ptr); *ptr = NULL; })
+
+#define alloc_nr(x) (((x)+16)*3/2)
+
+/*
+ * Realloc the buffer pointed at by variable 'x' so that it can hold
+ * at least 'nr' entries; the number of entries currently allocated
+ * is 'alloc', using the standard growing factor alloc_nr() macro.
+ *
+ * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
+ */
+#define ALLOC_GROW(x, nr, alloc) \
+	do { \
+		if ((nr) > alloc) { \
+			if (alloc_nr(alloc) < (nr)) \
+				alloc = (nr); \
+			else \
+				alloc = alloc_nr(alloc); \
+			x = xrealloc((x), alloc * sizeof(*(x))); \
+		} \
+	} while(0)
+
+static inline void *xrealloc(void *ptr, size_t size)
+{
+	void *ret = realloc(ptr, size);
+	if (!ret && !size)
+		ret = realloc(ptr, 1);
+	if (!ret) {
+		ret = realloc(ptr, size);
+		if (!ret && !size)
+			ret = realloc(ptr, 1);
+		if (!ret)
+			die("Out of memory, realloc failed");
+	}
+	return ret;
+}
+
+#define astrcatf(out, fmt, ...)						\
+({									\
+	char *tmp = *(out);						\
+	if (asprintf((out), "%s" fmt, tmp ?: "", ## __VA_ARGS__) == -1)	\
+		die("asprintf failed");					\
+	free(tmp);							\
+})
+
+static inline void astrcat(char **out, const char *add)
+{
+	char *tmp = *out;
+
+	if (asprintf(out, "%s%s", tmp ?: "", add) == -1)
+		die("asprintf failed");
+
+	free(tmp);
+}
+
+static inline int prefixcmp(const char *str, const char *prefix)
+{
+	for (; ; str++, prefix++)
+		if (!*prefix)
+			return 0;
+		else if (*str != *prefix)
+			return (unsigned char)*prefix - (unsigned char)*str;
+}
+
+#endif /* __SUBCMD_UTIL_H */
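
These helpers stand in for perf's strbuf API in the converted code; the
common pattern is building a delimited string into a NULL-initialized
buffer. A hedged sketch mirroring the usage-string construction in
parse-options.c (note the library builds with -D_GNU_SOURCE, which
asprintf() needs):

	#include <stdlib.h>
	#include "subcmd-util.h"

	/* Build "cmd [<options>] {a|b|c}"; caller frees the result. */
	static char *make_usage(const char *cmd, const char **subcmds)
	{
		char *buf = NULL;
		int i;

		astrcatf(&buf, "%s [<options>] {", cmd);
		for (i = 0; subcmds[i]; i++) {
			if (i)
				astrcat(&buf, "|");
			astrcat(&buf, subcmds[i]);
		}
		astrcat(&buf, "}");
		return buf;
	}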

+ 70 - 64
tools/lib/traceevent/event-parse.c

@@ -4735,73 +4735,80 @@ static int is_printable_array(char *p, unsigned int len)
 	return 1;
 }
 
-static void print_event_fields(struct trace_seq *s, void *data,
-			       int size __maybe_unused,
-			       struct event_format *event)
+void pevent_print_field(struct trace_seq *s, void *data,
+			struct format_field *field)
 {
-	struct format_field *field;
 	unsigned long long val;
 	unsigned int offset, len, i;
-
-	field = event->format.fields;
-	while (field) {
-		trace_seq_printf(s, " %s=", field->name);
-		if (field->flags & FIELD_IS_ARRAY) {
-			offset = field->offset;
-			len = field->size;
-			if (field->flags & FIELD_IS_DYNAMIC) {
-				val = pevent_read_number(event->pevent, data + offset, len);
-				offset = val;
-				len = offset >> 16;
-				offset &= 0xffff;
-			}
-			if (field->flags & FIELD_IS_STRING &&
-			    is_printable_array(data + offset, len)) {
-				trace_seq_printf(s, "%s", (char *)data + offset);
-			} else {
-				trace_seq_puts(s, "ARRAY[");
-				for (i = 0; i < len; i++) {
-					if (i)
-						trace_seq_puts(s, ", ");
-					trace_seq_printf(s, "%02x",
-							 *((unsigned char *)data + offset + i));
-				}
-				trace_seq_putc(s, ']');
-				field->flags &= ~FIELD_IS_STRING;
-			}
+	struct pevent *pevent = field->event->pevent;
+
+	if (field->flags & FIELD_IS_ARRAY) {
+		offset = field->offset;
+		len = field->size;
+		if (field->flags & FIELD_IS_DYNAMIC) {
+			val = pevent_read_number(pevent, data + offset, len);
+			offset = val;
+			len = offset >> 16;
+			offset &= 0xffff;
+		}
+		if (field->flags & FIELD_IS_STRING &&
+		    is_printable_array(data + offset, len)) {
+			trace_seq_printf(s, "%s", (char *)data + offset);
 		} else {
-			val = pevent_read_number(event->pevent, data + field->offset,
-						 field->size);
-			if (field->flags & FIELD_IS_POINTER) {
-				trace_seq_printf(s, "0x%llx", val);
-			} else if (field->flags & FIELD_IS_SIGNED) {
-				switch (field->size) {
-				case 4:
-					/*
-					 * If field is long then print it in hex.
-					 * A long usually stores pointers.
-					 */
-					if (field->flags & FIELD_IS_LONG)
-						trace_seq_printf(s, "0x%x", (int)val);
-					else
-						trace_seq_printf(s, "%d", (int)val);
-					break;
-				case 2:
-					trace_seq_printf(s, "%2d", (short)val);
-					break;
-				case 1:
-					trace_seq_printf(s, "%1d", (char)val);
-					break;
-				default:
-					trace_seq_printf(s, "%lld", val);
-				}
-			} else {
+			trace_seq_puts(s, "ARRAY[");
+			for (i = 0; i < len; i++) {
+				if (i)
+					trace_seq_puts(s, ", ");
+				trace_seq_printf(s, "%02x",
+						 *((unsigned char *)data + offset + i));
+			}
+			trace_seq_putc(s, ']');
+			field->flags &= ~FIELD_IS_STRING;
+		}
+	} else {
+		val = pevent_read_number(pevent, data + field->offset,
+					 field->size);
+		if (field->flags & FIELD_IS_POINTER) {
+			trace_seq_printf(s, "0x%llx", val);
+		} else if (field->flags & FIELD_IS_SIGNED) {
+			switch (field->size) {
+			case 4:
+				/*
+				 * If field is long then print it in hex.
+				 * A long usually stores pointers.
+				 */
 				if (field->flags & FIELD_IS_LONG)
-					trace_seq_printf(s, "0x%llx", val);
+					trace_seq_printf(s, "0x%x", (int)val);
 				else
-					trace_seq_printf(s, "%llu", val);
+					trace_seq_printf(s, "%d", (int)val);
+				break;
+			case 2:
+				trace_seq_printf(s, "%2d", (short)val);
+				break;
+			case 1:
+				trace_seq_printf(s, "%1d", (char)val);
+				break;
+			default:
+				trace_seq_printf(s, "%lld", val);
 			}
+		} else {
+			if (field->flags & FIELD_IS_LONG)
+				trace_seq_printf(s, "0x%llx", val);
+			else
+				trace_seq_printf(s, "%llu", val);
 		}
+	}
+}
+
+void pevent_print_fields(struct trace_seq *s, void *data,
+			 int size __maybe_unused, struct event_format *event)
+{
+	struct format_field *field;
+
+	field = event->format.fields;
+	while (field) {
+		trace_seq_printf(s, " %s=", field->name);
+		pevent_print_field(s, data, field);
 		field = field->next;
 	}
 }
@@ -4827,7 +4834,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event
 
 	if (event->flags & EVENT_FL_FAILED) {
 		trace_seq_printf(s, "[FAILED TO PARSE]");
-		print_event_fields(s, data, size, event);
+		pevent_print_fields(s, data, size, event);
 		return;
 	}
 
@@ -4968,13 +4975,12 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event
 				    sizeof(long) != 8) {
 					char *p;
 
-					ls = 2;
 					/* make %l into %ll */
-					p = strchr(format, 'l');
-					if (p)
+					if (ls == 1 && (p = strchr(format, 'l')))
 						memmove(p+1, p, strlen(p)+1);
 					else if (strcmp(format, "%p") == 0)
 						strcpy(format, "0x%llx");
+					ls = 2;
 				}
 				switch (ls) {
 				case -2:
@@ -5302,7 +5308,7 @@ void pevent_event_info(struct trace_seq *s, struct event_format *event,
 	int print_pretty = 1;
 
 	if (event->pevent->print_raw || (event->flags & EVENT_FL_PRINTRAW))
-		print_event_fields(s, record->data, record->size, event);
+		pevent_print_fields(s, record->data, record->size, event);
 	else {
 
 		if (event->handler && !(event->flags & EVENT_FL_NOHANDLE))

+ 4 - 0
tools/lib/traceevent/event-parse.h

@@ -705,6 +705,10 @@ struct cmdline *pevent_data_pid_from_comm(struct pevent *pevent, const char *com
 					  struct cmdline *next);
 int pevent_cmdline_pid(struct pevent *pevent, struct cmdline *cmdline);
 
+void pevent_print_field(struct trace_seq *s, void *data,
+			struct format_field *field);
+void pevent_print_fields(struct trace_seq *s, void *data,
+			 int size __maybe_unused, struct event_format *event);
 void pevent_event_info(struct trace_seq *s, struct event_format *event,
 		       struct pevent_record *record);
 int pevent_strerror(struct pevent *pevent, enum pevent_errno errnum,

+ 0 - 89
tools/lib/util/find_next_bit.c

@@ -1,89 +0,0 @@
-/* find_next_bit.c: fallback find next bit implementation
- *
- * Copied from lib/find_next_bit.c to tools/lib/next_bit.c
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/bitops.h>
-#include <asm/types.h>
-#include <asm/byteorder.h>
-
-#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
-
-#ifndef find_next_bit
-/*
- * Find the next set bit in a memory region.
- */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-			    unsigned long offset)
-{
-	const unsigned long *p = addr + BITOP_WORD(offset);
-	unsigned long result = offset & ~(BITS_PER_LONG-1);
-	unsigned long tmp;
-
-	if (offset >= size)
-		return size;
-	size -= result;
-	offset %= BITS_PER_LONG;
-	if (offset) {
-		tmp = *(p++);
-		tmp &= (~0UL << offset);
-		if (size < BITS_PER_LONG)
-			goto found_first;
-		if (tmp)
-			goto found_middle;
-		size -= BITS_PER_LONG;
-		result += BITS_PER_LONG;
-	}
-	while (size & ~(BITS_PER_LONG-1)) {
-		if ((tmp = *(p++)))
-			goto found_middle;
-		result += BITS_PER_LONG;
-		size -= BITS_PER_LONG;
-	}
-	if (!size)
-		return result;
-	tmp = *p;
-
-found_first:
-	tmp &= (~0UL >> (BITS_PER_LONG - size));
-	if (tmp == 0UL)		/* Are any bits set? */
-		return result + size;	/* Nope. */
-found_middle:
-	return result + __ffs(tmp);
-}
-#endif
-
-#ifndef find_first_bit
-/*
- * Find the first set bit in a memory region.
- */
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
-{
-	const unsigned long *p = addr;
-	unsigned long result = 0;
-	unsigned long tmp;
-
-	while (size & ~(BITS_PER_LONG-1)) {
-		if ((tmp = *(p++)))
-			goto found;
-		result += BITS_PER_LONG;
-		size -= BITS_PER_LONG;
-	}
-	if (!size)
-		return result;
-
-	tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
-	if (tmp == 0UL)		/* Are any bits set? */
-		return result + size;	/* Nope. */
-found:
-	return result + __ffs(tmp);
-}
-#endif

+ 7 - 1
tools/perf/Build

@@ -1,5 +1,6 @@
 perf-y += builtin-bench.o
 perf-y += builtin-annotate.o
+perf-y += builtin-config.o
 perf-y += builtin-diff.o
 perf-y += builtin-evlist.o
 perf-y += builtin-help.o
@@ -19,6 +20,7 @@ perf-y += builtin-kvm.o
 perf-y += builtin-inject.o
 perf-y += builtin-mem.o
 perf-y += builtin-data.o
+perf-y += builtin-version.o
 
 perf-$(CONFIG_AUDIT) += builtin-trace.o
 perf-$(CONFIG_LIBELF) += builtin-probe.o
@@ -34,8 +36,12 @@ paths += -DPERF_MAN_PATH="BUILD_STR($(mandir_SQ))"
 
 CFLAGS_builtin-help.o      += $(paths)
 CFLAGS_builtin-timechart.o += $(paths)
-CFLAGS_perf.o              += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" -include $(OUTPUT)PERF-VERSION-FILE
+CFLAGS_perf.o              += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))"	\
+			      -DPERF_EXEC_PATH="BUILD_STR($(perfexecdir_SQ))"	\
+			      -DPREFIX="BUILD_STR($(prefix_SQ))"		\
+			      -include $(OUTPUT)PERF-VERSION-FILE
 CFLAGS_builtin-trace.o	   += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_SQ))"
+CFLAGS_builtin-report.o	   += -DTIPDIR="BUILD_STR($(tipdir_SQ))"
 
 libperf-y += util/
 libperf-y += arch/

+ 103 - 0
tools/perf/Documentation/perf-config.txt

@@ -0,0 +1,103 @@
+perf-config(1)
+==============
+
+NAME
+----
+perf-config - Get and set variables in a configuration file.
+
+SYNOPSIS
+--------
+[verse]
+'perf config' -l | --list
+
+DESCRIPTION
+-----------
+You can manage variables in a configuration file with this command.
+
+OPTIONS
+-------
+
+-l::
+--list::
+	Show current config variables, name and value, for all sections.
+
+CONFIGURATION FILE
+------------------
+
+The perf configuration file contains many variables to change various
+aspects of each of its tools, including output, disk usage, etc.
+The '$HOME/.perfconfig' file is used to store a per-user configuration.
+The file '$(sysconfdir)/perfconfig' can be used to
+store a system-wide default configuration.
+
+Syntax
+~~~~~~
+
+The file consists of sections. A section starts with its name
+surrounded by square brackets and continues until the next section
+begins. Each variable must belong to a section and has the form
+'name = value', for example:
+
+	[section]
+		name1 = value1
+		name2 = value2
+
+Section names are case sensitive and can contain any characters except
+newline (double quote `"` and backslash have to be escaped as `\"` and `\\`,
+respectively). Section headers can't span multiple lines.
+
+Example
+~~~~~~~
+
+Given a $HOME/.perfconfig like this:
+
+#
+# This is the config file, and
+# a '#' and ';' character indicates a comment
+#
+
+	[colors]
+		# Color variables
+		top = red, default
+		medium = green, default
+		normal = lightgray, default
+		selected = white, lightgray
+		code = blue, default
+		addr = magenta, default
+		root = white, blue
+
+	[tui]
+		# Defaults if linked with libslang
+		report = on
+		annotate = on
+		top = on
+
+	[buildid]
+		# Default, disable using /dev/null
+		dir = ~/.debug
+
+	[annotate]
+		# Defaults
+		hide_src_code = false
+		use_offset = true
+		jump_arrows = true
+		show_nr_jumps = false
+
+	[help]
+		# Format can be man, info, web or html
+		format = man
+		autocorrect = 0
+
+	[ui]
+		show-headers = true
+
+	[call-graph]
+		# fp (framepointer), dwarf
+		record-mode = fp
+		print-type = graph
+		order = caller
+		sort-key = function
+
+SEE ALSO
+--------
+linkperf:perf[1]

+ 3 - 0
tools/perf/Documentation/perf-evlist.txt

@@ -32,6 +32,9 @@ OPTIONS
 --group::
 	Show event group information.
 
+--trace-fields::
+	Show tracepoint field names.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-list[1],

+ 21 - 3
tools/perf/Documentation/perf-record.txt

@@ -207,11 +207,23 @@ comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-
 In per-thread mode with inheritance mode on (default), samples are captured only when
 the thread executes on the designated CPUs. Default is to monitor all CPUs.
 
+-B::
+--no-buildid::
+Do not save the build ids of binaries in the perf.data files. This skips
+post processing after recording, which sometimes makes the final step in
+the recording process take a long time, as it needs to process all
+events looking for mmap records. The downside is that it can misresolve
+symbols if the workload binaries used when recording get locally rebuilt
+or upgraded, because the only key available in this case is the
+pathname. You can also set the "record.build-id" config variable to
+'skip' to have this behaviour permanently.
+
 -N::
 --no-buildid-cache::
 Do not update the buildid cache. This saves some overhead in situations
 where the information in the perf.data file (which includes buildids)
-is sufficient.
+is sufficient.  You can also set the "record.build-id" config variable to
+'no-cache' to have the same effect.
 
 -G name,...::
 --cgroup name,...::
@@ -314,11 +326,17 @@ This option sets the time out limit. The default value is 500 ms.
 Record context switch events i.e. events of type PERF_RECORD_SWITCH or
 PERF_RECORD_SWITCH_CPU_WIDE.
 
---clang-path::
+--clang-path=PATH::
 Path to clang binary to use for compiling BPF scriptlets.
+(enabled when BPF support is on)
 
---clang-opt::
+--clang-opt=OPTIONS::
 Options passed to clang when compiling BPF scriptlets.
+(enabled when BPF support is on)
+
+--vmlinux=PATH::
+Specify vmlinux path which has debuginfo.
+(enabled when BPF prologue is on)
 
 SEE ALSO
 --------

+ 37 - 4
tools/perf/Documentation/perf-report.txt

@@ -117,6 +117,30 @@ OPTIONS
 	And default sort keys are changed to comm, dso_from, symbol_from, dso_to
 	and symbol_to, see '--branch-stack'.
 
+	If the data file has tracepoint event(s), the following (dynamic) sort
+	keys are also available:
+	trace, trace_fields, [<event>.]<field>[/raw]
+
+	- trace: pretty printed trace output in a single column
+	- trace_fields: fields in tracepoints in separate columns
+	- <field name>: optional event and field name for a specific field
+
+	The last form consists of event and field names.  If the event name is
+	omitted, all events are searched for a matching field name.  The matched
+	field will be shown only for the event that has the field.  The event
+	name supports substring matching, so the user doesn't need to specify
+	the full subsystem and event name every time.  For example, the
+	'sched:sched_switch' event can be shortened to 'switch' as long as it's
+	not ambiguous.  An event can also be specified by its index (starting
+	from 1) preceded by '%', so '%1' is the first event, '%2' is the
+	second, and so on.
+
+	The field name can have a '/raw' suffix, which disables pretty printing
+	and shows the raw field value, e.g. as hex numbers.  The --raw-trace
+	option has the same effect for all dynamic sort keys.
+
+	The default sort key is changed to 'trace' if all events in the data
+	file are tracepoints.
+
 -F::
 --fields=::
 	Specify output field - multiple keys can be specified in CSV format.
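
A hedged illustration of the dynamic sort keys described above, assuming
the perf.data file contains sched:sched_switch samples:

   # perf report -s trace            # pretty printed, single column
   # perf report -s trace_fields     # one column per tracepoint field
   # perf report -s switch.prev_comm # substring event match
   # perf report -s %1.next_pid/raw  # first event, raw field value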
@@ -170,17 +194,18 @@ OPTIONS
         Dump raw trace in ASCII.
 
 -g::
---call-graph=<print_type,threshold[,print_limit],order,sort_key,branch>::
+--call-graph=<print_type,threshold[,print_limit],order,sort_key[,branch],value>::
         Display call chains using type, min percent threshold, print limit,
-	call order, sort key and branch.  Note that ordering of parameters is not
-	fixed so any parement can be given in an arbitraty order.  One exception
-	is the print_limit which should be preceded by threshold.
+	call order, sort key, optional branch and value.  Note that the ordering
+	of parameters is not fixed, so any parameter can be given in an arbitrary
+	order.  One exception is print_limit, which should be preceded by
+	threshold.
 
 	print_type can be either:
 	- flat: single column, linear exposure of call chains.
 	- graph: use a graph tree, displaying absolute overhead rates. (default)
 	- fractal: like graph, but displays relative rates. Each branch of
 		 the tree is considered as a new profiled object.
+	- folded: call chains are displayed in a line, separated by semicolons
 	- none: disable call chain display.
 
 	threshold is a percentage value which specifies a minimum percent to be
@@ -204,6 +229,11 @@ OPTIONS
 	- branch: include last branch information in callgraph when available.
 	          Usually more convenient to use --branch-history for this.
 
+	value can be:
+	- percent: display overhead percent (default)
+	- period: display event period
+	- count: display event count
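
Combining the new 'folded' print_type and the 'value' parameter, with
option ordering free per the text above, one might run:

   # perf report -g folded,count
   # perf report -g graph,0.5,caller,function,period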
+
 --children::
 	Accumulate callchain of children to parent entry so that they can
 	show up in the output.  The output will have a new "Children" column
@@ -365,6 +395,9 @@ include::itrace.txt[]
 --socket-filter::
 	Only report the samples on the processor socket that match with this filter
 
+--raw-trace::
+	When displaying traceevent output, do not use print fmt or plugins.
+
 include::callchain-overhead-calculation.txt[]
 
 SEE ALSO

+ 34 - 0
tools/perf/Documentation/perf-stat.txt

@@ -10,6 +10,8 @@ SYNOPSIS
 [verse]
 'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command>
 'perf stat' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>]
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] record [-o file] -- <command> [<options>]
+'perf stat' report [-i file]
 
 DESCRIPTION
 -----------
@@ -22,6 +24,11 @@ OPTIONS
 <command>...::
 	Any command you can specify in a shell.
 
+record::
+	See STAT RECORD.
+
+report::
+	See STAT REPORT.
 
 -e::
 --event=::
@@ -159,6 +166,33 @@ filter out the startup phase of the program, which is often very different.
 
 Print statistics of transactional execution if supported.
 
+STAT RECORD
+-----------
+Stores stat data into perf data file.
+
+-o file::
+--output file::
+Output file name.
+
+STAT REPORT
+-----------
+Reads and reports stat data from perf data file.
+
+-i file::
+--input file::
+Input file name.
+
+--per-socket::
+Aggregate counts per processor socket for system-wide mode measurements.
+
+--per-core::
+Aggregate counts per physical processor for system-wide mode measurements.
+
+-A::
+--no-aggr::
+Do not aggregate counts across all monitored CPUs.
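
A sketch of the record/report flow with the aggregation options above
(the file name is arbitrary):

   $ perf stat record -o stat.data -a sleep 1
   $ perf stat report -i stat.data --per-core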
+
+
 EXAMPLES
 --------
 

+ 3 - 0
tools/perf/Documentation/perf-top.txt

@@ -230,6 +230,9 @@ Default is to monitor all CPUS.
 	The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k
 	Note that this feature may not be available on all processors.
 
+--raw-trace::
+	When displaying traceevent output, do not use print fmt or plugins.
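
For example (any tracepoint event works; this one is illustrative):

   # perf top -e sched:sched_switch --raw-trace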
+
 INTERACTIVE PROMPTING KEYS
 --------------------------
 

+ 14 - 0
tools/perf/Documentation/tips.txt

@@ -0,0 +1,14 @@
+For a higher level overview, try: perf report --sort comm,dso
+Sample related events with: perf record -e '{cycles,instructions}:S'
+Compare performance results with: perf diff [<old file> <new file>]
+Boolean options have negative forms, e.g.: perf report --no-children
+Customize output of perf script with: perf script -F event,ip,sym
+Generate a script for your data: perf script -g <lang>
+Save output of perf stat using: perf stat record <target workload>
+Create an archive with symtabs to analyse on another machine: perf archive
+Search options using a keyword: perf report -h <keyword>
+Use parent filter to see specific call path: perf report -p <regex>
+List events using substring match: perf list <keyword>
+To see list of saved events and attributes: perf evlist -v
+Use --symfs <dir> if your symbol files are in non-standard locations
+To see callchains in a more compact form: perf report -g folded

+ 6 - 1
tools/perf/MANIFEST

@@ -1,6 +1,7 @@
 tools/perf
 tools/arch/alpha/include/asm/barrier.h
 tools/arch/arm/include/asm/barrier.h
+tools/arch/arm64/include/asm/barrier.h
 tools/arch/ia64/include/asm/barrier.h
 tools/arch/mips/include/asm/barrier.h
 tools/arch/powerpc/include/asm/barrier.h
@@ -20,14 +21,17 @@ tools/lib/traceevent
 tools/lib/bpf
 tools/lib/api
 tools/lib/bpf
+tools/lib/subcmd
 tools/lib/hweight.c
 tools/lib/rbtree.c
+tools/lib/string.c
 tools/lib/symbol/kallsyms.c
 tools/lib/symbol/kallsyms.h
-tools/lib/util/find_next_bit.c
+tools/lib/find_bit.c
 tools/include/asm/atomic.h
 tools/include/asm/barrier.h
 tools/include/asm/bug.h
+tools/include/asm-generic/atomic-gcc.h
 tools/include/asm-generic/barrier.h
 tools/include/asm-generic/bitops/arch_hweight.h
 tools/include/asm-generic/bitops/atomic.h
@@ -50,6 +54,7 @@ tools/include/linux/log2.h
 tools/include/linux/poison.h
 tools/include/linux/rbtree.h
 tools/include/linux/rbtree_augmented.h
+tools/include/linux/string.h
 tools/include/linux/types.h
 tools/include/linux/err.h
 include/asm-generic/bitops/arch_hweight.h

+ 30 - 14
tools/perf/Makefile.perf

@@ -145,9 +145,10 @@ BISON   = bison
 STRIP   = strip
 AWK     = awk
 
-LIB_DIR          = $(srctree)/tools/lib/api/
+LIB_DIR		= $(srctree)/tools/lib/api/
 TRACE_EVENT_DIR = $(srctree)/tools/lib/traceevent/
-BPF_DIR = $(srctree)/tools/lib/bpf/
+BPF_DIR		= $(srctree)/tools/lib/bpf/
+SUBCMD_DIR	= $(srctree)/tools/lib/subcmd/
 
 # include config/Makefile by default and rule out
 # non-config cases
@@ -184,15 +185,17 @@ strip-libs = $(filter-out -l%,$(1))
 ifneq ($(OUTPUT),)
   TE_PATH=$(OUTPUT)
   BPF_PATH=$(OUTPUT)
+  SUBCMD_PATH=$(OUTPUT)
 ifneq ($(subdir),)
-  LIB_PATH=$(OUTPUT)/../lib/api/
+  API_PATH=$(OUTPUT)/../lib/api/
 else
-  LIB_PATH=$(OUTPUT)
+  API_PATH=$(OUTPUT)
 endif
 else
   TE_PATH=$(TRACE_EVENT_DIR)
-  LIB_PATH=$(LIB_DIR)
+  API_PATH=$(LIB_DIR)
   BPF_PATH=$(BPF_DIR)
+  SUBCMD_PATH=$(SUBCMD_DIR)
 endif
 
 LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
@@ -201,11 +204,13 @@ export LIBTRACEEVENT
 LIBTRACEEVENT_DYNAMIC_LIST = $(TE_PATH)libtraceevent-dynamic-list
 LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS = -Xlinker --dynamic-list=$(LIBTRACEEVENT_DYNAMIC_LIST)
 
-LIBAPI = $(LIB_PATH)libapi.a
+LIBAPI = $(API_PATH)libapi.a
 export LIBAPI
 
 LIBBPF = $(BPF_PATH)libbpf.a
 
+LIBSUBCMD = $(SUBCMD_PATH)libsubcmd.a
+
 # python extension build directories
 PYTHON_EXTBUILD     := $(OUTPUT)python_ext_build/
 PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/
@@ -257,7 +262,7 @@ export PERL_PATH
 
 LIB_FILE=$(OUTPUT)libperf.a
 
-PERFLIBS = $(LIB_FILE) $(LIBAPI) $(LIBTRACEEVENT)
+PERFLIBS = $(LIB_FILE) $(LIBAPI) $(LIBTRACEEVENT) $(LIBSUBCMD)
 ifndef NO_LIBBPF
   PERFLIBS += $(LIBBPF)
 endif
@@ -420,7 +425,7 @@ $(LIBTRACEEVENT)-clean:
 	$(call QUIET_CLEAN, libtraceevent)
 	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null
 
-install-traceevent-plugins: $(LIBTRACEEVENT)
+install-traceevent-plugins: libtraceevent_plugins
 	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) install_plugins
 
 $(LIBAPI): fixdep FORCE
@@ -431,12 +436,19 @@ $(LIBAPI)-clean:
 	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
 
 $(LIBBPF): fixdep FORCE
-	$(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a
+	$(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(realpath $(OUTPUT)FEATURE-DUMP)
 
 $(LIBBPF)-clean:
 	$(call QUIET_CLEAN, libbpf)
 	$(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) clean >/dev/null
 
+$(LIBSUBCMD): fixdep FORCE
+	$(Q)$(MAKE) -C $(SUBCMD_DIR) O=$(OUTPUT) $(OUTPUT)libsubcmd.a
+
+$(LIBSUBCMD)-clean:
+	$(call QUIET_CLEAN, libsubcmd)
+	$(Q)$(MAKE) -C $(SUBCMD_DIR) O=$(OUTPUT) clean
+
 help:
 	@echo 'Perf make targets:'
 	@echo '  doc		- make *all* documentation (see below)'
@@ -476,7 +488,7 @@ INSTALL_DOC_TARGETS += quick-install-doc quick-install-man quick-install-html
 $(DOC_TARGETS):
 	$(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) $(@:doc=all)
 
-TAG_FOLDERS= . ../lib/traceevent ../lib/api ../lib/symbol ../include ../lib/bpf
+TAG_FOLDERS= . ../lib ../include
 TAG_FILES= ../../include/uapi/linux/perf_event.h
 
 TAGS:
@@ -555,6 +567,9 @@ endif
 	$(call QUIET_INSTALL, perf_completion-script) \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'; \
 		$(INSTALL) perf-completion.sh '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf'
+	$(call QUIET_INSTALL, perf-tip) \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(tip_instdir_SQ)'; \
+		$(INSTALL) Documentation/tips.txt -t '$(DESTDIR_SQ)$(tip_instdir_SQ)'
 
 install-tests: all install-gtk
 	$(call QUIET_INSTALL, tests) \
@@ -582,15 +597,16 @@ $(INSTALL_DOC_TARGETS):
 #
 config-clean:
 	$(call QUIET_CLEAN, config)
-	$(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null
+	$(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null
 
-clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean config-clean
+clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean
 	$(call QUIET_CLEAN, core-objs)  $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS)
-	$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
+	$(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
 	$(Q)$(RM) $(OUTPUT).config-detected
 	$(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32
 	$(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
-		$(OUTPUT)util/intel-pt-decoder/inat-tables.c
+		$(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \
+		$(OUTPUT)tests/llvm-src-{base,kbuild,prologue}.c
 	$(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
 	$(python-clean)
 

+ 4 - 4
tools/perf/arch/x86/include/arch-tests.h

@@ -2,10 +2,10 @@
 #define ARCH_TESTS_H
 
 /* Tests */
-int test__rdpmc(void);
-int test__perf_time_to_tsc(void);
-int test__insn_x86(void);
-int test__intel_cqm_count_nmi_context(void);
+int test__rdpmc(int subtest);
+int test__perf_time_to_tsc(int subtest);
+int test__insn_x86(int subtest);
+int test__intel_cqm_count_nmi_context(int subtest);
 
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
 struct thread;

+ 1 - 1
tools/perf/arch/x86/tests/insn-x86.c

@@ -171,7 +171,7 @@ static int test_data_set(struct test_data *dat_set, int x86_64)
  * verbose (-v) option to see all the instructions and whether or not they
  * decoded successfully.
  */
-int test__insn_x86(void)
+int test__insn_x86(int subtest __maybe_unused)
 {
 	int ret = 0;
 

+ 2 - 2
tools/perf/arch/x86/tests/intel-cqm.c

@@ -33,7 +33,7 @@ static pid_t spawn(void)
  * the last read counter value to avoid triggering a WARN_ON_ONCE() in
  * smp_call_function_many() caused by sending IPIs from NMI context.
  */
-int test__intel_cqm_count_nmi_context(void)
+int test__intel_cqm_count_nmi_context(int subtest __maybe_unused)
 {
 	struct perf_evlist *evlist = NULL;
 	struct perf_evsel *evsel = NULL;
@@ -54,7 +54,7 @@ int test__intel_cqm_count_nmi_context(void)
 
 	ret = parse_events(evlist, "intel_cqm/llc_occupancy/", NULL);
 	if (ret) {
-		pr_debug("parse_events failed\n");
+		pr_debug("parse_events failed, is \"intel_cqm/llc_occupancy/\" available?\n");
 		err = TEST_SKIP;
 		goto out;
 	}

+ 1 - 2
tools/perf/arch/x86/tests/perf-time-to-tsc.c

@@ -35,13 +35,12 @@
  * %0 is returned, otherwise %-1 is returned.  If TSC conversion is not
  * supported then the test passes but " (not supported)" is printed.
  */
-int test__perf_time_to_tsc(void)
+int test__perf_time_to_tsc(int subtest __maybe_unused)
 {
 	struct record_opts opts = {
 		.mmap_pages	     = UINT_MAX,
 		.user_freq	     = UINT_MAX,
 		.user_interval	     = ULLONG_MAX,
-		.freq		     = 4000,
 		.target		     = {
 			.uses_mmap   = true,
 		},

+ 1 - 1
tools/perf/arch/x86/tests/rdpmc.c

@@ -149,7 +149,7 @@ out_close:
 	return 0;
 }
 
-int test__rdpmc(void)
+int test__rdpmc(int subtest __maybe_unused)
 {
 	int status = 0;
 	int wret = 0;

+ 1 - 0
tools/perf/arch/x86/util/Build

@@ -5,6 +5,7 @@ libperf-y += kvm-stat.o
 libperf-y += perf_regs.o
 
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
+libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
 
 libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
 libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o

+ 2 - 2
tools/perf/arch/x86/util/intel-bts.c

@@ -327,7 +327,7 @@ static int intel_bts_snapshot_start(struct auxtrace_record *itr)
 
 	evlist__for_each(btsr->evlist, evsel) {
 		if (evsel->attr.type == btsr->intel_bts_pmu->type)
-			return perf_evlist__disable_event(btsr->evlist, evsel);
+			return perf_evsel__disable(evsel);
 	}
 	return -EINVAL;
 }
@@ -340,7 +340,7 @@ static int intel_bts_snapshot_finish(struct auxtrace_record *itr)
 
 	evlist__for_each(btsr->evlist, evsel) {
 		if (evsel->attr.type == btsr->intel_bts_pmu->type)
-			return perf_evlist__enable_event(btsr->evlist, evsel);
+			return perf_evsel__enable(evsel);
 	}
 	return -EINVAL;
 }

+ 3 - 3
tools/perf/arch/x86/util/intel-pt.c

@@ -26,7 +26,7 @@
 #include "../../util/evlist.h"
 #include "../../util/evsel.h"
 #include "../../util/cpumap.h"
-#include "../../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../../util/parse-events.h"
 #include "../../util/pmu.h"
 #include "../../util/debug.h"
@@ -725,7 +725,7 @@ static int intel_pt_snapshot_start(struct auxtrace_record *itr)
 
 	evlist__for_each(ptr->evlist, evsel) {
 		if (evsel->attr.type == ptr->intel_pt_pmu->type)
-			return perf_evlist__disable_event(ptr->evlist, evsel);
+			return perf_evsel__disable(evsel);
 	}
 	return -EINVAL;
 }
@@ -738,7 +738,7 @@ static int intel_pt_snapshot_finish(struct auxtrace_record *itr)
 
 	evlist__for_each(ptr->evlist, evsel) {
 		if (evsel->attr.type == ptr->intel_pt_pmu->type)
-			return perf_evlist__enable_event(ptr->evlist, evsel);
+			return perf_evsel__enable(evsel);
 	}
 	return -EINVAL;
 }

+ 1 - 1
tools/perf/bench/futex-hash.c

@@ -11,7 +11,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"

+ 1 - 1
tools/perf/bench/futex-lock-pi.c

@@ -5,7 +5,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"

+ 1 - 1
tools/perf/bench/futex-requeue.c

@@ -11,7 +11,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"

+ 1 - 1
tools/perf/bench/futex-wake-parallel.c

@@ -10,7 +10,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"

+ 1 - 1
tools/perf/bench/futex-wake.c

@@ -11,7 +11,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"

+ 1 - 1
tools/perf/bench/mem-functions.c

@@ -8,7 +8,7 @@
 
 #include "../perf.h"
 #include "../util/util.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "../util/cloexec.h"
 #include "bench.h"

+ 1 - 1
tools/perf/bench/numa.c

@@ -7,7 +7,7 @@
 #include "../perf.h"
 #include "../builtin.h"
 #include "../util/util.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/cloexec.h"
 
 #include "bench.h"

+ 1 - 1
tools/perf/bench/sched-messaging.c

@@ -11,7 +11,7 @@
 
 #include "../perf.h"
 #include "../util/util.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../builtin.h"
 #include "bench.h"
 

+ 1 - 1
tools/perf/bench/sched-pipe.c

@@ -10,7 +10,7 @@
  */
 #include "../perf.h"
 #include "../util/util.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../builtin.h"
 #include "bench.h"
 

+ 23 - 21
tools/perf/builtin-annotate.c

@@ -21,7 +21,7 @@
 #include "util/evsel.h"
 #include "util/annotate.h"
 #include "util/event.h"
-#include "util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "util/parse-events.h"
 #include "util/thread.h"
 #include "util/sort.h"
@@ -47,7 +47,7 @@ struct perf_annotate {
 };
 
 static int perf_evsel__add_sample(struct perf_evsel *evsel,
-				  struct perf_sample *sample __maybe_unused,
+				  struct perf_sample *sample,
 				  struct addr_location *al,
 				  struct perf_annotate *ann)
 {
@@ -72,7 +72,10 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel,
 		return 0;
 	}
 
-	he = __hists__add_entry(hists, al, NULL, NULL, NULL, 1, 1, 0, true);
+	sample->period = 1;
+	sample->weight = 1;
+
+	he = __hists__add_entry(hists, al, NULL, NULL, NULL, sample, true);
 	if (he == NULL)
 		return -ENOMEM;
 
@@ -343,18 +346,19 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
 		return ret;
 
 	argc = parse_options(argc, argv, options, annotate_usage, 0);
+	if (argc) {
+		/*
+		 * Special case: if there's an argument left then assume that
+		 * it's a symbol filter:
+		 */
+		if (argc > 1)
+			usage_with_options(annotate_usage, options);
 
-	if (annotate.use_stdio)
-		use_browser = 0;
-	else if (annotate.use_tui)
-		use_browser = 1;
-	else if (annotate.use_gtk)
-		use_browser = 2;
+		annotate.sym_hist_filter = argv[0];
+	}
 
 	file.path  = input_name;
 
-	setup_browser(true);
-
 	annotate.session = perf_session__new(&file, false, &annotate.tool);
 	if (annotate.session == NULL)
 		return -1;
@@ -366,19 +370,17 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (ret < 0)
 		goto out_delete;
 
-	if (setup_sorting() < 0)
+	if (setup_sorting(NULL) < 0)
 		usage_with_options(annotate_usage, options);
 
-	if (argc) {
-		/*
-		 * Special case: if there's an argument left then assume that
-		 * it's a symbol filter:
-		 */
-		if (argc > 1)
-			usage_with_options(annotate_usage, options);
+	if (annotate.use_stdio)
+		use_browser = 0;
+	else if (annotate.use_tui)
+		use_browser = 1;
+	else if (annotate.use_gtk)
+		use_browser = 2;
 
-		annotate.sym_hist_filter = argv[0];
-	}
+	setup_browser(true);
 
 	ret = __cmd_annotate(&annotate);
 

+ 1 - 1
tools/perf/builtin-bench.c

@@ -16,7 +16,7 @@
  */
 #include "perf.h"
 #include "util/util.h"
-#include "util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "builtin.h"
 #include "bench/bench.h"
 

+ 1 - 1
tools/perf/builtin-buildid-cache.c

@@ -16,7 +16,7 @@
 #include "util/cache.h"
 #include "util/debug.h"
 #include "util/header.h"
-#include "util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "util/strlist.h"
 #include "util/build-id.h"
 #include "util/session.h"

+ 1 - 1
tools/perf/builtin-buildid-list.c

@@ -12,7 +12,7 @@
 #include "util/build-id.h"
 #include "util/cache.h"
 #include "util/debug.h"
-#include "util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/data.h"

+ 66 - 0
tools/perf/builtin-config.c

@@ -0,0 +1,66 @@
+/*
+ * builtin-config.c
+ *
+ * Copyright (C) 2015, Taeung Song <treeze.taeung@gmail.com>
+ *
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/cache.h"
+#include <subcmd/parse-options.h>
+#include "util/util.h"
+#include "util/debug.h"
+
+static const char * const config_usage[] = {
+	"perf config [options]",
+	NULL
+};
+
+enum actions {
+	ACTION_LIST = 1
+} actions;
+
+static struct option config_options[] = {
+	OPT_SET_UINT('l', "list", &actions,
+		     "show current config variables", ACTION_LIST),
+	OPT_END()
+};
+
+static int show_config(const char *key, const char *value,
+		       void *cb __maybe_unused)
+{
+	if (value)
+		printf("%s=%s\n", key, value);
+	else
+		printf("%s\n", key);
+
+	return 0;
+}
+
+int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+	int ret = 0;
+
+	argc = parse_options(argc, argv, config_options, config_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+
+	switch (actions) {
+	case ACTION_LIST:
+		if (argc) {
+			pr_err("Error: takes no arguments\n");
+			parse_options_usage(config_usage, config_options, "l", 1);
+		} else {
+			ret = perf_config(show_config, NULL);
+			if (ret < 0)
+				pr_err("Nothing configured, "
+				       "please check your ~/.perfconfig file\n");
+		}
+		break;
+	default:
+		usage_with_options(config_usage, config_options);
+	}
+
+	return ret;
+}
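
Given the show_config() callback above, which prints one "key=value" line
per variable, listing the example ~/.perfconfig from the documentation
would look roughly like this (output abridged and illustrative):

   $ perf config -l
   colors.top=red, default
   tui.report=on
   buildid.dir=~/.debug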

+ 1 - 1
tools/perf/builtin-data.c

@@ -2,7 +2,7 @@
 #include "builtin.h"
 #include "perf.h"
 #include "debug.h"
-#include "parse-options.h"
+#include <subcmd/parse-options.h>
 #include "data-convert-bt.h"
 
 typedef int (*data_cmd_fn_t)(int argc, const char **argv, const char *prefix);

+ 7 - 8
tools/perf/builtin-diff.c

@@ -311,11 +311,11 @@ static int formula_fprintf(struct hist_entry *he, struct hist_entry *pair,
 }
 
 static int hists__add_entry(struct hists *hists,
-			    struct addr_location *al, u64 period,
-			    u64 weight, u64 transaction)
+			    struct addr_location *al,
+			    struct perf_sample *sample)
 {
-	if (__hists__add_entry(hists, al, NULL, NULL, NULL, period, weight,
-			       transaction, true) != NULL)
+	if (__hists__add_entry(hists, al, NULL, NULL, NULL,
+			       sample, true) != NULL)
 		return 0;
 	return -ENOMEM;
 }
@@ -336,8 +336,7 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
 		return -1;
 	}
 
-	if (hists__add_entry(hists, &al, sample->period,
-			     sample->weight, sample->transaction)) {
+	if (hists__add_entry(hists, &al, sample)) {
 		pr_warning("problem incrementing symbol period, skipping event\n");
 		goto out_put;
 	}
@@ -1208,7 +1207,7 @@ static int ui_init(void)
 		BUG_ON(1);
 	}
 
-	list_add(&fmt->sort_list, &perf_hpp__sort_list);
+	perf_hpp__register_sort_field(fmt);
 	return 0;
 }
 
@@ -1280,7 +1279,7 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	sort__mode = SORT_MODE__DIFF;
 
-	if (setup_sorting() < 0)
+	if (setup_sorting(NULL) < 0)
 		usage_with_options(diff_usage, options);
 
 	setup_pager();

+ 11 - 2
tools/perf/builtin-evlist.c

@@ -12,7 +12,7 @@
 #include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/parse-events.h"
-#include "util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "util/session.h"
 #include "util/data.h"
 #include "util/debug.h"
@@ -26,14 +26,22 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details
 		.mode = PERF_DATA_MODE_READ,
 		.force = details->force,
 	};
+	bool has_tracepoint = false;
 
 	session = perf_session__new(&file, 0, NULL);
 	if (session == NULL)
 		return -1;
 
-	evlist__for_each(session->evlist, pos)
+	evlist__for_each(session->evlist, pos) {
 		perf_evsel__fprintf(pos, details, stdout);
 
+		if (pos->attr.type == PERF_TYPE_TRACEPOINT)
+			has_tracepoint = true;
+	}
+
+	if (has_tracepoint && !details->trace_fields)
+		printf("# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events\n");
+
 	perf_session__delete(session);
 	return 0;
 }
@@ -49,6 +57,7 @@ int cmd_evlist(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN('g', "group", &details.event_group,
 		    "Show event group information"),
 	OPT_BOOLEAN('f', "force", &details.force, "don't complain, do it"),
+	OPT_BOOLEAN(0, "trace-fields", &details.trace_fields, "Show tracepoint fields"),
 	OPT_END()
 	};
 	const char * const evlist_usage[] = {

+ 5 - 5
tools/perf/builtin-help.c

@@ -6,11 +6,11 @@
 #include "perf.h"
 #include "util/cache.h"
 #include "builtin.h"
-#include "util/exec_cmd.h"
+#include <subcmd/exec-cmd.h>
 #include "common-cmds.h"
-#include "util/parse-options.h"
-#include "util/run-command.h"
-#include "util/help.h"
+#include <subcmd/parse-options.h>
+#include <subcmd/run-command.h>
+#include <subcmd/help.h>
 #include "util/debug.h"
 
 static struct man_viewer_list {
@@ -407,7 +407,7 @@ static int get_html_page_path(struct strbuf *page_path, const char *page)
 #ifndef open_html
 static void open_html(const char *path)
 {
-	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
+	execl_cmd("web--browse", "-c", "help.browser", path, NULL);
 }
 #endif
 

+ 1 - 1
tools/perf/builtin-inject.c

@@ -18,7 +18,7 @@
 #include "util/data.h"
 #include "util/auxtrace.h"
 
-#include "util/parse-options.h"
+#include <subcmd/parse-options.h>
 
 #include <linux/list.h>
 

+ 1 - 1
tools/perf/builtin-kmem.c

@@ -12,7 +12,7 @@
 #include "util/tool.h"
 #include "util/callchain.h"
 
-#include "util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "util/trace-event.h"
 #include "util/data.h"
 #include "util/cpumap.h"

Some files were not shown because too many files changed in this diff