
Merge branch 'bpf-perf'

Kaixu Xia says:

====================
bpf: Introduce the new ability of eBPF programs to access hardware PMU counters

This patchset is based on the net-next tree:
 git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
commit 9dc20a649609c95ce7c5ac4282656ba627b67d49.

Previous patch set (v6) URL:
https://lkml.org/lkml/2015/8/4/188

changes in V7:
 - rebase the whole patch set onto the net-next tree (9dc20a64);
 - split out the core perf APIs into Patch 1/5;
 - change the return value of the function perf_event_attrs()
   from struct perf_event_attr * to const struct perf_event_attr *
   in Patch 1/5;
 - rename the function perf_event_read_internal() to
   perf_event_read_local() and rewrite it in Patch 1/5;
 - rename the function check_func_limit() to
   check_map_func_compatibility() and remove the unnecessary
   pointer-to-pointer argument in Patch 4/5;

changes in V6:
 - make the Patch 1/4 commit message more meaningful and readable;
 - remove the unnecessary comment in Patch 2/4 and make it clean;
 - declare the function perf_event_release_kernel() in
   include/linux/perf_event.h to fix the build error when
   CONFIG_PERF_EVENTS isn't configured in Patch 2/4;
 - add function perf_event_attrs() to get the struct perf_event_attr
   in Patch 2/4.
 - move the related code from kernel/trace/bpf_trace.c to
   kernel/events/core.c and add function perf_event_read_internal()
   to avoid poking inside the event outside of perf code in Patch 3/4;
 - generalize the func & map match-pair check with an array in Patch 3/4;

changes in V5:
 - move struct fd_array_map_ops* fd_ops to bpf_map;
 - move the array perf event refcnt decrement into map_free;
 - fix the NULL pointer handling in perf_event_get();
 - move bpf_perf_event_read() to kernel/trace/bpf_trace.c;
 - get rid of the remaining struct bpf_prog;
 - remove the unnecessary cast on void *;

changes in V4:
 - make the bpf_prog_array_map more generic;
 - fix the bug of event refcnt leak;
 - use a more useful errno in bpf_perf_event_read();

changes in V3:
 - collapse V2 patches 1-3 into one;
 - drop the function map->ops->map_traverse_elem() and release
   the struct perf_event in map_free;
 - only allow bpf_perf_event_read() to be called from eBPF programs;
 - update the perf_event_array_map elem via xchg();
 - pass index directly to bpf_perf_event_read() instead of
   MAP_KEY;

changes in V2:
 - put atomic_long_inc_not_zero() between fdget() and fdput();
 - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE;
 - only read the event counter on the current CPU or for the
   current process;
 - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the
   pointer to the struct perf_event;
 - according to the perf event map fd and key, the function
   bpf_perf_event_read() can read the hardware PMU counter value
   (see the program-side sketch after this list);
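
For reference, here is a minimal program-side sketch of that flow,
closely modelled on the tracex6 sample added in Patch 5/5 (the map and
function names are illustrative; the helper declarations come from
samples/bpf/bpf_helpers.h):

  struct bpf_map_def SEC("maps") counters = {
  	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
  	.key_size = sizeof(int),
  	.value_size = sizeof(u32),	/* a perf event fd is stored per slot */
  	.max_entries = 32,
  };

  SEC("kprobe/sys_write")
  int read_cycles(struct pt_regs *ctx)
  {
  	u32 key = bpf_get_smp_processor_id();
  	char fmt[] = "CPU-%d   %llu\n";
  	/* read the PMU counter stored at 'key' (this CPU's slot) */
  	u64 count = bpf_perf_event_read(&counters, key);

  	bpf_trace_printk(fmt, sizeof(fmt), key, count);
  	return 0;
  }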

Patch 5/5 is a simple example that shows how to use this new eBPF
program ability. The PMU counter data can be found in
/sys/kernel/debug/tracing/trace (or trace_pipe); the output below
shows the cycles PMU value sampled whenever 'kprobe/sys_write' hits.

  $ cat /sys/kernel/debug/tracing/trace_pipe
  $ ./tracex6
       ...
       syslog-ng-548   [000] d..1    76.905673: : CPU-0   681765271
       syslog-ng-548   [000] d..1    76.905690: : CPU-0   681787855
       syslog-ng-548   [000] d..1    76.905707: : CPU-0   681810504
       syslog-ng-548   [000] d..1    76.905725: : CPU-0   681834771
       syslog-ng-548   [000] d..1    76.905745: : CPU-0   681859519
       syslog-ng-548   [000] d..1    76.905766: : CPU-0   681890419
       syslog-ng-548   [000] d..1    76.905783: : CPU-0   681914045
       syslog-ng-548   [000] d..1    76.905800: : CPU-0   681935950
       syslog-ng-548   [000] d..1    76.905816: : CPU-0   681958299
              ls-690   [005] d..1    82.241308: : CPU-5   3138451
              sh-691   [004] d..1    82.244570: : CPU-4   7324988
           <...>-699   [007] d..1    99.961387: : CPU-7   3194027
           <...>-695   [003] d..1    99.961474: : CPU-3   288901
           <...>-695   [003] d..1    99.961541: : CPU-3   383145
           <...>-695   [003] d..1    99.961591: : CPU-3   450365
           <...>-695   [003] d..1    99.961639: : CPU-3   515751
           <...>-695   [003] d..1    99.961686: : CPU-3   579047
       ...

The details of the patches are as follows:

Patch 1/5 adds the core perf APIs perf_event_attrs(),
perf_event_get() and perf_event_read_local() that are needed to
access event counters from eBPF programs;
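
As a rough sketch (condensed from the arraymap.c and bpf_trace.c hunks
further down, not literal kernel code), a kernel-side user of these
APIs is expected to do something like:

  	struct perf_event *event;
  	const struct perf_event_attr *attr;
  	u64 value;

  	event = perf_event_get(ufd);	/* ufd: a perf event fd; takes a reference */
  	if (IS_ERR(event))
  		return PTR_ERR(event);

  	attr = perf_event_attrs(event);
  	if (!IS_ERR(attr) && (attr->type == PERF_TYPE_RAW ||
  			      attr->type == PERF_TYPE_HARDWARE))
  		value = perf_event_read_local(event);	/* NMI-safe; current CPU/task only */

  	perf_event_release_kernel(event);	/* drop the reference again */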

Patch 2/5 rewrites part of the bpf_prog_array map code and makes it
more generic;
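
In other words, after this rework an fd-backed array map type only has
to supply the two new map_fd_get_ptr()/map_fd_put_ptr() callbacks and
can reuse the generic fd_array_map_*() helpers. A sketch of the
pattern (the example_* names are placeholders; the perf_event_array_ops
added in Patch 3/5 follow exactly this shape):

  	static const struct bpf_map_ops example_fd_array_ops = {
  		.map_alloc = fd_array_map_alloc,
  		.map_free = example_fd_array_map_free,
  		.map_get_next_key = array_map_get_next_key,
  		.map_lookup_elem = fd_array_map_lookup_elem,
  		.map_update_elem = fd_array_map_update_elem,	/* calls ->map_fd_get_ptr() */
  		.map_delete_elem = fd_array_map_delete_elem,	/* calls ->map_fd_put_ptr() */
  		/* the only type-specific part: fd <-> refcounted kernel object */
  		.map_fd_get_ptr = example_fd_array_get_ptr,
  		.map_fd_put_ptr = example_fd_array_put_ptr,
  	};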

Patch 3/5 introduces a new bpf map type. This map only stores
pointers to struct perf_event;

Patch 4/5 implements the function bpf_perf_event_read() that reads
the selected hardware PMU counter;

Patch 5/5 gives a simple example.
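
On the user-space side the example boils down to the following flow (a
trimmed sketch of tracex6_user.c with error handling omitted;
perf_event_open() and bpf_update_elem() are the wrappers shipped with
the samples):

  	struct perf_event_attr attr = {
  		.type = PERF_TYPE_HARDWARE,
  		.config = PERF_COUNT_HW_CPU_CYCLES,	/* 0: cycles */
  		.sample_period = SAMPLE_PERIOD,
  	};

  	for (i = 0; i < nr_cpus; i++) {
  		/* one cycles counter per CPU, counting all tasks on that CPU */
  		pmu_fd[i] = perf_event_open(&attr, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
  		/* store the fd; the kernel turns it into a struct perf_event pointer */
  		bpf_update_elem(map_fd[0], &i, &pmu_fd[i], BPF_ANY);
  		ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
  	}
  	/* run a workload, then read the per-CPU values via trace_pipe */
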
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
commit d74a790d52 (David S. Miller)

+ 3 - 3
arch/x86/net/bpf_jit_comp.c

@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
  *     goto out;
  *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
  *     goto out;
- *   prog = array->prog[index];
+ *   prog = array->ptrs[index];
  *   if (prog == NULL)
  *     goto out;
  *   goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
 	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */

-	/* prog = array->prog[index]; */
+	/* prog = array->ptrs[index]; */
 	EMIT4_off32(0x48, 0x8D, 0x84, 0xD6,       /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
-		    offsetof(struct bpf_array, prog));
+		    offsetof(struct bpf_array, ptrs));
 	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */

 	/* if (prog == NULL)

+ 8 - 2
include/linux/bpf.h

@@ -10,6 +10,7 @@
 #include <uapi/linux/bpf.h>
 #include <linux/workqueue.h>
 #include <linux/file.h>
+#include <linux/perf_event.h>

 struct bpf_map;

@@ -24,6 +25,10 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
+
+	/* funcs called by prog_array and perf_event_array map */
+	void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
+	void (*map_fd_put_ptr) (void *ptr);
 };

 struct bpf_map {
@@ -142,13 +147,13 @@ struct bpf_array {
 	bool owner_jited;
 	union {
 		char value[0] __aligned(8);
-		struct bpf_prog *prog[0] __aligned(8);
+		void *ptrs[0] __aligned(8);
 	};
 };
 #define MAX_TAIL_CALL_CNT 32

 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-void bpf_prog_array_map_clear(struct bpf_map *map);
+void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);

@@ -185,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;

+extern const struct bpf_func_proto bpf_perf_event_read_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;

+ 10 - 0
include/linux/perf_event.h

@@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
+extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -659,6 +661,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
 				int src_cpu, int dst_cpu);
+extern u64 perf_event_read_local(struct perf_event *event);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);

@@ -979,6 +982,12 @@ static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
+static inline struct perf_event *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
+static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+	return ERR_PTR(-EINVAL);
+}
+static inline u64 perf_event_read_local(struct perf_event *event)	{ return -EINVAL; }
 static inline void perf_event_print_debug(void)				{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
@@ -1011,6 +1020,7 @@ static inline void perf_event_enable(struct perf_event *event)		{ }
 static inline void perf_event_disable(struct perf_event *event)		{ }
 static inline int __perf_event_disable(void *info)			{ return -1; }
 static inline void perf_event_task_tick(void)				{ }
+static inline int perf_event_release_kernel(struct perf_event *event)	{ return 0; }
 #endif

 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)

+ 2 - 0
include/uapi/linux/bpf.h

@@ -114,6 +114,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH,
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
+	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };

 enum bpf_prog_type {
@@ -270,6 +271,7 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_get_tunnel_key,
 	BPF_FUNC_skb_set_tunnel_key,
+	BPF_FUNC_perf_event_read,	/* u64 bpf_perf_event_read(&map, index) */
 	__BPF_FUNC_MAX_ID,
 };


+ 106 - 31
kernel/bpf/arraymap.c

@@ -150,15 +150,15 @@ static int __init register_array_map(void)
 }
 late_initcall(register_array_map);

-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
 {
-	/* only bpf_prog file descriptors can be stored in prog_array map */
+	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
 		return ERR_PTR(-EINVAL);
 	return array_map_alloc(attr);
 }

-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)

 	/* make sure it's empty */
 	for (i = 0; i < array->map.max_entries; i++)
-		BUG_ON(array->prog[i] != NULL);
+		BUG_ON(array->ptrs[i] != NULL);
 	kvfree(array);
 }

-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	return NULL;
 }

 /* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
-				      void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+				    void *value, u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct bpf_prog *prog, *old_prog;
+	void *new_ptr, *old_ptr;
 	u32 index = *(u32 *)key, ufd;

 	if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
 		return -E2BIG;

 	ufd = *(u32 *)value;
-	prog = bpf_prog_get(ufd);
-	if (IS_ERR(prog))
-		return PTR_ERR(prog);
-
-	if (!bpf_prog_array_compatible(array, prog)) {
-		bpf_prog_put(prog);
-		return -EINVAL;
-	}
+	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+	if (IS_ERR(new_ptr))
+		return PTR_ERR(new_ptr);

-	old_prog = xchg(array->prog + index, prog);
-	if (old_prog)
-		bpf_prog_put_rcu(old_prog);
+	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (old_ptr)
+		map->ops->map_fd_put_ptr(old_ptr);

 	return 0;
 }

-static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct bpf_prog *old_prog;
+	void *old_ptr;
 	u32 index = *(u32 *)key;

 	if (index >= array->map.max_entries)
 		return -E2BIG;

-	old_prog = xchg(array->prog + index, NULL);
-	if (old_prog) {
-		bpf_prog_put_rcu(old_prog);
+	old_ptr = xchg(array->ptrs + index, NULL);
+	if (old_ptr) {
+		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
 	} else {
 		return -ENOENT;
 	}
 }

+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog = bpf_prog_get(fd);
+	if (IS_ERR(prog))
+		return prog;
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return ERR_PTR(-EINVAL);
+	}
+	return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+	struct bpf_prog *prog = ptr;
+
+	bpf_prog_put_rcu(prog);
+}
+
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_prog_array_map_clear(struct bpf_map *map)
+void bpf_fd_array_map_clear(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;

 	for (i = 0; i < array->map.max_entries; i++)
-		prog_array_map_delete_elem(map, &i);
+		fd_array_map_delete_elem(map, &i);
 }

 static const struct bpf_map_ops prog_array_ops = {
-	.map_alloc = prog_array_map_alloc,
-	.map_free = prog_array_map_free,
+	.map_alloc = fd_array_map_alloc,
+	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
-	.map_lookup_elem = prog_array_map_lookup_elem,
-	.map_update_elem = prog_array_map_update_elem,
-	.map_delete_elem = prog_array_map_delete_elem,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_update_elem = fd_array_map_update_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = prog_fd_array_get_ptr,
+	.map_fd_put_ptr = prog_fd_array_put_ptr,
 };

 static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void)
 	return 0;
 }
 late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+	bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+	struct perf_event *event;
+	const struct perf_event_attr *attr;
+
+	event = perf_event_get(fd);
+	if (IS_ERR(event))
+		return event;
+
+	attr = perf_event_attrs(event);
+	if (IS_ERR(attr))
+		return (void *)attr;
+
+	if (attr->type != PERF_TYPE_RAW &&
+	    attr->type != PERF_TYPE_HARDWARE) {
+		perf_event_release_kernel(event);
+		return ERR_PTR(-EINVAL);
+	}
+	return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+	struct perf_event *event = ptr;
+
+	perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+	.map_alloc = fd_array_map_alloc,
+	.map_free = perf_event_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_update_elem = fd_array_map_update_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
+	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+	.ops = &perf_event_array_ops,
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+	bpf_register_map_type(&perf_event_array_type);
+	return 0;
+}
+late_initcall(register_perf_event_array_map);

+ 1 - 1
kernel/bpf/core.c

@@ -450,7 +450,7 @@ select_insn:

 		tail_call_cnt++;

-		prog = READ_ONCE(array->prog[index]);
+		prog = READ_ONCE(array->ptrs[index]);
 		if (unlikely(!prog))
 			goto out;


+ 1 - 1
kernel/bpf/syscall.c

@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 		/* prog_array stores refcnt-ed bpf_prog pointers
 		 * release them all when user space closes prog_array_fd
 		 */
-		bpf_prog_array_map_clear(map);
+		bpf_fd_array_map_clear(map);

 	bpf_map_put(map);
 	return 0;

+ 33 - 15
kernel/bpf/verifier.c

@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
 	[CONST_IMM]		= "imm",
 };

+static const struct {
+	int map_type;
+	int func_id;
+} func_limit[] = {
+	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
 static void print_verifier_state(struct verifier_env *env)
 {
 	enum bpf_reg_type t;
@@ -837,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 	return err;
 }

+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+	bool bool_map, bool_func;
+	int i;
+
+	if (!map)
+		return 0;
+
+	for (i = 0; i <= ARRAY_SIZE(func_limit); i++) {
+		bool_map = (map->map_type == func_limit[i].map_type);
+		bool_func = (func_id == func_limit[i].func_id);
+		/* only when map & func pair match it can continue.
+		 * don't allow any other map type to be passed into
+		 * the special func;
+		 */
+		if (bool_map != bool_func)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -912,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
 		return -EINVAL;
 	}

-	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
-	    func_id != BPF_FUNC_tail_call)
-		/* prog_array map type needs extra care:
-		 * only allow to pass it into bpf_tail_call() for now.
-		 * bpf_map_delete_elem() can be allowed in the future,
-		 * while bpf_map_update_elem() must only be done via syscall
-		 */
-		return -EINVAL;
-
-	if (func_id == BPF_FUNC_tail_call &&
-	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
-		/* don't allow any other map type to be passed into
-		 * bpf_tail_call()
-		 */
-		return -EINVAL;
+	err = check_map_func_compatibility(map, func_id);
+	if (err)
+		return err;

 	return 0;
 }

+ 78 - 0
kernel/events/core.c

@@ -3212,6 +3212,59 @@ static inline u64 perf_event_count(struct perf_event *event)
 	return __perf_event_count(event);
 }

+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ *   - either for the current task, or for this CPU
+ *   - does not have inherit set, for inherited task events
+ *     will not be local and we cannot read them atomically
+ *   - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+	unsigned long flags;
+	u64 val;
+
+	/*
+	 * Disabling interrupts avoids all counter scheduling (context
+	 * switches, timer based rotation and IPIs).
+	 */
+	local_irq_save(flags);
+
+	/* If this is a per-task event, it must be for current */
+	WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+		     event->hw.target != current);
+
+	/* If this is a per-CPU event, it must be for this CPU */
+	WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+		     event->cpu != smp_processor_id());
+
+	/*
+	 * It must not be an event with inherit set, we cannot read
+	 * all child counters from atomic context.
+	 */
+	WARN_ON_ONCE(event->attr.inherit);
+
+	/*
+	 * It must not have a pmu::count method, those are not
+	 * NMI safe.
+	 */
+	WARN_ON_ONCE(event->pmu->count);
+
+	/*
+	 * If the event is currently on this CPU, its either a per-task event,
+	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
+	 * oncpu == -1).
+	 */
+	if (event->oncpu == smp_processor_id())
+		event->pmu->read(event);
+
+	val = local64_read(&event->count);
+	local_irq_restore(flags);
+
+	return val;
+}
+
 static u64 perf_event_read(struct perf_event *event)
 {
 	/*
@@ -8574,6 +8627,31 @@ void perf_event_delayed_put(struct task_struct *task)
 		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
 }

+struct perf_event *perf_event_get(unsigned int fd)
+{
+	int err;
+	struct fd f;
+	struct perf_event *event;
+
+	err = perf_fget_light(fd, &f);
+	if (err)
+		return ERR_PTR(err);
+
+	event = f.file->private_data;
+	atomic_long_inc(&event->refcount);
+	fdput(f);
+
+	return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+	if (!event)
+		return ERR_PTR(-EINVAL);
+
+	return &event->attr;
+}
+
 /*
  * inherit a event from parent task to child task:
  */

+ 31 - 0
kernel/trace/bpf_trace.c

@@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }

+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct perf_event *event;
+
+	if (unlikely(index >= array->map.max_entries))
+		return -E2BIG;
+
+	event = (struct perf_event *)array->ptrs[index];
+	if (!event)
+		return -ENOENT;
+
+	/*
+	 * we don't know if the function is run successfully by the
+	 * return value. It can be judged in other places, such as
+	 * eBPF programs.
+	 */
+	return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+	.func		= bpf_perf_event_read,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -183,6 +212,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return bpf_get_trace_printk_proto();
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_perf_event_read:
+		return &bpf_perf_event_read_proto;
 	default:
 		return NULL;
 	}

+ 4 - 0
samples/bpf/Makefile

@@ -12,6 +12,7 @@ hostprogs-y += tracex2
 hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
+hostprogs-y += tracex6
 hostprogs-y += lathist

 test_verifier-objs := test_verifier.o libbpf.o
@@ -25,6 +26,7 @@ tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
+tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
 lathist-objs := bpf_load.o libbpf.o lathist_user.o

 # Tell kbuild to always build the programs
@@ -37,6 +39,7 @@ always += tracex2_kern.o
 always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
+always += tracex6_kern.o
 always += tcbpf1_kern.o
 always += lathist_kern.o

@@ -51,6 +54,7 @@ HOSTLOADLIBES_tracex2 += -lelf
 HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
+HOSTLOADLIBES_tracex6 += -lelf
 HOSTLOADLIBES_lathist += -lelf

 # point this to your LLVM backend with bpf support

+ 2 - 0
samples/bpf/bpf_helpers.h

@@ -31,6 +31,8 @@ static unsigned long long (*bpf_get_current_uid_gid)(void) =
 	(void *) BPF_FUNC_get_current_uid_gid;
 static int (*bpf_get_current_comm)(void *buf, int buf_size) =
 	(void *) BPF_FUNC_get_current_comm;
+static int (*bpf_perf_event_read)(void *map, int index) =
+	(void *) BPF_FUNC_perf_event_read;

 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions

+ 26 - 0
samples/bpf/tracex6_kern.c

@@ -0,0 +1,26 @@
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = 32,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	u64 count;
+	u32 key = bpf_get_smp_processor_id();
+	char fmt[] = "CPU-%d   %llu\n";
+
+	count = bpf_perf_event_read(&my_map, key);
+	bpf_trace_printk(fmt, sizeof(fmt), key, count);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;

+ 68 - 0
samples/bpf/tracex6_user.c

@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define SAMPLE_PERIOD  0x7fffffffffffffffULL
+
+static void test_bpf_perf_event(void)
+{
+	int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	int *pmu_fd = malloc(nr_cpus * sizeof(int));
+	unsigned long value;
+	int i;
+
+	struct perf_event_attr attr_insn_pmu = {
+		.freq = 0,
+		.sample_period = SAMPLE_PERIOD,
+		.inherit = 0,
+		.type = PERF_TYPE_HARDWARE,
+		.read_format = 0,
+		.sample_type = 0,
+		.config = 0,/* PMU: cycles */
+	};
+
+	for (i = 0; i < nr_cpus; i++) {
+		pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0);
+		if (pmu_fd[i] < 0)
+			printf("event syscall failed\n");
+
+		bpf_update_elem(map_fd[0], &i, &pmu_fd[i], BPF_ANY);
+		ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+	}
+
+	system("ls");
+	system("pwd");
+	system("sleep 2");
+
+	for (i = 0; i < nr_cpus; i++)
+		close(pmu_fd[i]);
+
+	close(map_fd);
+
+	free(pmu_fd);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	test_bpf_perf_event();
+
+	return 0;
+}