9 năm trước cách đây · 4df20483ab
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -297,6 +297,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
 
				 static inline void bpf_prog_put(struct bpf_prog *prog)
			
 
				 {
			
 
				 }
			
 
				+/* Stub variant of bpf_prog_inc(): it never takes a reference and always
				+ * returns ERR_PTR(-EOPNOTSUPP). It sits just before the
				+ * "#endif CONFIG_BPF_SYSCALL" line, so presumably this is the branch
				+ * built when CONFIG_BPF_SYSCALL is disabled - TODO confirm.
				+ */
				+static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
				+{
				+	return ERR_PTR(-EOPNOTSUPP);
				+}
			
 
				 #endif /* CONFIG_BPF_SYSCALL */
			
 
				 
			
 
				 /* verifier prototypes for helper functions called from eBPF programs */
			
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -679,6 +679,10 @@ struct perf_event {
 
				 	u64				(*clock)(void);
			
 
				 	perf_overflow_handler_t		overflow_handler;
			
 
				 	void				*overflow_handler_context;
			
 
				+#ifdef CONFIG_BPF_SYSCALL
			
 
				+	perf_overflow_handler_t		orig_overflow_handler;
			
 
				+	struct bpf_prog			*prog;
			
 
				+#endif
			
 
				 
			
 
				 #ifdef CONFIG_EVENT_TRACING
			
 
				 	struct trace_event_call		*tp_event;
			
@@ -788,6 +792,11 @@ struct perf_output_handle {
 
				 	int				page;
			
 
				 };
			
 
				 
			
 
				+/* Kernel-side context passed to a perf_event BPF program: pointers to
				+ * the register state and to the perf sample data of the overflow.
				+ */
				+struct bpf_perf_event_data_kern {
				+	struct pt_regs *regs;
				+	struct perf_sample_data *data;
				+};
			
 
				+
			
 
				 #ifdef CONFIG_CGROUP_PERF
			
 
				 
			
 
				 /*
			
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -71,6 +71,7 @@ header-y += binfmts.h
 
				 header-y += blkpg.h
			
 
				 header-y += blktrace_api.h
			
 
				 header-y += bpf_common.h
			
 
				+header-y += bpf_perf_event.h
			
 
				 header-y += bpf.h
			
 
				 header-y += bpqether.h
			
 
				 header-y += bsg.h
			
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_prog_type {
 
				 	BPF_PROG_TYPE_SCHED_ACT,
			
 
				 	BPF_PROG_TYPE_TRACEPOINT,
			
 
				 	BPF_PROG_TYPE_XDP,
			
 
				+	BPF_PROG_TYPE_PERF_EVENT,
			
 
				 };
			
 
				 
			
 
				 #define BPF_PSEUDO_MAP_FD	1
			
--- a/include/uapi/linux/bpf_perf_event.h
+++ b/include/uapi/linux/bpf_perf_event.h
@@ -0,0 +1,18 @@
 
				+/* Copyright (c) 2016 Facebook
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of version 2 of the GNU General Public
			
 
				+ * License as published by the Free Software Foundation.
			
 
				+ */
			
 
				+#ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__
			
 
				+#define _UAPI__LINUX_BPF_PERF_EVENT_H__
			
 
				+
			
 
				+#include <linux/types.h>
			
 
				+#include <linux/ptrace.h>
			
 
				+
			
 
				+/* Context layout exposed through the uapi header to perf_event BPF
				+ * programs: the register state followed by the sample period.
				+ */
				+struct bpf_perf_event_data {
				+	struct pt_regs regs;
				+	__u64 sample_period;
				+};
			
 
				+
			
 
				+#endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */
			
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2333,7 +2333,8 @@ static int do_check(struct verifier_env *env)
 
				 			if (err)
			
 
				 				return err;
			
 
				 
			
 
				-			if (BPF_SIZE(insn->code) != BPF_W) {
			
 
				+			if (BPF_SIZE(insn->code) != BPF_W &&
			
 
				+			    BPF_SIZE(insn->code) != BPF_DW) {
			
 
				 				insn_idx++;
			
 
				 				continue;
			
 
				 			}
			
@@ -2510,6 +2511,20 @@ process_bpf_exit:
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/* Check that @prog is allowed to reference @map.
				+ *
				+ * The only restriction enforced here: a perf_event program must not use
				+ * a (per-cpu) hash map created with BPF_F_NO_PREALLOC.
				+ *
				+ * Returns 0 when the pairing is allowed, -EINVAL otherwise.
				+ */
				+static int check_map_prog_compatibility(struct bpf_map *map,
				+					struct bpf_prog *prog)
				+{
				+	bool is_hash;
				+
				+	if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
				+		return 0;
				+
				+	is_hash = map->map_type == BPF_MAP_TYPE_HASH ||
				+		  map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
				+	if (!is_hash || !(map->map_flags & BPF_F_NO_PREALLOC))
				+		return 0;
				+
				+	verbose("perf_event programs can only use preallocated hash map\n");
				+	return -EINVAL;
				+}
			
 
				+
			
 
				 /* look for pseudo eBPF instructions that access map FDs and
			
 
				  * replace them with actual map pointers
			
 
				  */
			
@@ -2517,7 +2532,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 
				 {
			
 
				 	struct bpf_insn *insn = env->prog->insnsi;
			
 
				 	int insn_cnt = env->prog->len;
			
 
				-	int i, j;
			
 
				+	int i, j, err;
			
 
				 
			
 
				 	for (i = 0; i < insn_cnt; i++, insn++) {
			
 
				 		if (BPF_CLASS(insn->code) == BPF_LDX &&
			
@@ -2561,6 +2576,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 
				 				return PTR_ERR(map);
			
 
				 			}
			
 
				 
			
 
				+			err = check_map_prog_compatibility(map, env->prog);
			
 
				+			if (err) {
			
 
				+				fdput(f);
			
 
				+				return err;
			
 
				+			}
			
 
				+
			
 
				 			/* store map pointer inside BPF_LD_IMM64 instruction */
			
 
				 			insn[0].imm = (u32) (unsigned long) map;
			
 
				 			insn[1].imm = ((u64) (unsigned long) map) >> 32;
			
@@ -2642,9 +2663,11 @@ static int convert_ctx_accesses(struct verifier_env *env)
 
				 	for (i = 0; i < insn_cnt; i++, insn++) {
			
 
				 		u32 insn_delta, cnt;
			
 
				 
			
 
				-		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
			
 
				+		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
			
 
				+		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
			
 
				 			type = BPF_READ;
			
 
				-		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
			
 
				+		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
			
 
				+			 insn->code == (BPF_STX | BPF_MEM | BPF_DW))
			
 
				 			type = BPF_WRITE;
			
 
				 		else
			
 
				 			continue;
			
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7022,7 +7022,7 @@ static int __perf_event_overflow(struct perf_event *event,
 
				 		irq_work_queue(&event->pending);
			
 
				 	}
			
 
				 
			
 
				-	event->overflow_handler(event, data, regs);
			
 
				+	READ_ONCE(event->overflow_handler)(event, data, regs);
			
 
				 
			
 
				 	if (*perf_event_fasync(event) && event->pending_kill) {
			
 
				 		event->pending_wakeup = 1;
			
@@ -7637,11 +7637,83 @@ static void perf_event_free_filter(struct perf_event *event)
 
				 	ftrace_profile_free_filter(event);
			
 
				 }
			
 
				 
			
 
				+#ifdef CONFIG_BPF_SYSCALL
			
 
				+/* Overflow handler used once a BPF program is attached to @event.
				+ *
				+ * Runs event->prog on a context built from @data and @regs. When the
				+ * program returns non-zero the sample is forwarded to
				+ * event->orig_overflow_handler; a zero result (also the value used when
				+ * the program is not run at all) ends the handling here.
				+ */
				+static void bpf_overflow_handler(struct perf_event *event,
				+				 struct perf_sample_data *data,
				+				 struct pt_regs *regs)
				+{
				+	struct bpf_perf_event_data_kern ctx = {
				+		.regs = regs,
				+		.data = data,
				+	};
				+	int verdict = 0;
				+
				+	preempt_disable();
				+	/* skip the program if bpf_prog_active was already non-zero on this cpu */
				+	if (likely(__this_cpu_inc_return(bpf_prog_active) == 1)) {
				+		rcu_read_lock();
				+		verdict = BPF_PROG_RUN(event->prog, (void *)&ctx);
				+		rcu_read_unlock();
				+	}
				+	__this_cpu_dec(bpf_prog_active);
				+	preempt_enable();
				+
				+	if (verdict)
				+		event->orig_overflow_handler(event, data, regs);
				+}
			
 
				+
			
 
				+/* Attach the BPF program referred to by @prog_fd to @event by installing
				+ * bpf_overflow_handler as the event's overflow handler.
				+ *
				+ * Returns 0 on success, -EINVAL when the event has an
				+ * overflow_handler_context, -EEXIST when a program is already attached,
				+ * or the error reported by bpf_prog_get_type().
				+ */
				+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
				+{
				+	struct bpf_prog *prog;
				+
				+	if (event->overflow_handler_context)
				+		/* hw breakpoint or kernel counter */
				+		return -EINVAL;
				+
				+	if (event->prog)
				+		return -EEXIST;
				+
				+	prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
				+	if (IS_ERR(prog))
				+		return PTR_ERR(prog);
				+
				+	/* record the program and the previous handler before switching handlers */
				+	event->prog = prog;
				+	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
				+	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
				+	return 0;
				+}
			
 
				+
			
 
				+/* Detach the BPF program from @event, if one is attached: put back the
				+ * saved overflow handler, clear event->prog and call bpf_prog_put() on
				+ * the program. Does nothing when event->prog is NULL.
				+ */
				+static void perf_event_free_bpf_handler(struct perf_event *event)
				+{
				+	struct bpf_prog *prog = event->prog;
				+
				+	if (!prog)
				+		return;
				+
				+	/* restore the original handler first, then release the program */
				+	WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
				+	event->prog = NULL;
				+	bpf_prog_put(prog);
				+}
			
 
				+#else
			
 
				+/* Variant for builds without CONFIG_BPF_SYSCALL: attaching a program is
				+ * not supported, always returns -EOPNOTSUPP.
				+ */
				+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
				+{
				+	return -EOPNOTSUPP;
				+}
			
 
				+/* Variant for builds without CONFIG_BPF_SYSCALL: nothing to release. */
				+static void perf_event_free_bpf_handler(struct perf_event *event)
				+{
				+}
			
 
				+#endif
			
 
				+
			
 
				 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
			
 
				 {
			
 
				 	bool is_kprobe, is_tracepoint;
			
 
				 	struct bpf_prog *prog;
			
 
				 
			
 
				+	if (event->attr.type == PERF_TYPE_HARDWARE ||
			
 
				+	    event->attr.type == PERF_TYPE_SOFTWARE)
			
 
				+		return perf_event_set_bpf_handler(event, prog_fd);
			
 
				+
			
 
				 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
			
 
				 		return -EINVAL;
			
 
				 
			
@@ -7682,6 +7754,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 
				 {
			
 
				 	struct bpf_prog *prog;
			
 
				 
			
 
				+	perf_event_free_bpf_handler(event);
			
 
				+
			
 
				 	if (!event->tp_event)
			
 
				 		return;
			
 
				 
			
@@ -8998,6 +9072,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
				 	if (!overflow_handler && parent_event) {
			
 
				 		overflow_handler = parent_event->overflow_handler;
			
 
				 		context = parent_event->overflow_handler_context;
			
 
				+#ifdef CONFIG_BPF_SYSCALL
			
 
				+		if (overflow_handler == bpf_overflow_handler) {
			
 
				+			struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
			
 
				+
			
 
				+			if (IS_ERR(prog)) {
			
 
				+				err = PTR_ERR(prog);
			
 
				+				goto err_ns;
			
 
				+			}
			
 
				+			event->prog = prog;
			
 
				+			event->orig_overflow_handler =
			
 
				+				parent_event->orig_overflow_handler;
			
 
				+		}
			
 
				+#endif
			
 
				 	}
			
 
				 
			
 
				 	if (overflow_handler) {
			
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1,4 +1,5 @@
 
				 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
			
 
				+ * Copyright (c) 2016 Facebook
			
 
				  *
			
 
				  * This program is free software; you can redistribute it and/or
			
 
				  * modify it under the terms of version 2 of the GNU General Public
			
@@ -8,6 +9,7 @@
 
				 #include <linux/types.h>
			
 
				 #include <linux/slab.h>
			
 
				 #include <linux/bpf.h>
			
 
				+#include <linux/bpf_perf_event.h>
			
 
				 #include <linux/filter.h>
			
 
				 #include <linux/uaccess.h>
			
 
				 #include <linux/ctype.h>
			
@@ -552,10 +554,69 @@ static struct bpf_prog_type_list tracepoint_tl = {
 
				 	.type	= BPF_PROG_TYPE_TRACEPOINT,
			
 
				 };
			
 
				 
			
 
				+/* Decide whether a perf_event program may access its context at @off
				+ * with an access of @size bytes.
				+ *
				+ * Only reads inside struct bpf_perf_event_data are accepted, @off has to
				+ * be a multiple of @size, sample_period must be read as a u64 and every
				+ * other offset as a long.
				+ */
				+static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
				+				    enum bpf_reg_type *reg_type)
				+{
				+	size_t expected;
				+
				+	if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
				+		return false;
				+	if (type != BPF_READ || off % size != 0)
				+		return false;
				+
				+	expected = off == offsetof(struct bpf_perf_event_data, sample_period) ?
				+		   sizeof(u64) : sizeof(long);
				+	return size == expected;
				+}
			
 
				+
			
 
				+/* Rewrite one context load of a perf_event program.
				+ *
				+ * The program addresses struct bpf_perf_event_data, while the context it
				+ * actually receives is struct bpf_perf_event_data_kern. Each access is
				+ * therefore turned into two loads written to @insn_buf: first the
				+ * data/regs pointer is fetched from the context, then the requested
				+ * value is loaded through that pointer.
				+ *
				+ * Returns the number of instructions written to @insn_buf.
				+ */
				+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				+				      int src_reg, int ctx_off,
				+				      struct bpf_insn *insn_buf,
				+				      struct bpf_prog *prog)
				+{
				+	struct bpf_insn *insn = insn_buf;
				+
				+	switch (ctx_off) {
				+	case offsetof(struct bpf_perf_event_data, sample_period):
				+		/* sample_period is read from perf_sample_data->period */
				+		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
				+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, data)),
				+				      dst_reg, src_reg,
				+				      offsetof(struct bpf_perf_event_data_kern, data));
				+		*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
				+				      offsetof(struct perf_sample_data, period));
				+		break;
				+	default:
				+		/* any other offset is used as-is as an offset into pt_regs */
				+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs)),
				+				      dst_reg, src_reg,
				+				      offsetof(struct bpf_perf_event_data_kern, regs));
				+		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(long)),
				+				      dst_reg, dst_reg, ctx_off);
				+		break;
				+	}
				+
				+	return insn - insn_buf;
				+}
			
 
				+
			
 
				+/* Verifier callbacks for perf_event programs. Helper lookup reuses
				+ * tp_prog_func_proto; access checking and context rewriting use the
				+ * pe_prog_* functions.
				+ */
				+static const struct bpf_verifier_ops perf_event_prog_ops = {
				+	.get_func_proto		= tp_prog_func_proto,
				+	.is_valid_access	= pe_prog_is_valid_access,
				+	.convert_ctx_access	= pe_prog_convert_ctx_access,
				+};
			
 
				+
			
 
				+/* Ties BPF_PROG_TYPE_PERF_EVENT to perf_event_prog_ops; handed to
				+ * bpf_register_prog_type() in register_kprobe_prog_ops().
				+ */
				+static struct bpf_prog_type_list perf_event_tl = {
				+	.ops	= &perf_event_prog_ops,
				+	.type	= BPF_PROG_TYPE_PERF_EVENT,
				+};
			
 
				+
			
 
				+/* Registers the kprobe, tracepoint and perf_event program types.
				+ * Invoked through late_initcall(); always returns 0.
				+ */
				 static int __init register_kprobe_prog_ops(void)
				 {
				 	bpf_register_prog_type(&kprobe_tl);
				 	bpf_register_prog_type(&tracepoint_tl);
				+	bpf_register_prog_type(&perf_event_tl);
				 	return 0;
				 }
			
 
				 late_initcall(register_kprobe_prog_ops);
			
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -25,6 +25,8 @@ hostprogs-y += test_cgrp2_array_pin
 
				 hostprogs-y += xdp1
			
 
				 hostprogs-y += xdp2
			
 
				 hostprogs-y += test_current_task_under_cgroup
			
 
				+hostprogs-y += trace_event
			
 
				+hostprogs-y += sampleip
			
 
				 
			
 
				 test_verifier-objs := test_verifier.o libbpf.o
			
 
				 test_maps-objs := test_maps.o libbpf.o
			
@@ -52,6 +54,8 @@ xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 
				 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
			
 
				 test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
			
 
				 				       test_current_task_under_cgroup_user.o
			
 
				+trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
			
 
				+sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
			
 
				 
			
 
				 # Tell kbuild to always build the programs
			
 
				 always := $(hostprogs-y)
			
@@ -79,6 +83,8 @@ always += test_cgrp2_tc_kern.o
 
				 always += xdp1_kern.o
			
 
				 always += xdp2_kern.o
			
 
				 always += test_current_task_under_cgroup_kern.o
			
 
				+always += trace_event_kern.o
			
 
				+always += sampleip_kern.o
			
 
				 
			
 
				 HOSTCFLAGS += -I$(objtree)/usr/include
			
 
				 
			
@@ -103,6 +109,8 @@ HOSTLOADLIBES_test_overhead += -lelf -lrt
 
				 HOSTLOADLIBES_xdp1 += -lelf
			
 
				 HOSTLOADLIBES_xdp2 += -lelf
			
 
				 HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
			
 
				+HOSTLOADLIBES_trace_event += -lelf
			
 
				+HOSTLOADLIBES_sampleip += -lelf
			
 
				 
			
 
				 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
			
 
				 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
			
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -55,6 +55,8 @@ static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) =
 
				 	(void *) BPF_FUNC_skb_get_tunnel_opt;
			
 
				 static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) =
			
 
				 	(void *) BPF_FUNC_skb_set_tunnel_opt;
			
 
				+static unsigned long long (*bpf_get_prandom_u32)(void) =
			
 
				+	(void *) BPF_FUNC_get_prandom_u32;
			
 
				 
			
 
				 /* llvm builtin functions that eBPF C program may use to
			
 
				  * emit BPF_LD_ABS and BPF_LD_IND instructions
			
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -51,6 +51,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
				 	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
			
 
				 	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
			
 
				 	bool is_xdp = strncmp(event, "xdp", 3) == 0;
			
 
				+	bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
			
 
				 	enum bpf_prog_type prog_type;
			
 
				 	char buf[256];
			
 
				 	int fd, efd, err, id;
			
@@ -69,6 +70,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
				 		prog_type = BPF_PROG_TYPE_TRACEPOINT;
			
 
				 	} else if (is_xdp) {
			
 
				 		prog_type = BPF_PROG_TYPE_XDP;
			
 
				+	} else if (is_perf_event) {
			
 
				+		prog_type = BPF_PROG_TYPE_PERF_EVENT;
			
 
				 	} else {
			
 
				 		printf("Unknown event '%s'\n", event);
			
 
				 		return -1;
			
@@ -82,7 +85,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
				 
			
 
				 	prog_fd[prog_cnt++] = fd;
			
 
				 
			
 
				-	if (is_xdp)
			
 
				+	if (is_xdp || is_perf_event)
			
 
				 		return 0;
			
 
				 
			
 
				 	if (is_socket) {
			
@@ -326,6 +329,7 @@ int load_bpf_file(char *path)
 
				 			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
			
 
				 			    memcmp(shname_prog, "tracepoint/", 11) == 0 ||
			
 
				 			    memcmp(shname_prog, "xdp", 3) == 0 ||
			
 
				+			    memcmp(shname_prog, "perf_event", 10) == 0 ||
			
 
				 			    memcmp(shname_prog, "socket", 6) == 0)
			
 
				 				load_and_attach(shname_prog, insns, data_prog->d_size);
			
 
				 		}
			
@@ -344,6 +348,7 @@ int load_bpf_file(char *path)
 
				 		    memcmp(shname, "kretprobe/", 10) == 0 ||
			
 
				 		    memcmp(shname, "tracepoint/", 11) == 0 ||
			
 
				 		    memcmp(shname, "xdp", 3) == 0 ||
			
 
				+		    memcmp(shname, "perf_event", 10) == 0 ||
			
 
				 		    memcmp(shname, "socket", 6) == 0)
			
 
				 			load_and_attach(shname, data->d_buf, data->d_size);
			
 
				 	}
			
--- a/samples/bpf/sampleip_kern.c
+++ b/samples/bpf/sampleip_kern.c
@@ -0,0 +1,38 @@
 
				+/* Copyright 2016 Netflix, Inc.
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of version 2 of the GNU General Public
			
 
				+ * License as published by the Free Software Foundation.
			
 
				+ */
			
 
				+#include <linux/version.h>
			
 
				+#include <linux/ptrace.h>
			
 
				+#include <uapi/linux/bpf.h>
			
 
				+#include <uapi/linux/bpf_perf_event.h>
			
 
				+#include "bpf_helpers.h"
			
 
				+
			
 
				+#define MAX_IPS		8192
			
 
				+
			
 
				+/* Hash map keyed by the sampled instruction pointer (u64); the value is
				+ * a u32 hit counter. Holds at most MAX_IPS entries.
				+ */
				+struct bpf_map_def SEC("maps") ip_map = {
				+	.type = BPF_MAP_TYPE_HASH,
				+	.key_size = sizeof(u64),
				+	.value_size = sizeof(u32),
				+	.max_entries = MAX_IPS,
				+};
			
 
				+
			
 
				+/* perf_event program: count how often each instruction pointer is hit.
				+ *
				+ * Looks the sampled ip up in ip_map and bumps its counter, or inserts
				+ * the ip with a count of 1 when it is not present yet. Always returns 0.
				+ */
				+SEC("perf_event")
				+int do_sample(struct bpf_perf_event_data *ctx)
				+{
				+	u64 ip;
				+	u32 *value, init_val = 1;
				+
				+	ip = ctx->regs.ip;
				+	value = bpf_map_lookup_elem(&ip_map, &ip);
				+	if (value)
				+		/* the map element is shared by all cpus: a plain "+= 1"
				+		 * is a racy read-modify-write and can lose counts
				+		 */
				+		__sync_fetch_and_add(value, 1);
				+	else
				+		/* E2BIG not tested for this example only */
				+		bpf_map_update_elem(&ip_map, &ip, &init_val, BPF_NOEXIST);
				+
				+	return 0;
				+}
			
 
				+char _license[] SEC("license") = "GPL";
			
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -0,0 +1,196 @@
 
				+/*
			
 
				+ * sampleip: sample instruction pointer and frequency count in a BPF map.
			
 
				+ *
			
 
				+ * Copyright 2016 Netflix, Inc.
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of version 2 of the GNU General Public
			
 
				+ * License as published by the Free Software Foundation.
			
 
				+ */
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <unistd.h>
			
 
				+#include <errno.h>
			
 
				+#include <signal.h>
			
 
				+#include <string.h>
			
 
				+#include <assert.h>
			
 
				+#include <linux/perf_event.h>
			
 
				+#include <linux/ptrace.h>
			
 
				+#include <linux/bpf.h>
			
 
				+#include <sys/ioctl.h>
			
 
				+#include "libbpf.h"
			
 
				+#include "bpf_load.h"
			
 
				+
			
 
				+#define DEFAULT_FREQ	99
			
 
				+#define DEFAULT_SECS	5
			
 
				+#define MAX_IPS		8192
			
 
				+#define PAGE_OFFSET	0xffff880000000000
			
 
				+
			
 
				+static int nr_cpus;
			
 
				+
			
 
				+/* Print the command line synopsis to stdout. */
				+static void usage(void)
				+{
				+	static const char *const help[] = {
				+		"USAGE: sampleip [-F freq] [duration]\n",
				+		"       -F freq    # sample frequency (Hertz), default 99\n",
				+		"       duration   # sampling duration (seconds), default 5\n",
				+	};
				+	size_t line;
				+
				+	for (line = 0; line < sizeof(help) / sizeof(help[0]); line++)
				+		fputs(help[line], stdout);
				+}
			
 
				+
			
 
				+/* Open one cpu-clock sampling event per CPU, attach the loaded BPF
				+ * program (prog_fd[0]) to it and enable it.
				+ *
				+ * @pmu_fd: array with room for nr_cpus descriptors, filled in here
				+ * @freq:   sampling frequency in Hertz
				+ *
				+ * Returns 0 on success and 1 on failure. On failure every descriptor
				+ * opened so far has been closed again.
				+ */
				+static int sampling_start(int *pmu_fd, int freq)
				+{
				+	int i;
				+
				+	struct perf_event_attr pe_sample_attr = {
				+		.type = PERF_TYPE_SOFTWARE,
				+		.freq = 1,
				+		/* with .freq set the value is a frequency; sample_freq
				+		 * shares storage with sample_period
				+		 */
				+		.sample_freq = freq,
				+		.config = PERF_COUNT_SW_CPU_CLOCK,
				+		.inherit = 1,
				+	};
				+
				+	for (i = 0; i < nr_cpus; i++) {
				+		pmu_fd[i] = perf_event_open(&pe_sample_attr, -1 /* pid */, i,
				+					    -1 /* group_fd */, 0 /* flags */);
				+		if (pmu_fd[i] < 0) {
				+			fprintf(stderr, "ERROR: Initializing perf sampling\n");
				+			goto err_close;
				+		}
				+		/* the ioctls must not live inside assert(): building with
				+		 * NDEBUG would silently drop the calls themselves
				+		 */
				+		if (ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) != 0 ||
				+		    ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) != 0) {
				+			fprintf(stderr,
				+				"ERROR: attaching BPF program (errno %d)\n",
				+				errno);
				+			close(pmu_fd[i]);
				+			goto err_close;
				+		}
				+	}
				+
				+	return 0;
				+
				+err_close:
				+	/* release the events of the cpus that were already set up */
				+	while (--i >= 0)
				+		close(pmu_fd[i]);
				+	return 1;
				+}
			
 
				+
			
 
				+/* Close the nr_cpus perf event descriptors stored in @pmu_fd. */
				+static void sampling_end(int *pmu_fd)
				+{
				+	int cpu = 0;
				+
				+	while (cpu < nr_cpus)
				+		close(pmu_fd[cpu++]);
				+}
			
 
				+
			
 
				+/* One map entry copied to user space: instruction pointer and hit count. */
				+struct ipcount {
				+	__u64 ip;
				+	__u32 count;
				+};
			
 
				+
			
 
				+/* used for sorting */
			
 
				+struct ipcount counts[MAX_IPS];
			
 
				+
			
 
				+/* qsort() comparator ordering struct ipcount entries by ascending count.
				+ *
				+ * The counts are unsigned 32-bit values, so they are compared instead
				+ * of subtracted: a difference converted to int has the wrong sign once
				+ * it exceeds INT_MAX.
				+ */
				+static int count_cmp(const void *p1, const void *p2)
				+{
				+	__u32 a = ((const struct ipcount *)p1)->count;
				+	__u32 b = ((const struct ipcount *)p2)->count;
				+
				+	return (a > b) - (a < b);
				+}
			
 
				+
			
 
				+/* Dump the ip -> count map behind @fd, sorted by ascending count.
				+ *
				+ * Addresses above PAGE_OFFSET are resolved with ksym_search(), anything
				+ * else is reported as "(user)". A warning is printed when MAX_IPS
				+ * entries were read, since the map may then have been full.
				+ */
				+static void print_ip_map(int fd)
				+{
				+	struct ksym *sym;
				+	__u64 key, next_key;
				+	__u32 value;
				+	int i, max;
				+
				+	printf("%-19s %-32s %s\n", "ADDR", "KSYM", "COUNT");
				+
				+	/* fetch IPs and counts; never write past the end of counts[] */
				+	key = 0, i = 0;
				+	while (i < MAX_IPS && bpf_get_next_key(fd, &key, &next_key) == 0) {
				+		key = next_key;
				+		/* skip the entry rather than store an uninitialized value */
				+		if (bpf_lookup_elem(fd, &next_key, &value) != 0)
				+			continue;
				+		counts[i].ip = next_key;
				+		counts[i++].count = value;
				+	}
				+	max = i;
				+
				+	/* sort and print */
				+	qsort(counts, max, sizeof(struct ipcount), count_cmp);
				+	for (i = 0; i < max; i++) {
				+		const char *name = "(user)";
				+
				+		if (counts[i].ip > PAGE_OFFSET) {
				+			sym = ksym_search(counts[i].ip);
				+			/* defensive: do not dereference a failed lookup */
				+			name = sym ? sym->name : "(unknown)";
				+		}
				+		printf("0x%-17llx %-32s %u\n",
				+		       (unsigned long long)counts[i].ip, name,
				+		       counts[i].count);
				+	}
				+
				+	if (max == MAX_IPS) {
				+		printf("WARNING: IP hash was full (max %d entries); ", max);
				+		printf("may have dropped samples\n");
				+	}
				+}
			
 
				+
			
 
				+/* SIGINT handler: print the samples gathered so far, then exit with 0. */
				+static void int_exit(int sig)
				+{
				+	putchar('\n');
				+	print_ip_map(map_fd[0]);
				+	exit(0);
				+}
			
 
				+
			
 
				+/* sampleip entry point: parse "[-F freq] [duration]", load the BPF
				+ * object "<argv[0]>_kern.o", sample on every CPU for the requested
				+ * number of seconds and print the per-ip counts.
				+ *
				+ * Returns 0 on success (and after -h), 2 when /proc/kallsyms cannot be
				+ * loaded and 1 for every other error.
				+ */
				+int main(int argc, char **argv)
				+{
				+	char filename[256];
				+	int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS;
				+
				+	/* process arguments */
				+	while ((opt = getopt(argc, argv, "F:h")) != -1) {
				+		switch (opt) {
				+		case 'F':
				+			freq = atoi(optarg);
				+			break;
				+		case 'h':
				+		default:
				+			usage();
				+			return 0;
				+		}
				+	}
				+	if (argc - optind == 1)
				+		secs = atoi(argv[optind]);
				+	/* atoi() gives 0 for garbage and passes negatives through; a
				+	 * negative value would turn into a huge frequency or sleep time
				+	 */
				+	if (freq <= 0 || secs <= 0) {
				+		usage();
				+		return 1;
				+	}
				+
				+	/* initialize kernel symbol translation */
				+	if (load_kallsyms()) {
				+		fprintf(stderr, "ERROR: loading /proc/kallsyms\n");
				+		return 2;
				+	}
				+
				+	/* create perf FDs for each CPU */
				+	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
				+	if (nr_cpus <= 0) {
				+		fprintf(stderr, "ERROR: querying the number of CPUs\n");
				+		return 1;
				+	}
				+	pmu_fd = calloc(nr_cpus, sizeof(*pmu_fd));
				+	if (pmu_fd == NULL) {
				+		fprintf(stderr, "ERROR: malloc of pmu_fd\n");
				+		return 1;
				+	}
				+
				+	/* load BPF program */
				+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
				+	if (load_bpf_file(filename)) {
				+		fprintf(stderr, "ERROR: loading BPF program (errno %d):\n",
				+			errno);
				+		if (strcmp(bpf_log_buf, "") == 0)
				+			fprintf(stderr, "Try: ulimit -l unlimited\n");
				+		else
				+			fprintf(stderr, "%s", bpf_log_buf);
				+		free(pmu_fd);
				+		return 1;
				+	}
				+	signal(SIGINT, int_exit);
				+
				+	/* do sampling */
				+	printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n",
				+	       freq, secs);
				+	if (sampling_start(pmu_fd, freq) != 0) {
				+		free(pmu_fd);
				+		return 1;
				+	}
				+	sleep(secs);
				+	sampling_end(pmu_fd);
				+	free(pmu_fd);
				+
				+	/* output sample counts */
				+	print_ip_map(map_fd[0]);
				+
				+	return 0;
				+}
			
--- a/samples/bpf/trace_event_kern.c
+++ b/samples/bpf/trace_event_kern.c
@@ -0,0 +1,65 @@
 
				+/* Copyright (c) 2016 Facebook
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of version 2 of the GNU General Public
			
 
				+ * License as published by the Free Software Foundation.
			
 
				+ */
			
 
				+#include <linux/ptrace.h>
			
 
				+#include <linux/version.h>
			
 
				+#include <uapi/linux/bpf.h>
			
 
				+#include <uapi/linux/bpf_perf_event.h>
			
 
				+#include <uapi/linux/perf_event.h>
			
 
				+#include "bpf_helpers.h"
			
 
				+
			
 
				+/* Key of the counts map: task comm plus the stack ids obtained from
				+ * bpf_get_stackid() for the kernel and the user stack.
				+ */
				+struct key_t {
				+	char comm[TASK_COMM_LEN];
				+	u32 kernstack;
				+	u32 userstack;
				+};
			
 
				+
			
 
				+/* Hash map from struct key_t to a u64 number of samples seen for it. */
				+struct bpf_map_def SEC("maps") counts = {
				+	.type = BPF_MAP_TYPE_HASH,
				+	.key_size = sizeof(struct key_t),
				+	.value_size = sizeof(u64),
				+	.max_entries = 10000,
				+};
			
 
				+
			
 
				+/* Stack trace map filled through bpf_get_stackid(): u32 stack id ->
				+ * up to PERF_MAX_STACK_DEPTH u64 entries.
				+ */
				+struct bpf_map_def SEC("maps") stackmap = {
				+	.type = BPF_MAP_TYPE_STACK_TRACE,
				+	.key_size = sizeof(u32),
				+	.value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
				+	.max_entries = 10000,
				+};
			
 
				+
			
 
				+#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
			
 
				+#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)
			
 
				+
			
 
				+/* perf_event program: count samples per (comm, kernel stack, user stack).
				+ *
				+ * Samples whose period is below 10000 are ignored as warmup. When
				+ * neither stack id could be obtained the sample is only reported via
				+ * bpf_trace_printk(). Always returns 0.
				+ */
				+SEC("perf_event")
				+int bpf_prog1(struct bpf_perf_event_data *ctx)
				+{
				+	char fmt[] = "CPU-%d period %lld ip %llx";
				+	u32 cpu = bpf_get_smp_processor_id();
				+	struct key_t key;
				+	u64 *val, one = 1;
				+
				+	if (ctx->sample_period < 10000)
				+		/* ignore warmup */
				+		return 0;
				+	bpf_get_current_comm(&key.comm, sizeof(key.comm));
				+	key.kernstack = bpf_get_stackid(ctx, &stackmap, KERN_STACKID_FLAGS);
				+	key.userstack = bpf_get_stackid(ctx, &stackmap, USER_STACKID_FLAGS);
				+	if ((int)key.kernstack < 0 && (int)key.userstack < 0) {
				+		bpf_trace_printk(fmt, sizeof(fmt), cpu, ctx->sample_period,
				+				 ctx->regs.ip);
				+		return 0;
				+	}
				+
				+	val = bpf_map_lookup_elem(&counts, &key);
				+	if (val)
				+		/* events on several cpus may hit the same key at once:
				+		 * increment atomically so no sample gets lost
				+		 */
				+		__sync_fetch_and_add(val, 1);
				+	else
				+		bpf_map_update_elem(&counts, &key, &one, BPF_NOEXIST);
				+	return 0;
				+}
			
 
				+
			
 
				+char _license[] SEC("license") = "GPL";
			
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -0,0 +1,213 @@
 
				+/* Copyright (c) 2016 Facebook
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of version 2 of the GNU General Public
			
 
				+ * License as published by the Free Software Foundation.
			
 
				+ */
			
 
				+#include <stdio.h>
			
 
				+#include <unistd.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <stdbool.h>
			
 
				+#include <string.h>
			
 
				+#include <fcntl.h>
			
 
				+#include <poll.h>
			
 
				+#include <sys/ioctl.h>
			
 
				+#include <linux/perf_event.h>
			
 
				+#include <linux/bpf.h>
			
 
				+#include <signal.h>
			
 
				+#include <assert.h>
			
 
				+#include <errno.h>
			
 
				+#include <sys/resource.h>
			
 
				+#include "libbpf.h"
			
 
				+#include "bpf_load.h"
			
 
				+
			
 
				+#define SAMPLE_FREQ 50
			
 
				+
			
 
				+static bool sys_read_seen, sys_write_seen;
			
 
				+
			
 
				+/* Print the symbol that ksym_search() returns for @addr, followed by
				+ * ';'. A zero address prints nothing. Also records whether sys_read or
				+ * sys_write showed up.
				+ */
				+static void print_ksym(__u64 addr)
				+{
				+	struct ksym *match;
				+
				+	if (addr == 0)
				+		return;
				+
				+	match = ksym_search(addr);
				+	printf("%s;", match->name);
				+
				+	if (strcmp(match->name, "sys_read") == 0) {
				+		sys_read_seen = true;
				+		return;
				+	}
				+	if (strcmp(match->name, "sys_write") == 0)
				+		sys_write_seen = true;
				+}
			
 
				+
			
 
				+/* Print a non-zero @addr in hex followed by ';'. */
				+static void print_addr(__u64 addr)
				+{
				+	if (addr)
				+		printf("%llx;", addr);
				+}
			
 
				+
			
 
				+#define TASK_COMM_LEN 16
			
 
				+
			
 
				+/* Key read back from map_fd[0]: task comm plus kernel and user stack
				+ * ids. NOTE(review): presumably has to stay in sync with struct key_t
				+ * of the BPF program - confirm.
				+ */
				+struct key_t {
				+	char comm[TASK_COMM_LEN];
				+	__u32 kernstack;
				+	__u32 userstack;
				+};
			
 
				+
			
 
				+/* Print one aggregated sample as a single line:
				+ * "<count> <comm>;<kernel symbols>;-;<user addresses>;".
				+ *
				+ * The stacks are fetched from map_fd[1] by the ids stored in @key and
				+ * printed from the last array slot to the first; "---;" stands for a
				+ * stack that could not be looked up. A -EEXIST stack id is reported
				+ * once as a stackmap collision.
				+ */
				+static void print_stack(struct key_t *key, __u64 count)
				+{
				+	__u64 ip[PERF_MAX_STACK_DEPTH] = {};
				+	static bool warned;
				+	int i;
				+
				+	/* count is unsigned: print it as such */
				+	printf("%3llu %s;", (unsigned long long)count, key->comm);
				+	if (bpf_lookup_elem(map_fd[1], &key->kernstack, ip) != 0) {
				+		printf("---;");
				+	} else {
				+		for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
				+			print_ksym(ip[i]);
				+	}
				+	printf("-;");
				+	if (bpf_lookup_elem(map_fd[1], &key->userstack, ip) != 0) {
				+		printf("---;");
				+	} else {
				+		for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
				+			print_addr(ip[i]);
				+	}
				+	printf("\n");
				+
				+	/* the ids are stored as __u32 but carry negative errnos; a
				+	 * collision can hit the user stack just as well as the kernel one
				+	 */
				+	if (((int)key->kernstack == -EEXIST ||
				+	     (int)key->userstack == -EEXIST) && !warned) {
				+		printf("stackmap collisions seen. Consider increasing size\n");
				+		warned = true;
				+	} else if ((int)key->kernstack < 0 && (int)key->userstack < 0) {
				+		printf("err stackid %d %d\n", (int)key->kernstack,
				+		       (int)key->userstack);
				+	}
				+}
			
 
				+
			
 
				+/* SIGINT handler, also called directly to end the program.
				+ * NOTE(review): kill(0, ...) targets the caller's own process group, so
				+ * this presumably also ends the forked trace-pipe reader and the
				+ * exit(0) below is normally not reached - confirm.
				+ */
				+static void int_exit(int sig)
				+{
				+	kill(0, SIGKILL);
				+	exit(0);
				+}
			
 
				+
			
 
				+/* Print and remove every entry of the counts map (map_fd[0]), verify
				+ * that both sys_read and sys_write were seen in the kernel stacks, then
				+ * empty the stack map (map_fd[1]).
				+ */
				+static void print_stacks(void)
				+{
				+	const int counts_fd = map_fd[0];
				+	const int stacks_fd = map_fd[1];
				+	struct key_t cur = {}, nxt;
				+	__u32 cur_id = 0, nxt_id;
				+	__u64 hits;
				+
				+	sys_read_seen = false;
				+	sys_write_seen = false;
				+
				+	for (; bpf_get_next_key(counts_fd, &cur, &nxt) == 0; cur = nxt) {
				+		bpf_lookup_elem(counts_fd, &nxt, &hits);
				+		print_stack(&nxt, hits);
				+		bpf_delete_elem(counts_fd, &nxt);
				+	}
				+
				+	if (!(sys_read_seen && sys_write_seen)) {
				+		printf("BUG kernel stack doesn't contain sys_read() and sys_write()\n");
				+		int_exit(0);
				+	}
				+
				+	/* clear stack map */
				+	for (; bpf_get_next_key(stacks_fd, &cur_id, &nxt_id) == 0;
				+	     cur_id = nxt_id)
				+		bpf_delete_elem(stacks_fd, &nxt_id);
				+}
			
 
				+
			
 
				+/* Open the event described by @attr on every CPU, attach prog_fd[0] to
				+ * each of them, generate load with dd and print the collected stacks.
				+ *
				+ * Every failure is reported on stdout; all descriptors opened up to
				+ * that point are closed before returning.
				+ */
				+static void test_perf_event_all_cpu(struct perf_event_attr *attr)
				+{
				+	int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
				+	int *pmu_fd;
				+	int i;
				+
				+	if (nr_cpus <= 0) {
				+		printf("sysconf(_SC_NPROCESSORS_CONF) failed\n");
				+		return;
				+	}
				+	pmu_fd = malloc(nr_cpus * sizeof(int));
				+	if (!pmu_fd) {
				+		printf("malloc of pmu_fd failed\n");
				+		return;
				+	}
				+
				+	/* open perf_event on all cpus */
				+	for (i = 0; i < nr_cpus; i++) {
				+		pmu_fd[i] = perf_event_open(attr, -1, i, -1, 0);
				+		if (pmu_fd[i] < 0) {
				+			printf("perf_event_open failed\n");
				+			goto all_cpu_err;
				+		}
				+		/* keep the ioctls out of assert(): NDEBUG would remove
				+		 * the calls together with the check
				+		 */
				+		if (ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[0]) != 0 ||
				+		    ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0) != 0) {
				+			printf("attaching BPF program failed (errno %d)\n",
				+			       errno);
				+			/* this cpu's descriptor is open as well */
				+			i++;
				+			goto all_cpu_err;
				+		}
				+	}
				+	system("dd if=/dev/zero of=/dev/null count=5000k");
				+	print_stacks();
				+all_cpu_err:
				+	for (i--; i >= 0; i--)
				+		close(pmu_fd[i]);
				+	free(pmu_fd);
				+}
			
 
				+
			
 
				+/* Open the event described by @attr bound to the current task (pid 0,
				+ * any cpu), attach prog_fd[0], generate load with dd and print the
				+ * collected stacks. Failures are reported on stdout and the descriptor
				+ * is closed on every path.
				+ */
				+static void test_perf_event_task(struct perf_event_attr *attr)
				+{
				+	int pmu_fd;
				+
				+	/* open task bound event */
				+	pmu_fd = perf_event_open(attr, 0, -1, -1, 0);
				+	if (pmu_fd < 0) {
				+		printf("perf_event_open failed\n");
				+		return;
				+	}
				+	/* keep the ioctls out of assert(): NDEBUG would remove the calls */
				+	if (ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) != 0 ||
				+	    ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
				+		printf("attaching BPF program failed (errno %d)\n", errno);
				+		close(pmu_fd);
				+		return;
				+	}
				+	system("dd if=/dev/zero of=/dev/null count=5000k");
				+	print_stacks();
				+	close(pmu_fd);
				+}
			
 
				+
			
 
				+/* Run the all-cpu test and then the task-bound test, first with a
				+ * hardware cpu-cycles event and then with a software cpu-clock event,
				+ * both sampling at SAMPLE_FREQ.
				+ */
				+static void test_bpf_perf_event(void)
				+{
				+	struct perf_event_attr attrs[] = {
				+		{	/* hardware: cpu cycles */
				+			.type = PERF_TYPE_HARDWARE,
				+			.config = PERF_COUNT_HW_CPU_CYCLES,
				+			.sample_freq = SAMPLE_FREQ,
				+			.freq = 1,
				+			.inherit = 1,
				+		},
				+		{	/* software: cpu clock */
				+			.type = PERF_TYPE_SOFTWARE,
				+			.config = PERF_COUNT_SW_CPU_CLOCK,
				+			.sample_freq = SAMPLE_FREQ,
				+			.freq = 1,
				+			.inherit = 1,
				+		},
				+	};
				+	unsigned int idx;
				+
				+	for (idx = 0; idx < sizeof(attrs) / sizeof(attrs[0]); idx++) {
				+		test_perf_event_all_cpu(&attrs[idx]);
				+		test_perf_event_task(&attrs[idx]);
				+	}
				+}
			
 
				+
			
 
				+
			
 
				+/* trace_event entry point: lift the memlock limit, load kallsyms and
				+ * the BPF object "<argv[0]>_kern.o", fork a child that reads the trace
				+ * pipe and run the perf event tests in the parent.
				+ *
				+ * Returns 1 when /proc/kallsyms cannot be processed and 2 when the BPF
				+ * object fails to load.
				+ */
				+int main(int argc, char **argv)
				+{
				+	struct rlimit unlimited = {RLIM_INFINITY, RLIM_INFINITY};
				+	char obj_path[256];
				+
				+	setrlimit(RLIMIT_MEMLOCK, &unlimited);
				+	snprintf(obj_path, sizeof(obj_path), "%s_kern.o", argv[0]);
				+
				+	signal(SIGINT, int_exit);
				+
				+	if (load_kallsyms() != 0) {
				+		printf("failed to process /proc/kallsyms\n");
				+		return 1;
				+	}
				+
				+	if (load_bpf_file(obj_path) != 0) {
				+		printf("%s", bpf_log_buf);
				+		return 2;
				+	}
				+
				+	/* child: stream the trace pipe until it gets killed */
				+	if (!fork()) {
				+		read_trace_pipe();
				+		return 0;
				+	}
				+
				+	test_bpf_perf_event();
				+	int_exit(0);
				+	return 0;
				+}