11 years ago · 99c55f7d47
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg.
 
				 Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
			
 
				 32-bit immediate value into a register.
			
 
				 
			
 
				+eBPF maps
			
 
				+---------
			
 
				+'maps' are generic storage of different types for sharing data between kernel
			
 
				+and userspace.
			
 
				+
			
 
				+The maps are accessed from user space via BPF syscall, which has commands:
			
 
				+- create a map with given type and attributes
			
 
				+  map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
			
 
				+  using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
			
 
				+  returns process-local file descriptor or negative error
			
 
				+
			
 
				+- lookup key in a given map
			
 
				+  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
			
 
				+  using attr->map_fd, attr->key, attr->value
			
 
				+  returns zero and stores the found element into value, or a negative error
			
 
				+
			
 
				+- create or update key/value pair in a given map
			
 
				+  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
			
 
				+  using attr->map_fd, attr->key, attr->value
			
 
				+  returns zero or negative error
			
 
				+
			
 
				+- find and delete element by key in a given map
			
 
				+  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
			
 
				+  using attr->map_fd, attr->key
			
 
				+
			
 
				+- to delete map: close(fd)
			
 
				+  Exiting process will delete maps automatically
			
 
				+
			
 
				+Userspace programs use this syscall to create/access maps that eBPF programs
			
 
				+are concurrently updating.
			
 
				+
			
 
				+Maps can have different types: hash, array, bloom filter, radix-tree, etc.
			
 
				+
			
 
				+The map is defined by:
			
 
				+  . type
			
 
				+  . max number of elements
			
 
				+  . key size in bytes
			
 
				+  . value size in bytes
			
 
				+
			
 
				 Testing
			
 
				 -------
			
 
				 
			
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -0,0 +1,41 @@
 
				+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of version 2 of the GNU General Public
			
 
				+ * License as published by the Free Software Foundation.
			
 
				+ */
			
 
				+#ifndef _LINUX_BPF_H
			
 
				+#define _LINUX_BPF_H 1
			
 
				+
			
 
				+#include <uapi/linux/bpf.h>
			
 
				+#include <linux/workqueue.h>
			
 
				+
			
 
				+struct bpf_map;
			
 
				+
			
 
				+/* map is generic key/value storage optionally accessible by eBPF programs */
			
 
				+struct bpf_map_ops {
			
 
				+	/* funcs callable from userspace (via syscall) */
			
 
				+	struct bpf_map *(*map_alloc)(union bpf_attr *attr);
			
 
				+	void (*map_free)(struct bpf_map *);
			
 
				+};
			
 
				+
			
 
				+struct bpf_map {
			
 
				+	atomic_t refcnt;
			
 
				+	enum bpf_map_type map_type;
			
 
				+	u32 key_size;
			
 
				+	u32 value_size;
			
 
				+	u32 max_entries;
			
 
				+	struct bpf_map_ops *ops;
			
 
				+	struct work_struct work;
			
 
				+};
			
 
				+
			
 
				+struct bpf_map_type_list {
			
 
				+	struct list_head list_node;
			
 
				+	struct bpf_map_ops *ops;
			
 
				+	enum bpf_map_type type;
			
 
				+};
			
 
				+
			
 
				+void bpf_register_map_type(struct bpf_map_type_list *tl);
			
 
				+void bpf_map_put(struct bpf_map *map);
			
 
				+
			
 
				+#endif /* _LINUX_BPF_H */
			
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -62,4 +62,27 @@ struct bpf_insn {
 
				 	__s32	imm;		/* signed immediate constant */
			
 
				 };
			
 
				 
			
 
				+/* BPF syscall commands */
			
 
				+enum bpf_cmd {
			
 
				+	/* create a map with given type and attributes
			
 
				+	 * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
			
 
				+	 * returns fd or negative error
			
 
				+	 * map is deleted when fd is closed
			
 
				+	 */
			
 
				+	BPF_MAP_CREATE,
			
 
				+};
			
 
				+
			
 
				+enum bpf_map_type {
			
 
				+	BPF_MAP_TYPE_UNSPEC,
			
 
				+};
			
 
				+
			
 
				+union bpf_attr {
			
 
				+	struct { /* anonymous struct used by BPF_MAP_CREATE command */
			
 
				+		__u32	map_type;	/* one of enum bpf_map_type */
			
 
				+		__u32	key_size;	/* size of key in bytes */
			
 
				+		__u32	value_size;	/* size of value in bytes */
			
 
				+		__u32	max_entries;	/* max number of entries in a map */
			
 
				+	};
			
 
				+} __attribute__((aligned(8)));
			
 
				+
			
 
				 #endif /* _UAPI__LINUX_BPF_H__ */
			
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
 
				-obj-y := core.o
			
 
				+obj-y := core.o syscall.o
			
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,169 @@
 
				+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of version 2 of the GNU General Public
			
 
				+ * License as published by the Free Software Foundation.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
			
 
				+ * General Public License for more details.
			
 
				+ */
			
 
				+#include <linux/bpf.h>
			
 
				+#include <linux/syscalls.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/anon_inodes.h>
			
 
				+
			
 
				+static LIST_HEAD(bpf_map_types);
			
 
				+
			
 
				+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
			
 
				+{
			
 
				+	struct bpf_map_type_list *tl;
			
 
				+	struct bpf_map *map;
			
 
				+
			
 
				+	list_for_each_entry(tl, &bpf_map_types, list_node) {
			
 
				+		if (tl->type == attr->map_type) {
			
 
				+			map = tl->ops->map_alloc(attr);
			
 
				+			if (IS_ERR(map))
			
 
				+				return map;
			
 
				+			map->ops = tl->ops;
			
 
				+			map->map_type = attr->map_type;
			
 
				+			return map;
			
 
				+		}
			
 
				+	}
			
 
				+	return ERR_PTR(-EINVAL);
			
 
				+}
			
 
				+
			
 
				+/* boot time registration of different map implementations */
			
 
				+void bpf_register_map_type(struct bpf_map_type_list *tl)
			
 
				+{
			
 
				+	list_add(&tl->list_node, &bpf_map_types);
			
 
				+}
			
 
				+
			
 
				+/* called from workqueue */
			
 
				+static void bpf_map_free_deferred(struct work_struct *work)
			
 
				+{
			
 
				+	struct bpf_map *map = container_of(work, struct bpf_map, work);
			
 
				+
			
 
				+	/* implementation dependent freeing */
			
 
				+	map->ops->map_free(map);
			
 
				+}
			
 
				+
			
 
				+/* decrement map refcnt and schedule it for freeing via workqueue
			
 
				+ * (underlying map implementation ops->map_free() might sleep)
			
 
				+ */
			
 
				+void bpf_map_put(struct bpf_map *map)
			
 
				+{
			
 
				+	if (atomic_dec_and_test(&map->refcnt)) {
			
 
				+		INIT_WORK(&map->work, bpf_map_free_deferred);
			
 
				+		schedule_work(&map->work);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int bpf_map_release(struct inode *inode, struct file *filp)
			
 
				+{
			
 
				+	struct bpf_map *map = filp->private_data;
			
 
				+
			
 
				+	bpf_map_put(map);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static const struct file_operations bpf_map_fops = {
			
 
				+	.release = bpf_map_release,
			
 
				+};
			
 
				+
			
 
				+/* helper macro to check that unused fields of 'union bpf_attr' are zero */
			
 
				+#define CHECK_ATTR(CMD) \
			
 
				+	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
			
 
				+		   sizeof(attr->CMD##_LAST_FIELD), 0, \
			
 
				+		   sizeof(*attr) - \
			
 
				+		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
			
 
				+		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
			
 
				+
			
 
				+#define BPF_MAP_CREATE_LAST_FIELD max_entries
			
 
				+/* called via syscall */
			
 
				+static int map_create(union bpf_attr *attr)
			
 
				+{
			
 
				+	struct bpf_map *map;
			
 
				+	int err;
			
 
				+
			
 
				+	err = CHECK_ATTR(BPF_MAP_CREATE);
			
 
				+	if (err)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
			
 
				+	map = find_and_alloc_map(attr);
			
 
				+	if (IS_ERR(map))
			
 
				+		return PTR_ERR(map);
			
 
				+
			
 
				+	atomic_set(&map->refcnt, 1);
			
 
				+
			
 
				+	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
			
 
				+
			
 
				+	if (err < 0)
			
 
				+		/* failed to allocate fd */
			
 
				+		goto free_map;
			
 
				+
			
 
				+	return err;
			
 
				+
			
 
				+free_map:
			
 
				+	map->ops->map_free(map);
			
 
				+	return err;
			
 
				+}
			
 
				+
			
 
				+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
			
 
				+{
			
 
				+	union bpf_attr attr = {};
			
 
				+	int err;
			
 
				+
			
 
				+	/* the syscall is limited to root temporarily. This restriction will be
			
 
				+	 * lifted when security audit is clean. Note that eBPF+tracing must have
			
 
				+	 * this restriction, since it may pass kernel data to user space
			
 
				+	 */
			
 
				+	if (!capable(CAP_SYS_ADMIN))
			
 
				+		return -EPERM;
			
 
				+
			
 
				+	if (!access_ok(VERIFY_READ, uattr, 1))
			
 
				+		return -EFAULT;
			
 
				+
			
 
				+	if (size > PAGE_SIZE)	/* silly large */
			
 
				+		return -E2BIG;
			
 
				+
			
 
				+	/* If we're handed a bigger struct than we know of,
			
 
				+	 * ensure all the unknown bits are 0 - i.e. new
			
 
				+	 * user-space does not rely on any kernel feature
			
 
				+	 * extensions we don't know about yet.
			
 
				+	 */
			
 
				+	if (size > sizeof(attr)) {
			
 
				+		unsigned char __user *addr;
			
 
				+		unsigned char __user *end;
			
 
				+		unsigned char val;
			
 
				+
			
 
				+		addr = (void __user *)uattr + sizeof(attr);
			
 
				+		end  = (void __user *)uattr + size;
			
 
				+
			
 
				+		for (; addr < end; addr++) {
			
 
				+			err = get_user(val, addr);
			
 
				+			if (err)
			
 
				+				return err;
			
 
				+			if (val)
			
 
				+				return -E2BIG;
			
 
				+		}
			
 
				+		size = sizeof(attr);
			
 
				+	}
			
 
				+
			
 
				+	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
			
 
				+	if (copy_from_user(&attr, uattr, size) != 0)
			
 
				+		return -EFAULT;
			
 
				+
			
 
				+	switch (cmd) {
			
 
				+	case BPF_MAP_CREATE:
			
 
				+		err = map_create(&attr);
			
 
				+		break;
			
 
				+	default:
			
 
				+		err = -EINVAL;
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	return err;
			
 
				+}