|
@@ -0,0 +1,169 @@
|
|
|
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
|
|
|
+ *
|
|
|
+ * This program is free software; you can redistribute it and/or
|
|
|
+ * modify it under the terms of version 2 of the GNU General Public
|
|
|
+ * License as published by the Free Software Foundation.
|
|
|
+ *
|
|
|
+ * This program is distributed in the hope that it will be useful, but
|
|
|
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
+ * General Public License for more details.
|
|
|
+ */
|
|
|
+#include <linux/bpf.h>
|
|
|
+#include <linux/syscalls.h>
|
|
|
+#include <linux/slab.h>
|
|
|
+#include <linux/anon_inodes.h>
|
|
|
+
|
|
|
+static LIST_HEAD(bpf_map_types); /* map implementations added by bpf_register_map_type(), searched by find_and_alloc_map() */
|
|
|
+
|
|
|
+/* Look up the registered implementation whose type equals attr->map_type
+ * and ask it to allocate a map for these attributes.
+ *
+ * Returns the new map with ->ops and ->map_type filled in, the ERR_PTR()
+ * value handed back by ->map_alloc() when allocation fails, or
+ * ERR_PTR(-EINVAL) when no registered type matches.
+ */
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+	struct bpf_map_type_list *entry;
+	struct bpf_map *new_map;
+
+	list_for_each_entry(entry, &bpf_map_types, list_node) {
+		if (entry->type != attr->map_type)
+			continue;
+
+		/* first match wins: the walk ends here whether or not
+		 * the allocation succeeded
+		 */
+		new_map = entry->ops->map_alloc(attr);
+		if (!IS_ERR(new_map)) {
+			new_map->ops = entry->ops;
+			new_map->map_type = attr->map_type;
+		}
+		return new_map;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
|
|
|
+
|
|
|
+/* Boot time registration of a map implementation: links @tl into the
+ * bpf_map_types list. No locking is taken here.
+ */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+	struct list_head *node = &tl->list_node;
+
+	list_add(node, &bpf_map_types);
+}
|
|
|
+
|
|
|
+/* Work handler queued by bpf_map_put(), called from workqueue: recovers
+ * the map that embeds @work and hands it to the implementation's
+ * ->map_free().
+ */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+	struct bpf_map *dead_map;
+
+	dead_map = container_of(work, struct bpf_map, work);
+
+	/* how the map is released is up to the map implementation */
+	dead_map->ops->map_free(dead_map);
+}
|
|
|
+
|
|
|
+/* Drop one reference on @map. When the count reaches zero the map is not
+ * freed inline; bpf_map_free_deferred() is scheduled via workqueue
+ * instead, because the underlying map implementation's ops->map_free()
+ * might sleep.
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+	/* other references remain: nothing to do */
+	if (!atomic_dec_and_test(&map->refcnt))
+		return;
+
+	INIT_WORK(&map->work, bpf_map_free_deferred);
+	schedule_work(&map->work);
+}
|
|
|
+
|
|
|
+/* ->release handler of bpf_map_fops: drops one reference on the map kept
+ * in filp->private_data. @inode is not used. Always returns 0.
+ */
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+	bpf_map_put(filp->private_data);
+
+	return 0;
+}
|
|
|
+
|
|
|
+static const struct file_operations bpf_map_fops = { /* file operations handed to anon_inode_getfd() in map_create() */
|
|
|
+ .release = bpf_map_release, /* bpf_map_release() drops the reference on the map in filp->private_data */
|
|
|
+};
|
|
|
+
|
|
|
+/* helper macro to check that the unused fields of 'union bpf_attr' are zero
+ *
+ * Expects a 'union bpf_attr *attr' in scope and a CMD##_LAST_FIELD macro
+ * naming the last member that command CMD uses. Evaluates to true when
+ * memchr_inv() finds a non-zero byte between the end of that member and
+ * the end of the union.
+ *
+ * The whole expansion is parenthesized so it stays a single operand when a
+ * caller negates it or combines it with other operators.
+ */
+#define CHECK_ATTR(CMD) \
+	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+		    sizeof(attr->CMD##_LAST_FIELD), 0, \
+		    sizeof(*attr) - \
+		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)
|
|
|
+
|
|
|
+/* last member of 'union bpf_attr' that BPF_MAP_CREATE looks at */
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+
+/* BPF_MAP_CREATE command handler, called via syscall.
+ *
+ * Rejects attributes with non-zero bytes past max_entries, allocates a map
+ * of the requested type, gives it a reference count of one and wraps it in
+ * an anonymous inode file descriptor.
+ *
+ * Returns the value of anon_inode_getfd() (the new fd, or a negative error
+ * after the map has been freed again), -EINVAL when CHECK_ATTR() fails, or
+ * the error carried by find_and_alloc_map().
+ */
+static int map_create(union bpf_attr *attr)
+{
+	struct bpf_map *map;
+	int fd;
+
+	if (CHECK_ATTR(BPF_MAP_CREATE))
+		return -EINVAL;
+
+	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+	map = find_and_alloc_map(attr);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	/* the single reference is the one bpf_map_release() drops */
+	atomic_set(&map->refcnt, 1);
+
+	fd = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		/* failed to allocate fd: free the map right here */
+		map->ops->map_free(map);
+	}
+
+	return fd;
+}
|
|
|
+
|
|
|
+/* Check that the user bytes in [uaddr + known_size, uaddr + actual_size)
+ * are all zero.
+ *
+ * Returns 0 when they are, the get_user() error when a byte cannot be
+ * read, or -E2BIG at the first non-zero byte.
+ */
+static int bpf_check_attr_tail(void __user *uaddr, unsigned int known_size,
+			       unsigned int actual_size)
+{
+	unsigned char __user *cur = uaddr + known_size;
+	unsigned char __user *end = uaddr + actual_size;
+	unsigned char byte;
+	int err;
+
+	while (cur < end) {
+		err = get_user(byte, cur);
+		if (err)
+			return err;
+		if (byte)
+			return -E2BIG;
+		cur++;
+	}
+
+	return 0;
+}
+
+/* bpf() system call entry point.
+ *
+ * @cmd:   command to run; only BPF_MAP_CREATE is handled here
+ * @uattr: user pointer to the command's 'union bpf_attr'
+ * @size:  number of bytes user space provides at @uattr
+ *
+ * Returns the result of the command, or -EPERM without CAP_SYS_ADMIN,
+ * -EFAULT when access_ok() or copy_from_user() fails, -E2BIG when @size
+ * exceeds PAGE_SIZE or the bytes beyond the kernel's 'union bpf_attr' are
+ * not all zero, and -EINVAL for any other @cmd.
+ */
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+	union bpf_attr attr = {};
+	int err;
+
+	/* the syscall is limited to root temporarily. This restriction will be
+	 * lifted when security audit is clean. Note that eBPF+tracing must have
+	 * this restriction, since it may pass kernel data to user space
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!access_ok(VERIFY_READ, uattr, 1))
+		return -EFAULT;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		return -E2BIG;
+
+	/* If we're handed a bigger struct than we know of, ensure all the
+	 * unknown bits are 0 - i.e. new user-space does not rely on any
+	 * kernel feature extensions we don't know about yet.
+	 */
+	if (size > sizeof(attr)) {
+		err = bpf_check_attr_tail(uattr, sizeof(attr), size);
+		if (err)
+			return err;
+		size = sizeof(attr);
+	}
+
+	/* copy attributes from user space, may be less than sizeof(bpf_attr);
+	 * bytes not copied keep the zero they were initialized with
+	 */
+	if (copy_from_user(&attr, uattr, size))
+		return -EFAULT;
+
+	switch (cmd) {
+	case BPF_MAP_CREATE:
+		return map_create(&attr);
+	default:
+		return -EINVAL;
+	}
+}
|