@@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	return 0;
 }
 
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+				   void *value)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* per_cpu areas are zero-filled and bpf programs can only
+	 * access 'value_size' of them, so copying rounded areas
+	 * will not leak any kernel data
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(value + off,
+				per_cpu_ptr(storage->percpu_buf, cpu), size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+				     void *value, u64 map_flags)
+{
+	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+	struct bpf_cgroup_storage_key *key = _key;
+	struct bpf_cgroup_storage *storage;
+	int cpu, off = 0;
+	u32 size;
+
+	if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
+		return -EINVAL;
+
+	rcu_read_lock();
+	storage = cgroup_storage_lookup(map, key, false);
+	if (!storage) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+
+	/* the user space will provide round_up(value_size, 8) bytes that
+	 * will be copied into per-cpu area. bpf programs can only access
+	 * value_size of it. During lookup the same extra bytes will be
+	 * returned or zeros which were zero-filled by percpu_alloc,
+	 * so no kernel data leaks possible
+	 */
+	size = round_up(_map->value_size, 8);
+	for_each_possible_cpu(cpu) {
+		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
+				value + off, size);
+		off += size;
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
 				       void *_next_key)
 {
@@ -287,60 +352,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
 	spin_unlock_bh(&map->lock);
 }
 
+static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
+{
+	size_t size;
+
+	if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) {
+		size = sizeof(struct bpf_storage_buffer) + map->value_size;
+		*pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	} else {
+		size = map->value_size;
+		*pages = round_up(round_up(size, 8) * num_possible_cpus(),
+				  PAGE_SIZE) >> PAGE_SHIFT;
+	}
+
+	return size;
+}
+
 struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
 					enum bpf_cgroup_storage_type stype)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_map *map;
+	gfp_t flags;
+	size_t size;
 	u32 pages;
 
 	map = prog->aux->cgroup_storage[stype];
 	if (!map)
 		return NULL;
 
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+	size = bpf_cgroup_storage_calculate_size(map, &pages);
+
 	if (bpf_map_charge_memlock(map, pages))
 		return ERR_PTR(-EPERM);
 
 	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
 			       __GFP_ZERO | GFP_USER, map->numa_node);
-	if (!storage) {
-		bpf_map_uncharge_memlock(map, pages);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!storage)
+		goto enomem;
 
-	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
-				    map->value_size, __GFP_ZERO | GFP_USER,
-				    map->numa_node);
-	if (!storage->buf) {
-		bpf_map_uncharge_memlock(map, pages);
-		kfree(storage);
-		return ERR_PTR(-ENOMEM);
+	flags = __GFP_ZERO | GFP_USER;
+
+	if (stype == BPF_CGROUP_STORAGE_SHARED) {
+		storage->buf = kmalloc_node(size, flags, map->numa_node);
+		if (!storage->buf)
+			goto enomem;
+	} else {
+		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
+		if (!storage->percpu_buf)
+			goto enomem;
 	}
 
 	storage->map = (struct bpf_cgroup_storage_map *)map;
 
 	return storage;
+
+enomem:
+	bpf_map_uncharge_memlock(map, pages);
+	kfree(storage);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	kfree(storage->buf);
+	kfree(storage);
+}
+
+static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+	struct bpf_cgroup_storage *storage =
+		container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+	free_percpu(storage->percpu_buf);
+	kfree(storage);
 }
 
 void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
 {
-	u32 pages;
+	enum bpf_cgroup_storage_type stype;
 	struct bpf_map *map;
+	u32 pages;
 
 	if (!storage)
 		return;
 
 	map = &storage->map->map;
-	pages = round_up(sizeof(struct bpf_cgroup_storage) +
-			 sizeof(struct bpf_storage_buffer) +
-			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+
+	bpf_cgroup_storage_calculate_size(map, &pages);
 	bpf_map_uncharge_memlock(map, pages);
 
-	kfree_rcu(storage->buf, rcu);
-	kfree_rcu(storage, rcu);
+	stype = cgroup_storage_type(map);
+	if (stype == BPF_CGROUP_STORAGE_SHARED)
+		call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
+	else
+		call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
 }
 
 void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,