@@ -1350,14 +1350,18 @@ struct bpf_scratchpad {
 
 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
 
+static inline int __bpf_try_make_writable(struct sk_buff *skb,
+					   unsigned int write_len)
+{
+	return skb_ensure_writable(skb, write_len);
+}
+
 static inline int bpf_try_make_writable(struct sk_buff *skb,
 					unsigned int write_len)
 {
-	int err;
+	int err = __bpf_try_make_writable(skb, write_len);
 
-	err = skb_ensure_writable(skb, write_len);
 	bpf_compute_data_end(skb);
-
 	return err;
 }
 
@@ -1976,8 +1980,8 @@ static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	u32 pkt_type = r2;
 
 	/* We only allow a restricted subset to be changed for now. */
-	if (unlikely(skb->pkt_type > PACKET_OTHERHOST ||
-		     pkt_type > PACKET_OTHERHOST))
+	if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
+		     !skb_pkt_type_ok(pkt_type)))
 		return -EINVAL;
 
 	skb->pkt_type = pkt_type;
@@ -1992,6 +1996,92 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static u32 __bpf_skb_min_len(const struct sk_buff *skb)
+{
+	u32 min_len = skb_network_offset(skb);
+
+	if (skb_transport_header_was_set(skb))
+		min_len = skb_transport_offset(skb);
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		min_len = skb_checksum_start_offset(skb) +
+			  skb->csum_offset + sizeof(__sum16);
+	return min_len;
+}
+
+static u32 __bpf_skb_max_len(const struct sk_buff *skb)
+{
+	return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
+			  65536;
+}
+
+static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+	unsigned int old_len = skb->len;
+	int ret;
+
+	ret = __skb_grow_rcsum(skb, new_len);
+	if (!ret)
+		memset(skb->data + old_len, 0, new_len - old_len);
+	return ret;
+}
+
+static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
+{
+	return __skb_trim_rcsum(skb, new_len);
+}
+
+static u64 bpf_skb_change_tail(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *)(long) r1;
+	u32 max_len = __bpf_skb_max_len(skb);
+	u32 min_len = __bpf_skb_min_len(skb);
+	u32 new_len = (u32) r2;
+	int ret;
+
+	if (unlikely(flags || new_len > max_len || new_len < min_len))
+		return -EINVAL;
+	if (skb->encapsulation)
+		return -ENOTSUPP;
+
+	/* The basic idea of this helper is that it's performing the
+	 * needed work to either grow or trim an skb, and eBPF program
+	 * rewrites the rest via helpers like bpf_skb_store_bytes(),
+	 * bpf_lX_csum_replace() and others rather than passing a raw
+	 * buffer here. This one is a slow path helper and intended
+	 * for replies with control messages.
+	 *
+	 * Like in bpf_skb_change_proto(), we want to keep this rather
+	 * minimal and without protocol specifics so that we are able
+	 * to separate concerns as in bpf_skb_store_bytes() should only
+	 * be the one responsible for writing buffers.
+	 *
+	 * It's really expected to be a slow path operation here for
+	 * control message replies, so we're implicitly linearizing,
+	 * uncloning and drop offloads from the skb by this.
+	 */
+	ret = __bpf_try_make_writable(skb, skb->len);
+	if (!ret) {
+		if (new_len > skb->len)
+			ret = bpf_skb_grow_rcsum(skb, new_len);
+		else if (new_len < skb->len)
+			ret = bpf_skb_trim_rcsum(skb, new_len);
+		if (!ret && skb_is_gso(skb))
+			skb_gso_reset(skb);
+	}
+
+	bpf_compute_data_end(skb);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_tail_proto = {
+	.func		= bpf_skb_change_tail,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_skb_data(void *func)
 {
 	if (func == bpf_skb_vlan_push)
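
To illustrate the usage pattern the comment above describes — resize first with bpf_skb_change_tail(), then rewrite the payload through bpf_skb_store_bytes() rather than a raw buffer — here is a minimal, hypothetical tc (BPF_PROG_TYPE_SCHED_CLS) program sketch. It is not part of the patch; it assumes libbpf-style bpf_helpers.h wrappers, and REPLY_LEN, the section name and the marker byte are made up for illustration.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define REPLY_LEN	64	/* hypothetical size of the control-message reply */

SEC("tc")
int trim_to_reply(struct __sk_buff *skb)
{
	__u8 marker = 0xff;

	/* Grow or trim the skb to REPLY_LEN; flags must currently be 0. */
	if (bpf_skb_change_tail(skb, REPLY_LEN, 0) < 0)
		return TC_ACT_SHOT;

	/* Rewrite payload via the store-bytes helper instead of touching
	 * packet memory directly; bytes gained by growing arrive zeroed.
	 */
	if (bpf_skb_store_bytes(skb, REPLY_LEN - 1, &marker, sizeof(marker), 0) < 0)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";
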
@@ -2002,6 +2092,8 @@ bool bpf_helper_changes_skb_data(void *func)
 		return true;
 	if (func == bpf_skb_change_proto)
 		return true;
+	if (func == bpf_skb_change_tail)
+		return true;
 	if (func == bpf_l3_csum_replace)
 		return true;
 	if (func == bpf_l4_csum_replace)
@@ -2282,7 +2374,6 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 	}
 }
 
-#ifdef CONFIG_SOCK_CGROUP_DATA
 static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(long)r1;
@@ -2303,7 +2394,7 @@ static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	if (unlikely(!cgrp))
 		return -EAGAIN;
 
-	return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
+	return sk_under_cgroup_hierarchy(sk, cgrp);
 }
 
 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
@@ -2314,7 +2405,41 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 };
-#endif
+
+static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+				  unsigned long off, unsigned long len)
+{
+	memcpy(dst_buff, src_buff + off, len);
+	return 0;
+}
+
+static u64 bpf_xdp_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
+				u64 meta_size)
+{
+	struct xdp_buff *xdp = (struct xdp_buff *)(long) r1;
+	struct bpf_map *map = (struct bpf_map *)(long) r2;
+	u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
+	void *meta = (void *)(long) r4;
+
+	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
+		return -EINVAL;
+	if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+		return -EFAULT;
+
+	return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size,
+				bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_event_output_proto = {
+	.func		= bpf_xdp_event_output,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
 
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
@@ -2368,6 +2493,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_change_proto_proto;
 	case BPF_FUNC_skb_change_type:
 		return &bpf_skb_change_type_proto;
+	case BPF_FUNC_skb_change_tail:
+		return &bpf_skb_change_tail_proto;
 	case BPF_FUNC_skb_get_tunnel_key:
 		return &bpf_skb_get_tunnel_key_proto;
 	case BPF_FUNC_skb_set_tunnel_key:
@@ -2386,10 +2513,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_event_output_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
-#ifdef CONFIG_SOCK_CGROUP_DATA
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
-#endif
 	default:
 		return sk_filter_func_proto(func_id);
 	}
@@ -2398,7 +2523,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 static const struct bpf_func_proto *
 xdp_func_proto(enum bpf_func_id func_id)
 {
-	return sk_filter_func_proto(func_id);
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+		return &bpf_xdp_event_output_proto;
+	default:
+		return sk_filter_func_proto(func_id);
+	}
 }
 
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
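
With bpf_xdp_event_output() wired up above, an XDP program can call bpf_perf_event_output() and use the upper 32 bits of the flags word (BPF_F_CTXLEN_MASK) to ask the kernel to append that many packet bytes after its own metadata, bounded by the -EFAULT check against xdp->data_end - xdp->data. Below is a minimal sketch of such a program, not part of the patch; it assumes libbpf-style headers and BTF map definitions, and the map name, metadata struct and sample size are illustrative only.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(__u32));
} xdp_events SEC(".maps");

struct sample_meta {
	__u16 pkt_len;			/* hypothetical per-sample metadata */
};

SEC("xdp")
int xdp_sample(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;
	struct sample_meta meta;
	__u64 sample_len = 64;		/* packet bytes to copy out */
	__u64 flags;

	if (data >= data_end)
		return XDP_PASS;

	meta.pkt_len = (__u16)(data_end - data);
	if (sample_len > meta.pkt_len)
		sample_len = meta.pkt_len;

	/* Upper 32 bits: packet bytes to append; lower bits: CPU index. */
	flags = (sample_len << 32) | BPF_F_CURRENT_CPU;
	bpf_perf_event_output(ctx, &xdp_events, flags, &meta, sizeof(meta));

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
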