
Merge branch 'bpf-helper-improvements'

Daniel Borkmann says:

====================
BPF helper improvements

This set adds various BPF helper improvements: it cleans up the
tracing helpers and adds the BPF_F_CURRENT_CPU flag to
bpf_perf_event_read(), enables preemption checks on the
bpf_get_smp_processor_id() helper, and adds the two new helpers
bpf_skb_change_{proto, type} for tc-related programs. For further
details, please see the individual patches.

Note, this set requires -net to be merged into the -net-next tree first.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
commit 545c321ba3 by David S. Miller
5 changed files with 275 additions and 21 deletions
  1. include/uapi/linux/bpf.h (+24, -1)
  2. kernel/bpf/core.c (+1, -2)
  3. kernel/bpf/helpers.c (+1, -1)
  4. kernel/trace/bpf_trace.c (+16, -16)
  5. net/core/filter.c (+233, -1)

include/uapi/linux/bpf.h (+24, -1)

@@ -313,6 +313,29 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_get_tunnel_opt,
 	BPF_FUNC_skb_set_tunnel_opt,
+
+	/**
+	 * bpf_skb_change_proto(skb, proto, flags)
+	 * Change protocol of the skb. Currently supported is
+	 * v4 -> v6, v6 -> v4 transitions. The helper will also
+	 * resize the skb. eBPF program is expected to fill the
+	 * new headers via skb_store_bytes and lX_csum_replace.
+	 * @skb: pointer to skb
+	 * @proto: new skb->protocol type
+	 * @flags: reserved
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_skb_change_proto,
+
+	/**
+	 * bpf_skb_change_type(skb, type)
+	 * Change packet type of skb.
+	 * @skb: pointer to skb
+	 * @type: new skb->pkt_type type
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_skb_change_type,
+
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -347,7 +370,7 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
-/* BPF_FUNC_perf_event_output flags. */
+/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
 

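To illustrate how the two new helpers are meant to be driven from a tc
program, below is a minimal restricted-C sketch. It is not part of this
series; the helper stubs, the byte-order macro (little-endian build host
assumed), the section name and the untagged-Ethernet assumption are mine.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/pkt_cls.h>

/* Pre-libbpf style stubs: calls are resolved by function ID at load
 * time. Real programs typically pull these from a helpers header
 * shipped with their toolchain.
 */
static int (*bpf_skb_change_proto)(void *ctx, __be16 proto, __u64 flags) =
	(void *) BPF_FUNC_skb_change_proto;
static int (*bpf_skb_change_type)(void *ctx, __u32 type) =
	(void *) BPF_FUNC_skb_change_type;

/* Constant byte-order helper; assumes a little-endian build host. */
#define __bpf_constant_htons(x)	__builtin_bswap16(x)

__attribute__((section("classifier"), used))
int xlat_v4_to_v6(struct __sk_buff *skb)
{
	/* The kernel only resizes the skb and flips skb->protocol; the
	 * program is expected to write the new IPv6 header itself via
	 * bpf_skb_store_bytes() and to fix checksums with the
	 * bpf_lX_csum_replace() helpers afterwards.
	 */
	if (bpf_skb_change_proto(skb, __bpf_constant_htons(ETH_P_IPV6), 0) < 0)
		return TC_ACT_SHOT;

	/* Mark the packet as destined to the local host (PACKET_HOST). */
	bpf_skb_change_type(skb, PACKET_HOST);

	/* ... fill in the IPv6 header here ... */
	return TC_ACT_OK;
}

Such an object would then be compiled with clang -target bpf and attached
in the tc fast path, e.g. via a clsact qdisc and tc filter add dev ... bpf
da obj prog.o sec classifier, which is where tc_cls_act_func_proto() below
exposes the two helpers.
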
kernel/bpf/core.c (+1, -2)

@@ -719,14 +719,13 @@ select_insn:
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-
 		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
 			goto out;
 
 		tail_call_cnt++;
 
 		prog = READ_ONCE(array->ptrs[index]);
-		if (unlikely(!prog))
+		if (!prog)
 			goto out;
 
 		/* ARG1 at this point is guaranteed to point to CTX from

kernel/bpf/helpers.c (+1, -1)

@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 
 static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
-	return raw_smp_processor_id();
+	return smp_processor_id();
 }
 
 const struct bpf_func_proto bpf_get_smp_processor_id_proto = {

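The generic helper can use the checked accessor because the program types
that keep it (tracing, and tc via the proto hook-up at the bottom of
net/core/filter.c) run in contexts that do not migrate between CPUs,
while socket filters are switched over to the new
bpf_get_raw_smp_processor_id_proto below. As a side note not taken from
the diff, the difference between the two accessors boils down to:

#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/types.h>

/* Illustration only: with CONFIG_DEBUG_PREEMPT, smp_processor_id()
 * warns if the caller is preemptible and could migrate to another CPU
 * right after reading the id; raw_smp_processor_id() performs no such
 * check.
 */
static unsigned int cpu_id_checked_vs_raw(bool pinned)
{
	unsigned int cpu;

	if (pinned) {
		preempt_disable();
		cpu = smp_processor_id();	/* ok: cannot migrate here */
		preempt_enable();
	} else {
		cpu = raw_smp_processor_id();	/* unchecked; may already be
						 * stale when it is used
						 */
	}

	return cpu;
}
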
kernel/trace/bpf_trace.c (+16, -16)

@@ -188,30 +188,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
-static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
 {
 	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	unsigned int cpu = smp_processor_id();
+	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 
+	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+		return -EINVAL;
+	if (index == BPF_F_CURRENT_CPU)
+		index = cpu;
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
 	ee = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!ee))
+	if (!ee)
 		return -ENOENT;
 
 	event = ee->event;
-	/* make sure event is local and doesn't have pmu::count */
-	if (event->oncpu != smp_processor_id() ||
-	    event->pmu->count)
-		return -EINVAL;
-
 	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
 		     event->attr.type != PERF_TYPE_RAW))
 		return -EINVAL;
 
+	/* make sure event is local and doesn't have pmu::count */
+	if (unlikely(event->oncpu != cpu || event->pmu->count))
+		return -EINVAL;
+
 	/*
 	 * we don't know if the function is run successfully by the
 	 * return value. It can be judged in other places, such as
@@ -233,6 +238,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
@@ -246,12 +252,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
 	if (index == BPF_F_CURRENT_CPU)
-		index = raw_smp_processor_id();
+		index = cpu;
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
 	ee = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!ee))
+	if (!ee)
 		return -ENOENT;
 
 	event = ee->event;
@@ -259,7 +265,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
 		return -EINVAL;
 
-	if (unlikely(event->oncpu != smp_processor_id()))
+	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
 	perf_sample_data_init(&sample_data, 0, 0);
@@ -354,18 +360,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 					enum bpf_reg_type *reg_type)
 {
-	/* check bounds */
 	if (off < 0 || off >= sizeof(struct pt_regs))
 		return false;
-
-	/* only read is allowed */
 	if (type != BPF_READ)
 		return false;
-
-	/* disallow misaligned access */
 	if (off % size != 0)
 		return false;
-
 	return true;
 }
 

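Since bpf_perf_event_read() now honours the same flags as
bpf_perf_event_output(), a tracing program can read the counter slot of
the CPU it is running on by passing BPF_F_CURRENT_CPU instead of
computing an index itself. A rough restricted-C sketch follows; the map
definition struct, the section naming and the probed symbol are loader
conventions and assumptions of mine, not something this patch mandates.

#include <linux/bpf.h>

static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) =
	(void *) BPF_FUNC_perf_event_read;

/* Map definition layout as consumed by samples/bpf-style loaders. */
struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
};

/* User space installs one perf event fd per possible CPU. */
__attribute__((section("maps"), used))
struct bpf_map_def counters = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(__u32),
	.max_entries	= 64,	/* >= number of possible CPUs */
};

__attribute__((section("kprobe/sys_write"), used))
int count_on_this_cpu(void *ctx)
{
	/* Read the counter bound to the current CPU's slot; no manual
	 * processor id lookup needed anymore.
	 */
	__u64 val = bpf_perf_event_read(&counters, BPF_F_CURRENT_CPU);

	(void)val;	/* e.g. aggregate into another map here */
	return 0;
}
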
net/core/filter.c (+233, -1)

@@ -150,6 +150,12 @@ static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return raw_smp_processor_id();
 }
 
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+	.func		= __get_raw_cpu_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
 			      struct bpf_insn *insn_buf)
 {
@@ -1777,6 +1783,224 @@ const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
 };
 EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
 
+static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
+{
+	/* Caller already did skb_cow() with len as headroom,
+	 * so no need to do it here.
+	 */
+	skb_push(skb, len);
+	memmove(skb->data, skb->data + len, off);
+	memset(skb->data + off, 0, len);
+
+	/* No skb_postpush_rcsum(skb, skb->data + off, len)
+	 * needed here as it does not change the skb->csum
+	 * result for checksum complete when summing over
+	 * zeroed blocks.
+	 */
+	return 0;
+}
+
+static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+	/* skb_ensure_writable() is not needed here, as we're
+	 * already working on an uncloned skb.
+	 */
+	if (unlikely(!pskb_may_pull(skb, off + len)))
+		return -ENOMEM;
+
+	skb_postpull_rcsum(skb, skb->data + off, len);
+	memmove(skb->data + len, skb->data, off);
+	__skb_pull(skb, len);
+
+	return 0;
+}
+
+static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
+{
+	bool trans_same = skb->transport_header == skb->network_header;
+	int ret;
+
+	/* There's no need for __skb_push()/__skb_pull() pair to
+	 * get to the start of the mac header as we're guaranteed
+	 * to always start from here under eBPF.
+	 */
+	ret = bpf_skb_generic_push(skb, off, len);
+	if (likely(!ret)) {
+		skb->mac_header -= len;
+		skb->network_header -= len;
+		if (trans_same)
+			skb->transport_header = skb->network_header;
+	}
+
+	return ret;
+}
+
+static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
+{
+	bool trans_same = skb->transport_header == skb->network_header;
+	int ret;
+
+	/* Same here, __skb_push()/__skb_pull() pair not needed. */
+	ret = bpf_skb_generic_pop(skb, off, len);
+	if (likely(!ret)) {
+		skb->mac_header += len;
+		skb->network_header += len;
+		if (trans_same)
+			skb->transport_header = skb->network_header;
+	}
+
+	return ret;
+}
+
+static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
+{
+	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+	u32 off = skb->network_header - skb->mac_header;
+	int ret;
+
+	ret = skb_cow(skb, len_diff);
+	if (unlikely(ret < 0))
+		return ret;
+
+	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
+	if (unlikely(ret < 0))
+		return ret;
+
+	if (skb_is_gso(skb)) {
+		/* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
+		 * be changed into SKB_GSO_TCPV6.
+		 */
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
+			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
+			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV6;
+		}
+
+		/* Due to IPv6 header, MSS needs to be downgraded. */
+		skb_shinfo(skb)->gso_size -= len_diff;
+		/* Header must be checked, and gso_segs recomputed. */
+		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+		skb_shinfo(skb)->gso_segs = 0;
+	}
+
+	skb->protocol = htons(ETH_P_IPV6);
+	skb_clear_hash(skb);
+
+	return 0;
+}
+
+static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
+{
+	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+	u32 off = skb->network_header - skb->mac_header;
+	int ret;
+
+	ret = skb_unclone(skb, GFP_ATOMIC);
+	if (unlikely(ret < 0))
+		return ret;
+
+	ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
+	if (unlikely(ret < 0))
+		return ret;
+
+	if (skb_is_gso(skb)) {
+		/* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
+		 * be changed into SKB_GSO_TCPV4.
+		 */
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
+			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
+			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV4;
+		}
+
+		/* Due to IPv4 header, MSS can be upgraded. */
+		skb_shinfo(skb)->gso_size += len_diff;
+		/* Header must be checked, and gso_segs recomputed. */
+		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+		skb_shinfo(skb)->gso_segs = 0;
+	}
+
+	skb->protocol = htons(ETH_P_IP);
+	skb_clear_hash(skb);
+
+	return 0;
+}
+
+static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
+{
+	__be16 from_proto = skb->protocol;
+
+	if (from_proto == htons(ETH_P_IP) &&
+	      to_proto == htons(ETH_P_IPV6))
+		return bpf_skb_proto_4_to_6(skb);
+
+	if (from_proto == htons(ETH_P_IPV6) &&
+	      to_proto == htons(ETH_P_IP))
+		return bpf_skb_proto_6_to_4(skb);
+
+	return -ENOTSUPP;
+}
+
+static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	__be16 proto = (__force __be16) r2;
+	int ret;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	/* General idea is that this helper does the basic groundwork
+	 * needed for changing the protocol, and eBPF program fills the
+	 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
+	 * and other helpers, rather than passing a raw buffer here.
+	 *
+	 * The rationale is to keep this minimal and without a need to
+	 * deal with raw packet data. F.e. even if we would pass buffers
+	 * here, the program still needs to call the bpf_lX_csum_replace()
+	 * helpers anyway. Plus, this way we keep also separation of
+	 * concerns, since f.e. bpf_skb_store_bytes() should only take
+	 * care of stores.
+	 *
+	 * Currently, additional options and extension header space are
+	 * not supported, but flags register is reserved so we can adapt
+	 * that. For offloads, we mark packet as dodgy, so that headers
+	 * need to be verified first.
+	 */
+	ret = bpf_skb_proto_xlat(skb, proto);
+	bpf_compute_data_end(skb);
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_proto_proto = {
+	.func		= bpf_skb_change_proto,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u32 pkt_type = r2;
+
+	/* We only allow a restricted subset to be changed for now. */
+	if (unlikely(skb->pkt_type > PACKET_OTHERHOST ||
+		     pkt_type > PACKET_OTHERHOST))
+		return -EINVAL;
+
+	skb->pkt_type = pkt_type;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_change_type_proto = {
+	.func		= bpf_skb_change_type,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_skb_data(void *func)
 {
 	if (func == bpf_skb_vlan_push)
@@ -1785,6 +2009,8 @@ bool bpf_helper_changes_skb_data(void *func)
 		return true;
 	if (func == bpf_skb_store_bytes)
 		return true;
+	if (func == bpf_skb_change_proto)
+		return true;
 	if (func == bpf_l3_csum_replace)
 		return true;
 	if (func == bpf_l4_csum_replace)
@@ -2037,7 +2263,7 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
-		return &bpf_get_smp_processor_id_proto;
+		return &bpf_get_raw_smp_processor_id_proto;
 	case BPF_FUNC_tail_call:
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_ktime_get_ns:
@@ -2072,6 +2298,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_vlan_push_proto;
 	case BPF_FUNC_skb_vlan_pop:
 		return &bpf_skb_vlan_pop_proto;
+	case BPF_FUNC_skb_change_proto:
+		return &bpf_skb_change_proto_proto;
+	case BPF_FUNC_skb_change_type:
+		return &bpf_skb_change_type_proto;
 	case BPF_FUNC_skb_get_tunnel_key:
 		return &bpf_skb_get_tunnel_key_proto;
 	case BPF_FUNC_skb_set_tunnel_key:
@@ -2086,6 +2316,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_route_realm_proto;
 	case BPF_FUNC_perf_event_output:
 		return bpf_get_event_output_proto();
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
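
To make the division of labour from the bpf_skb_change_proto() comment
concrete: after the helper has resized the skb and flipped skb->protocol,
the program writes the new network header on its own. The fragment below
is a hedged sketch, not part of this patch; the ETH_HLEN offset assumes
an untagged Ethernet frame and the stub follows the pre-libbpf
convention.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ipv6.h>

static int (*bpf_skb_store_bytes)(void *ctx, __u32 off, const void *from,
				  __u32 len, __u64 flags) =
	(void *) BPF_FUNC_skb_store_bytes;

/* Writes a prepared IPv6 header into the room made available by a
 * successful bpf_skb_change_proto(skb, htons(ETH_P_IPV6), 0) call;
 * L4 checksum fix-ups via bpf_l4_csum_replace() would follow.
 */
static inline int write_v6_hdr(struct __sk_buff *skb,
			       const struct ipv6hdr *h6)
{
	return bpf_skb_store_bytes(skb, ETH_HLEN, h6, sizeof(*h6), 0);
}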