@@ -116,6 +116,8 @@ struct vhost_net_virtqueue {
	 * For RX, number of batched heads
	 */
	int done_idx;
+	/* Number of XDP frames batched */
+	int batched_xdp;
	/* an array of userspace buffers info */
	struct ubuf_info *ubuf_info;
	/* Reference counting for outstanding ubufs.
@@ -123,6 +125,8 @@ struct vhost_net_virtqueue {
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
+	/* Batched XDP buffs */
+	struct xdp_buff *xdp;
 };
 
 struct vhost_net {
@@ -338,6 +342,11 @@ static bool vhost_sock_zcopy(struct socket *sock)
		sock_flag(sock->sk, SOCK_ZEROCOPY);
 }
 
+static bool vhost_sock_xdp(struct socket *sock)
+{
+	return sock_flag(sock->sk, SOCK_XDP);
+}
+
 /* In case of DMA done not in order in lower device driver for some reason.
  * upend_idx is used to track end of used idx, done_idx is used to track head
  * of used idx. Once lower device DMA done contiguously, we will signal KVM
@@ -444,10 +453,37 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
	nvq->done_idx = 0;
 }
 
+static void vhost_tx_batch(struct vhost_net *net,
+			   struct vhost_net_virtqueue *nvq,
+			   struct socket *sock,
+			   struct msghdr *msghdr)
+{
+	struct tun_msg_ctl ctl = {
+		.type = TUN_MSG_PTR,
+		.num = nvq->batched_xdp,
+		.ptr = nvq->xdp,
+	};
+	int err;
+
+	if (nvq->batched_xdp == 0)
+		goto signal_used;
+
+	msghdr->msg_control = &ctl;
+	err = sock->ops->sendmsg(sock, msghdr, 0);
+	if (unlikely(err < 0)) {
+		vq_err(&nvq->vq, "Fail to batch sending packets\n");
+		return;
+	}
+
+signal_used:
+	vhost_net_signal_used(nvq);
+	nvq->batched_xdp = 0;
+}
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
				    struct vhost_net_virtqueue *nvq,
				    unsigned int *out_num, unsigned int *in_num,
-				    bool *busyloop_intr)
+				    struct msghdr *msghdr, bool *busyloop_intr)
 {
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned long uninitialized_var(endtime);
@@ -455,8 +491,9 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
			      out_num, in_num, NULL, NULL);
 
	if (r == vq->num && vq->busyloop_timeout) {
+		/* Flush batched packets first */
		if (!vhost_sock_zcopy(vq->private_data))
-			vhost_net_signal_used(nvq);
+			vhost_tx_batch(net, nvq, vq->private_data, msghdr);
		preempt_disable();
		endtime = busy_clock() + vq->busyloop_timeout;
		while (vhost_can_busy_poll(endtime)) {
@@ -512,7 +549,7 @@ static int get_tx_bufs(struct vhost_net *net,
	struct vhost_virtqueue *vq = &nvq->vq;
	int ret;
 
-	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
+	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
 
	if (ret < 0 || ret == vq->num)
		return ret;
@@ -540,6 +577,80 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
	       !vhost_vq_avail_empty(vq->dev, vq);
 }
 
+#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
+static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
+			       struct iov_iter *from)
+{
+	struct vhost_virtqueue *vq = &nvq->vq;
+	struct socket *sock = vq->private_data;
+	struct page_frag *alloc_frag = &current->task_frag;
+	struct virtio_net_hdr *gso;
+	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
+	struct tun_xdp_hdr *hdr;
+	size_t len = iov_iter_count(from);
+	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
+	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
+	int sock_hlen = nvq->sock_hlen;
+	void *buf;
+	int copied;
+
+	if (unlikely(len < nvq->sock_hlen))
+		return -EFAULT;
+
+	if (SKB_DATA_ALIGN(len + pad) +
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+		return -ENOSPC;
+
+	buflen += SKB_DATA_ALIGN(len + pad);
+	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+		return -ENOMEM;
+
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset +
+				     offsetof(struct tun_xdp_hdr, gso),
+				     sock_hlen, from);
+	if (copied != sock_hlen)
+		return -EFAULT;
+
+	hdr = buf;
+	gso = &hdr->gso;
+
+	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+	    vhost16_to_cpu(vq, gso->csum_start) +
+	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
+	    vhost16_to_cpu(vq, gso->hdr_len)) {
+		gso->hdr_len = cpu_to_vhost16(vq,
+			       vhost16_to_cpu(vq, gso->csum_start) +
+			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
+
+		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
+			return -EINVAL;
+	}
+
+	len -= sock_hlen;
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset + pad,
+				     len, from);
+	if (copied != len)
+		return -EFAULT;
+
+	xdp->data_hard_start = buf;
+	xdp->data = buf + pad;
+	xdp->data_end = xdp->data + len;
+	hdr->buflen = buflen;
+
+	get_page(alloc_frag->page);
+	alloc_frag->offset += buflen;
+
+	++nvq->batched_xdp;
+
+	return 0;
+}
+
 static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 {
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -556,10 +667,14 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
	size_t len, total_len = 0;
	int err;
	int sent_pkts = 0;
+	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
 
	for (;;) {
		bool busyloop_intr = false;
 
+		if (nvq->done_idx == VHOST_NET_BATCH)
+			vhost_tx_batch(net, nvq, sock, &msg);
+
		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
@@ -577,14 +692,34 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
			break;
		}
 
-		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
-		vq->heads[nvq->done_idx].len = 0;
-
		total_len += len;
-		if (tx_can_batch(vq, total_len))
-			msg.msg_flags |= MSG_MORE;
-		else
-			msg.msg_flags &= ~MSG_MORE;
+
+		/* For simplicity, TX batching is only enabled if
+		 * sndbuf is unlimited.
+		 */
+		if (sock_can_batch) {
+			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
+			if (!err) {
+				goto done;
+			} else if (unlikely(err != -ENOSPC)) {
+				vhost_tx_batch(net, nvq, sock, &msg);
+				vhost_discard_vq_desc(vq, 1);
+				vhost_net_enable_vq(net, vq);
+				break;
+			}
+
+			/* We can't build XDP buff, go for single
+			 * packet path but let's flush batched
+			 * packets.
+			 */
+			vhost_tx_batch(net, nvq, sock, &msg);
+			msg.msg_control = NULL;
+		} else {
+			if (tx_can_batch(vq, total_len))
+				msg.msg_flags |= MSG_MORE;
+			else
+				msg.msg_flags &= ~MSG_MORE;
+		}
 
		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
@@ -596,15 +731,17 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
		if (err != len)
			pr_debug("Truncated TX packet: len %d != %zd\n",
				 err, len);
-		if (++nvq->done_idx >= VHOST_NET_BATCH)
-			vhost_net_signal_used(nvq);
+done:
+		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+		vq->heads[nvq->done_idx].len = 0;
+		++nvq->done_idx;
		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
 
-	vhost_net_signal_used(nvq);
+	vhost_tx_batch(net, nvq, sock, &msg);
 }
 
 static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
@@ -620,6 +757,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
+	struct tun_msg_ctl ctl;
	size_t len, total_len = 0;
	int err;
	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -664,8 +802,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			refcount_set(&ubuf->refcnt, 1);
-			msg.msg_control = ubuf;
-			msg.msg_controllen = sizeof(ubuf);
+			msg.msg_control = &ctl;
+			ctl.type = TUN_MSG_UBUF;
+			ctl.ptr = ubuf;
+			msg.msg_controllen = sizeof(ctl);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
@@ -1078,6 +1218,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	void **queue;
+	struct xdp_buff *xdp;
	int i;
 
	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
@@ -1098,6 +1239,15 @@ static int vhost_net_open(struct inode *inode, struct file *f)
	}
	n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;
 
+	xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
+	if (!xdp) {
+		kfree(vqs);
+		kvfree(n);
+		kfree(queue);
+		return -ENOMEM;
+	}
+	n->vqs[VHOST_NET_VQ_TX].xdp = xdp;
+
	dev = &n->dev;
	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
@@ -1108,6 +1257,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
		n->vqs[i].ubuf_info = NULL;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].done_idx = 0;
+		n->vqs[i].batched_xdp = 0;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		n->vqs[i].rx_ring = NULL;
@@ -1191,6 +1341,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
+	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
	kfree(n->dev.vqs);
	kvfree(n);
	return 0;