Browse Source

Merge branch 'XDP-transmission-for-tuntap'

Jason Wang says:

====================
XDP transmission for tuntap

This series tries to implement XDP transmission (ndo_xdp_xmit) for
tuntap. Pointer ring was used for queuing both XDP buffers and
sk_buff, this is done by encoding the type into lowest bit of the
pointer and storing XDP metadata in the headroom of the XDP buff.

Tests get 3.05 Mpps when doing xdp_redirect_map from ixgbe to VM
(testpmd + virtio-net in guest). This gives us ~20% improvement
compared to using skb during redirect.

Please review.

Changes from V1:

- silence warnings
- fix typos
- add skb mode number in the commit log
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller 7 years ago
parent
commit
e8b18af8c3
5 changed files with 269 additions and 90 deletions
  1. 21 20
      drivers/net/tap.c
  2. 194 45
      drivers/net/tun.c
  3. 32 20
      drivers/vhost/net.c
  4. 3 3
      include/linux/if_tap.h
  5. 19 2
      include/linux/if_tun.h

+ 21 - 20
drivers/net/tap.c

@@ -330,7 +330,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 	if (!q)
 	if (!q)
 		return RX_HANDLER_PASS;
 		return RX_HANDLER_PASS;
 
 
-	if (__skb_array_full(&q->skb_array))
+	if (__ptr_ring_full(&q->ring))
 		goto drop;
 		goto drop;
 
 
 	skb_push(skb, ETH_HLEN);
 	skb_push(skb, ETH_HLEN);
@@ -348,7 +348,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 			goto drop;
 			goto drop;
 
 
 		if (!segs) {
 		if (!segs) {
-			if (skb_array_produce(&q->skb_array, skb))
+			if (ptr_ring_produce(&q->ring, skb))
 				goto drop;
 				goto drop;
 			goto wake_up;
 			goto wake_up;
 		}
 		}
@@ -358,7 +358,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 			struct sk_buff *nskb = segs->next;
 			struct sk_buff *nskb = segs->next;
 
 
 			segs->next = NULL;
 			segs->next = NULL;
-			if (skb_array_produce(&q->skb_array, segs)) {
+			if (ptr_ring_produce(&q->ring, segs)) {
 				kfree_skb(segs);
 				kfree_skb(segs);
 				kfree_skb_list(nskb);
 				kfree_skb_list(nskb);
 				break;
 				break;
@@ -375,7 +375,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 		    !(features & NETIF_F_CSUM_MASK) &&
 		    !(features & NETIF_F_CSUM_MASK) &&
 		    skb_checksum_help(skb))
 		    skb_checksum_help(skb))
 			goto drop;
 			goto drop;
-		if (skb_array_produce(&q->skb_array, skb))
+		if (ptr_ring_produce(&q->ring, skb))
 			goto drop;
 			goto drop;
 	}
 	}
 
 
@@ -497,7 +497,7 @@ static void tap_sock_destruct(struct sock *sk)
 {
 {
 	struct tap_queue *q = container_of(sk, struct tap_queue, sk);
 	struct tap_queue *q = container_of(sk, struct tap_queue, sk);
 
 
-	skb_array_cleanup(&q->skb_array);
+	ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb);
 }
 }
 
 
 static int tap_open(struct inode *inode, struct file *file)
 static int tap_open(struct inode *inode, struct file *file)
@@ -517,7 +517,7 @@ static int tap_open(struct inode *inode, struct file *file)
 					     &tap_proto, 0);
 					     &tap_proto, 0);
 	if (!q)
 	if (!q)
 		goto err;
 		goto err;
-	if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL)) {
+	if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) {
 		sk_free(&q->sk);
 		sk_free(&q->sk);
 		goto err;
 		goto err;
 	}
 	}
@@ -546,7 +546,7 @@ static int tap_open(struct inode *inode, struct file *file)
 
 
 	err = tap_set_queue(tap, file, q);
 	err = tap_set_queue(tap, file, q);
 	if (err) {
 	if (err) {
-		/* tap_sock_destruct() will take care of freeing skb_array */
+		/* tap_sock_destruct() will take care of freeing ptr_ring */
 		goto err_put;
 		goto err_put;
 	}
 	}
 
 
@@ -583,7 +583,7 @@ static unsigned int tap_poll(struct file *file, poll_table *wait)
 	mask = 0;
 	mask = 0;
 	poll_wait(file, &q->wq.wait, wait);
 	poll_wait(file, &q->wq.wait, wait);
 
 
-	if (!skb_array_empty(&q->skb_array))
+	if (!ptr_ring_empty(&q->ring))
 		mask |= POLLIN | POLLRDNORM;
 		mask |= POLLIN | POLLRDNORM;
 
 
 	if (sock_writeable(&q->sk) ||
 	if (sock_writeable(&q->sk) ||
@@ -844,7 +844,7 @@ static ssize_t tap_do_read(struct tap_queue *q,
 					TASK_INTERRUPTIBLE);
 					TASK_INTERRUPTIBLE);
 
 
 		/* Read frames from the queue */
 		/* Read frames from the queue */
-		skb = skb_array_consume(&q->skb_array);
+		skb = ptr_ring_consume(&q->ring);
 		if (skb)
 		if (skb)
 			break;
 			break;
 		if (noblock) {
 		if (noblock) {
@@ -1176,7 +1176,7 @@ static int tap_peek_len(struct socket *sock)
 {
 {
 	struct tap_queue *q = container_of(sock, struct tap_queue,
 	struct tap_queue *q = container_of(sock, struct tap_queue,
 					       sock);
 					       sock);
-	return skb_array_peek_len(&q->skb_array);
+	return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag);
 }
 }
 
 
 /* Ops structure to mimic raw sockets with tun */
 /* Ops structure to mimic raw sockets with tun */
@@ -1202,7 +1202,7 @@ struct socket *tap_get_socket(struct file *file)
 }
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
 
-struct skb_array *tap_get_skb_array(struct file *file)
+struct ptr_ring *tap_get_ptr_ring(struct file *file)
 {
 {
 	struct tap_queue *q;
 	struct tap_queue *q;
 
 
@@ -1211,29 +1211,30 @@ struct skb_array *tap_get_skb_array(struct file *file)
 	q = file->private_data;
 	q = file->private_data;
 	if (!q)
 	if (!q)
 		return ERR_PTR(-EBADFD);
 		return ERR_PTR(-EBADFD);
-	return &q->skb_array;
+	return &q->ring;
 }
 }
-EXPORT_SYMBOL_GPL(tap_get_skb_array);
+EXPORT_SYMBOL_GPL(tap_get_ptr_ring);
 
 
 int tap_queue_resize(struct tap_dev *tap)
 int tap_queue_resize(struct tap_dev *tap)
 {
 {
 	struct net_device *dev = tap->dev;
 	struct net_device *dev = tap->dev;
 	struct tap_queue *q;
 	struct tap_queue *q;
-	struct skb_array **arrays;
+	struct ptr_ring **rings;
 	int n = tap->numqueues;
 	int n = tap->numqueues;
 	int ret, i = 0;
 	int ret, i = 0;
 
 
-	arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
-	if (!arrays)
+	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
+	if (!rings)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
 	list_for_each_entry(q, &tap->queue_list, next)
 	list_for_each_entry(q, &tap->queue_list, next)
-		arrays[i++] = &q->skb_array;
+		rings[i++] = &q->ring;
 
 
-	ret = skb_array_resize_multiple(arrays, n,
-					dev->tx_queue_len, GFP_KERNEL);
+	ret = ptr_ring_resize_multiple(rings, n,
+				       dev->tx_queue_len, GFP_KERNEL,
+				       __skb_array_destroy_skb);
 
 
-	kfree(arrays);
+	kfree(rings);
 	return ret;
 	return ret;
 }
 }
 EXPORT_SYMBOL_GPL(tap_queue_resize);
 EXPORT_SYMBOL_GPL(tap_queue_resize);

+ 194 - 45
drivers/net/tun.c

@@ -179,7 +179,7 @@ struct tun_file {
 	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct list_head next;
 	struct tun_struct *detached;
 	struct tun_struct *detached;
-	struct skb_array tx_array;
+	struct ptr_ring tx_ring;
 	struct xdp_rxq_info xdp_rxq;
 	struct xdp_rxq_info xdp_rxq;
 };
 };
 
 
@@ -241,6 +241,24 @@ struct tun_struct {
 	struct tun_steering_prog __rcu *steering_prog;
 	struct tun_steering_prog __rcu *steering_prog;
 };
 };
 
 
+bool tun_is_xdp_buff(void *ptr)
+{
+	return (unsigned long)ptr & TUN_XDP_FLAG;
+}
+EXPORT_SYMBOL(tun_is_xdp_buff);
+
+void *tun_xdp_to_ptr(void *ptr)
+{
+	return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
+}
+EXPORT_SYMBOL(tun_xdp_to_ptr);
+
+void *tun_ptr_to_xdp(void *ptr)
+{
+	return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
+}
+EXPORT_SYMBOL(tun_ptr_to_xdp);
+
 static int tun_napi_receive(struct napi_struct *napi, int budget)
 static int tun_napi_receive(struct napi_struct *napi, int budget)
 {
 {
 	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
 	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
@@ -631,12 +649,25 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 	return tun;
 	return tun;
 }
 }
 
 
+static void tun_ptr_free(void *ptr)
+{
+	if (!ptr)
+		return;
+	if (tun_is_xdp_buff(ptr)) {
+		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+
+		put_page(virt_to_head_page(xdp->data));
+	} else {
+		__skb_array_destroy_skb(ptr);
+	}
+}
+
 static void tun_queue_purge(struct tun_file *tfile)
 static void tun_queue_purge(struct tun_file *tfile)
 {
 {
-	struct sk_buff *skb;
+	void *ptr;
 
 
-	while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
-		kfree_skb(skb);
+	while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
+		tun_ptr_free(ptr);
 
 
 	skb_queue_purge(&tfile->sk.sk_write_queue);
 	skb_queue_purge(&tfile->sk.sk_write_queue);
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 	skb_queue_purge(&tfile->sk.sk_error_queue);
@@ -689,7 +720,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 				unregister_netdevice(tun->dev);
 				unregister_netdevice(tun->dev);
 		}
 		}
 		if (tun) {
 		if (tun) {
-			skb_array_cleanup(&tfile->tx_array);
+			ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 			xdp_rxq_info_unreg(&tfile->xdp_rxq);
 			xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		}
 		}
 		sock_put(&tfile->sk);
 		sock_put(&tfile->sk);
@@ -782,7 +813,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 	}
 	}
 
 
 	if (!tfile->detached &&
 	if (!tfile->detached &&
-	    skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
+	    ptr_ring_init(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL)) {
 		err = -ENOMEM;
 		err = -ENOMEM;
 		goto out;
 		goto out;
 	}
 	}
@@ -1048,7 +1079,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 
 	nf_reset(skb);
 	nf_reset(skb);
 
 
-	if (skb_array_produce(&tfile->tx_array, skb))
+	if (ptr_ring_produce(&tfile->tx_ring, skb))
 		goto drop;
 		goto drop;
 
 
 	/* Notify and wake up reader process */
 	/* Notify and wake up reader process */
@@ -1221,6 +1252,67 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 };
 
 
+static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct xdp_buff *buff = xdp->data_hard_start;
+	int headroom = xdp->data - xdp->data_hard_start;
+	struct tun_file *tfile;
+	u32 numqueues;
+	int ret = 0;
+
+	/* Assure headroom is available and buff is properly aligned */
+	if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
+		return -ENOSPC;
+
+	*buff = *xdp;
+
+	rcu_read_lock();
+
+	numqueues = READ_ONCE(tun->numqueues);
+	if (!numqueues) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
+					    numqueues]);
+	/* Encode the XDP flag into lowest bit for consumer to differ
+	 * XDP buffer from sk_buff.
+	 */
+	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) {
+		this_cpu_inc(tun->pcpu_stats->tx_dropped);
+		ret = -ENOSPC;
+	}
+
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+static void tun_xdp_flush(struct net_device *dev)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
+	u32 numqueues;
+
+	rcu_read_lock();
+
+	numqueues = READ_ONCE(tun->numqueues);
+	if (!numqueues)
+		goto out;
+
+	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
+					    numqueues]);
+	/* Notify and wake up reader process */
+	if (tfile->flags & TUN_FASYNC)
+		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
+	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
+
+out:
+	rcu_read_unlock();
+}
+
 static const struct net_device_ops tap_netdev_ops = {
 static const struct net_device_ops tap_netdev_ops = {
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_open		= tun_net_open,
 	.ndo_open		= tun_net_open,
@@ -1238,6 +1330,8 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_set_rx_headroom	= tun_set_headroom,
 	.ndo_set_rx_headroom	= tun_set_headroom,
 	.ndo_get_stats64	= tun_net_get_stats64,
 	.ndo_get_stats64	= tun_net_get_stats64,
 	.ndo_bpf		= tun_xdp,
 	.ndo_bpf		= tun_xdp,
+	.ndo_xdp_xmit		= tun_xdp_xmit,
+	.ndo_xdp_flush		= tun_xdp_flush,
 };
 };
 
 
 static void tun_flow_init(struct tun_struct *tun)
 static void tun_flow_init(struct tun_struct *tun)
@@ -1316,7 +1410,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 
 
 	poll_wait(file, sk_sleep(sk), wait);
 	poll_wait(file, sk_sleep(sk), wait);
 
 
-	if (!skb_array_empty(&tfile->tx_array))
+	if (!ptr_ring_empty(&tfile->tx_ring))
 		mask |= POLLIN | POLLRDNORM;
 		mask |= POLLIN | POLLRDNORM;
 
 
 	if (tun->dev->flags & IFF_UP &&
 	if (tun->dev->flags & IFF_UP &&
@@ -1862,6 +1956,40 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	return result;
 	return result;
 }
 }
 
 
+static ssize_t tun_put_user_xdp(struct tun_struct *tun,
+				struct tun_file *tfile,
+				struct xdp_buff *xdp,
+				struct iov_iter *iter)
+{
+	int vnet_hdr_sz = 0;
+	size_t size = xdp->data_end - xdp->data;
+	struct tun_pcpu_stats *stats;
+	size_t ret;
+
+	if (tun->flags & IFF_VNET_HDR) {
+		struct virtio_net_hdr gso = { 0 };
+
+		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
+		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
+			return -EINVAL;
+		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
+			     sizeof(gso)))
+			return -EFAULT;
+		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
+	}
+
+	ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz;
+
+	stats = get_cpu_ptr(tun->pcpu_stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->tx_packets++;
+	stats->tx_bytes += ret;
+	u64_stats_update_end(&stats->syncp);
+	put_cpu_ptr(tun->pcpu_stats);
+
+	return ret;
+}
+
 /* Put packet to the user space buffer */
 /* Put packet to the user space buffer */
 static ssize_t tun_put_user(struct tun_struct *tun,
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
 			    struct tun_file *tfile,
@@ -1959,15 +2087,14 @@ done:
 	return total;
 	return total;
 }
 }
 
 
-static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
-				     int *err)
+static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 {
 {
 	DECLARE_WAITQUEUE(wait, current);
 	DECLARE_WAITQUEUE(wait, current);
-	struct sk_buff *skb = NULL;
+	void *ptr = NULL;
 	int error = 0;
 	int error = 0;
 
 
-	skb = skb_array_consume(&tfile->tx_array);
-	if (skb)
+	ptr = ptr_ring_consume(&tfile->tx_ring);
+	if (ptr)
 		goto out;
 		goto out;
 	if (noblock) {
 	if (noblock) {
 		error = -EAGAIN;
 		error = -EAGAIN;
@@ -1978,8 +2105,8 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
 	current->state = TASK_INTERRUPTIBLE;
 	current->state = TASK_INTERRUPTIBLE;
 
 
 	while (1) {
 	while (1) {
-		skb = skb_array_consume(&tfile->tx_array);
-		if (skb)
+		ptr = ptr_ring_consume(&tfile->tx_ring);
+		if (ptr)
 			break;
 			break;
 		if (signal_pending(current)) {
 		if (signal_pending(current)) {
 			error = -ERESTARTSYS;
 			error = -ERESTARTSYS;
@@ -1998,12 +2125,12 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
 
 
 out:
 out:
 	*err = error;
 	*err = error;
-	return skb;
+	return ptr;
 }
 }
 
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			   struct iov_iter *to,
 			   struct iov_iter *to,
-			   int noblock, struct sk_buff *skb)
+			   int noblock, void *ptr)
 {
 {
 	ssize_t ret;
 	ssize_t ret;
 	int err;
 	int err;
@@ -2011,23 +2138,31 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
 
 	if (!iov_iter_count(to)) {
 	if (!iov_iter_count(to)) {
-		if (skb)
-			kfree_skb(skb);
+		tun_ptr_free(ptr);
 		return 0;
 		return 0;
 	}
 	}
 
 
-	if (!skb) {
+	if (!ptr) {
 		/* Read frames from ring */
 		/* Read frames from ring */
-		skb = tun_ring_recv(tfile, noblock, &err);
-		if (!skb)
+		ptr = tun_ring_recv(tfile, noblock, &err);
+		if (!ptr)
 			return err;
 			return err;
 	}
 	}
 
 
-	ret = tun_put_user(tun, tfile, skb, to);
-	if (unlikely(ret < 0))
-		kfree_skb(skb);
-	else
-		consume_skb(skb);
+	if (tun_is_xdp_buff(ptr)) {
+		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+
+		ret = tun_put_user_xdp(tun, tfile, xdp, to);
+		put_page(virt_to_head_page(xdp->data));
+	} else {
+		struct sk_buff *skb = ptr;
+
+		ret = tun_put_user(tun, tfile, skb, to);
+		if (unlikely(ret < 0))
+			kfree_skb(skb);
+		else
+			consume_skb(skb);
+	}
 
 
 	return ret;
 	return ret;
 }
 }
@@ -2164,12 +2299,12 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
 {
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = tun_get(tfile);
 	struct tun_struct *tun = tun_get(tfile);
-	struct sk_buff *skb = m->msg_control;
+	void *ptr = m->msg_control;
 	int ret;
 	int ret;
 
 
 	if (!tun) {
 	if (!tun) {
 		ret = -EBADFD;
 		ret = -EBADFD;
-		goto out_free_skb;
+		goto out_free;
 	}
 	}
 
 
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
@@ -2181,7 +2316,7 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 		goto out;
 	}
 	}
-	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb);
+	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
 	if (ret > (ssize_t)total_len) {
 	if (ret > (ssize_t)total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
 		ret = flags & MSG_TRUNC ? ret : total_len;
@@ -2192,12 +2327,25 @@ out:
 
 
 out_put_tun:
 out_put_tun:
 	tun_put(tun);
 	tun_put(tun);
-out_free_skb:
-	if (skb)
-		kfree_skb(skb);
+out_free:
+	tun_ptr_free(ptr);
 	return ret;
 	return ret;
 }
 }
 
 
+static int tun_ptr_peek_len(void *ptr)
+{
+	if (likely(ptr)) {
+		if (tun_is_xdp_buff(ptr)) {
+			struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+
+			return xdp->data_end - xdp->data;
+		}
+		return __skb_array_len_with_tag(ptr);
+	} else {
+		return 0;
+	}
+}
+
 static int tun_peek_len(struct socket *sock)
 static int tun_peek_len(struct socket *sock)
 {
 {
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
@@ -2208,7 +2356,7 @@ static int tun_peek_len(struct socket *sock)
 	if (!tun)
 	if (!tun)
 		return 0;
 		return 0;
 
 
-	ret = skb_array_peek_len(&tfile->tx_array);
+	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
 	tun_put(tun);
 	tun_put(tun);
 
 
 	return ret;
 	return ret;
@@ -3114,25 +3262,26 @@ static int tun_queue_resize(struct tun_struct *tun)
 {
 {
 	struct net_device *dev = tun->dev;
 	struct net_device *dev = tun->dev;
 	struct tun_file *tfile;
 	struct tun_file *tfile;
-	struct skb_array **arrays;
+	struct ptr_ring **rings;
 	int n = tun->numqueues + tun->numdisabled;
 	int n = tun->numqueues + tun->numdisabled;
 	int ret, i;
 	int ret, i;
 
 
-	arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
-	if (!arrays)
+	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
+	if (!rings)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
 	for (i = 0; i < tun->numqueues; i++) {
 	for (i = 0; i < tun->numqueues; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
 		tfile = rtnl_dereference(tun->tfiles[i]);
-		arrays[i] = &tfile->tx_array;
+		rings[i] = &tfile->tx_ring;
 	}
 	}
 	list_for_each_entry(tfile, &tun->disabled, next)
 	list_for_each_entry(tfile, &tun->disabled, next)
-		arrays[i++] = &tfile->tx_array;
+		rings[i++] = &tfile->tx_ring;
 
 
-	ret = skb_array_resize_multiple(arrays, n,
-					dev->tx_queue_len, GFP_KERNEL);
+	ret = ptr_ring_resize_multiple(rings, n,
+				       dev->tx_queue_len, GFP_KERNEL,
+				       tun_ptr_free);
 
 
-	kfree(arrays);
+	kfree(rings);
 	return ret;
 	return ret;
 }
 }
 
 
@@ -3218,7 +3367,7 @@ struct socket *tun_get_socket(struct file *file)
 }
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
 
-struct skb_array *tun_get_skb_array(struct file *file)
+struct ptr_ring *tun_get_tx_ring(struct file *file)
 {
 {
 	struct tun_file *tfile;
 	struct tun_file *tfile;
 
 
@@ -3227,9 +3376,9 @@ struct skb_array *tun_get_skb_array(struct file *file)
 	tfile = file->private_data;
 	tfile = file->private_data;
 	if (!tfile)
 	if (!tfile)
 		return ERR_PTR(-EBADFD);
 		return ERR_PTR(-EBADFD);
-	return &tfile->tx_array;
+	return &tfile->tx_ring;
 }
 }
-EXPORT_SYMBOL_GPL(tun_get_skb_array);
+EXPORT_SYMBOL_GPL(tun_get_tx_ring);
 
 
 module_init(tun_init);
 module_init(tun_init);
 module_exit(tun_cleanup);
 module_exit(tun_cleanup);

+ 32 - 20
drivers/vhost/net.c

@@ -89,7 +89,7 @@ struct vhost_net_ubuf_ref {
 
 
 #define VHOST_RX_BATCH 64
 #define VHOST_RX_BATCH 64
 struct vhost_net_buf {
 struct vhost_net_buf {
-	struct sk_buff **queue;
+	void **queue;
 	int tail;
 	int tail;
 	int head;
 	int head;
 };
 };
@@ -108,7 +108,7 @@ struct vhost_net_virtqueue {
 	/* Reference counting for outstanding ubufs.
 	/* Reference counting for outstanding ubufs.
 	 * Protected by vq mutex. Writers must also take device mutex. */
 	 * Protected by vq mutex. Writers must also take device mutex. */
 	struct vhost_net_ubuf_ref *ubufs;
 	struct vhost_net_ubuf_ref *ubufs;
-	struct skb_array *rx_array;
+	struct ptr_ring *rx_ring;
 	struct vhost_net_buf rxq;
 	struct vhost_net_buf rxq;
 };
 };
 
 
@@ -158,7 +158,7 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
 	struct vhost_net_buf *rxq = &nvq->rxq;
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 
 	rxq->head = 0;
 	rxq->head = 0;
-	rxq->tail = skb_array_consume_batched(nvq->rx_array, rxq->queue,
+	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
 					      VHOST_RX_BATCH);
 					      VHOST_RX_BATCH);
 	return rxq->tail;
 	return rxq->tail;
 }
 }
@@ -167,13 +167,25 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 {
 {
 	struct vhost_net_buf *rxq = &nvq->rxq;
 	struct vhost_net_buf *rxq = &nvq->rxq;
 
 
-	if (nvq->rx_array && !vhost_net_buf_is_empty(rxq)) {
-		skb_array_unconsume(nvq->rx_array, rxq->queue + rxq->head,
-				    vhost_net_buf_get_size(rxq));
+	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
+		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
+				   vhost_net_buf_get_size(rxq),
+				   __skb_array_destroy_skb);
 		rxq->head = rxq->tail = 0;
 		rxq->head = rxq->tail = 0;
 	}
 	}
 }
 }
 
 
+static int vhost_net_buf_peek_len(void *ptr)
+{
+	if (tun_is_xdp_buff(ptr)) {
+		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+
+		return xdp->data_end - xdp->data;
+	}
+
+	return __skb_array_len_with_tag(ptr);
+}
+
 static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
 static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
 {
 {
 	struct vhost_net_buf *rxq = &nvq->rxq;
 	struct vhost_net_buf *rxq = &nvq->rxq;
@@ -185,7 +197,7 @@ static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
 		return 0;
 		return 0;
 
 
 out:
 out:
-	return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
+	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
 }
 }
 
 
 static void vhost_net_buf_init(struct vhost_net_buf *rxq)
 static void vhost_net_buf_init(struct vhost_net_buf *rxq)
@@ -583,7 +595,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
 	int len = 0;
 	int len = 0;
 	unsigned long flags;
 	unsigned long flags;
 
 
-	if (rvq->rx_array)
+	if (rvq->rx_ring)
 		return vhost_net_buf_peek(rvq);
 		return vhost_net_buf_peek(rvq);
 
 
 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
@@ -790,7 +802,7 @@ static void handle_rx(struct vhost_net *net)
 			 * they refilled. */
 			 * they refilled. */
 			goto out;
 			goto out;
 		}
 		}
-		if (nvq->rx_array)
+		if (nvq->rx_ring)
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
 		/* On overrun, truncate and discard */
 		/* On overrun, truncate and discard */
 		if (unlikely(headcount > UIO_MAXIOV)) {
 		if (unlikely(headcount > UIO_MAXIOV)) {
@@ -896,7 +908,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	struct vhost_net *n;
 	struct vhost_net *n;
 	struct vhost_dev *dev;
 	struct vhost_dev *dev;
 	struct vhost_virtqueue **vqs;
 	struct vhost_virtqueue **vqs;
-	struct sk_buff **queue;
+	void **queue;
 	int i;
 	int i;
 
 
 	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
@@ -908,7 +920,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
 
 
-	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(struct sk_buff *),
+	queue = kmalloc_array(VHOST_RX_BATCH, sizeof(void *),
 			      GFP_KERNEL);
 			      GFP_KERNEL);
 	if (!queue) {
 	if (!queue) {
 		kfree(vqs);
 		kfree(vqs);
@@ -1046,23 +1058,23 @@ err:
 	return ERR_PTR(r);
 	return ERR_PTR(r);
 }
 }
 
 
-static struct skb_array *get_tap_skb_array(int fd)
+static struct ptr_ring *get_tap_ptr_ring(int fd)
 {
 {
-	struct skb_array *array;
+	struct ptr_ring *ring;
 	struct file *file = fget(fd);
 	struct file *file = fget(fd);
 
 
 	if (!file)
 	if (!file)
 		return NULL;
 		return NULL;
-	array = tun_get_skb_array(file);
-	if (!IS_ERR(array))
+	ring = tun_get_tx_ring(file);
+	if (!IS_ERR(ring))
 		goto out;
 		goto out;
-	array = tap_get_skb_array(file);
-	if (!IS_ERR(array))
+	ring = tap_get_ptr_ring(file);
+	if (!IS_ERR(ring))
 		goto out;
 		goto out;
-	array = NULL;
+	ring = NULL;
 out:
 out:
 	fput(file);
 	fput(file);
-	return array;
+	return ring;
 }
 }
 
 
 static struct socket *get_tap_socket(int fd)
 static struct socket *get_tap_socket(int fd)
@@ -1143,7 +1155,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 		vq->private_data = sock;
 		vq->private_data = sock;
 		vhost_net_buf_unproduce(nvq);
 		vhost_net_buf_unproduce(nvq);
 		if (index == VHOST_NET_VQ_RX)
 		if (index == VHOST_NET_VQ_RX)
-			nvq->rx_array = get_tap_skb_array(fd);
+			nvq->rx_ring = get_tap_ptr_ring(fd);
 		r = vhost_vq_init_access(vq);
 		r = vhost_vq_init_access(vq);
 		if (r)
 		if (r)
 			goto err_used;
 			goto err_used;

+ 3 - 3
include/linux/if_tap.h

@@ -4,7 +4,7 @@
 
 
 #if IS_ENABLED(CONFIG_TAP)
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
 struct socket *tap_get_socket(struct file *);
-struct skb_array *tap_get_skb_array(struct file *file);
+struct ptr_ring *tap_get_ptr_ring(struct file *file);
 #else
 #else
 #include <linux/err.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/errno.h>
@@ -14,7 +14,7 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 {
 	return ERR_PTR(-EINVAL);
 	return ERR_PTR(-EINVAL);
 }
 }
-static inline struct skb_array *tap_get_skb_array(struct file *f)
+static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
 {
 {
 	return ERR_PTR(-EINVAL);
 	return ERR_PTR(-EINVAL);
 }
 }
@@ -70,7 +70,7 @@ struct tap_queue {
 	u16 queue_index;
 	u16 queue_index;
 	bool enabled;
 	bool enabled;
 	struct list_head next;
 	struct list_head next;
-	struct skb_array skb_array;
+	struct ptr_ring ring;
 };
 };
 
 
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);
 rx_handler_result_t tap_handle_frame(struct sk_buff **pskb);

+ 19 - 2
include/linux/if_tun.h

@@ -17,9 +17,14 @@
 
 
 #include <uapi/linux/if_tun.h>
 #include <uapi/linux/if_tun.h>
 
 
+#define TUN_XDP_FLAG 0x1UL
+
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct socket *tun_get_socket(struct file *);
-struct skb_array *tun_get_skb_array(struct file *file);
+struct ptr_ring *tun_get_tx_ring(struct file *file);
+bool tun_is_xdp_buff(void *ptr);
+void *tun_xdp_to_ptr(void *ptr);
+void *tun_ptr_to_xdp(void *ptr);
 #else
 #else
 #include <linux/err.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/errno.h>
@@ -29,9 +34,21 @@ static inline struct socket *tun_get_socket(struct file *f)
 {
 {
 	return ERR_PTR(-EINVAL);
 	return ERR_PTR(-EINVAL);
 }
 }
-static inline struct skb_array *tun_get_skb_array(struct file *f)
+static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 {
 	return ERR_PTR(-EINVAL);
 	return ERR_PTR(-EINVAL);
 }
 }
+static inline bool tun_is_xdp_buff(void *ptr)
+{
+	return false;
+}
+static inline void *tun_xdp_to_ptr(void *ptr)
+{
+	return NULL;
+}
+static inline void *tun_ptr_to_xdp(void *ptr)
+{
+	return NULL;
+}
 #endif /* CONFIG_TUN */
 #endif /* CONFIG_TUN */
 #endif /* __IF_TUN_H */
 #endif /* __IF_TUN_H */