@@ -116,6 +116,8 @@ struct vhost_net_virtqueue {
	 * For RX, number of batched heads
	 */
	int done_idx;
+	/* Number of XDP frames batched */
+	int batched_xdp;
	/* an array of userspace buffers info */
	struct ubuf_info *ubuf_info;
	/* Reference counting for outstanding ubufs.
@@ -123,6 +125,8 @@ struct vhost_net_virtqueue {
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
+	/* Batched XDP buffs */
+	struct xdp_buff *xdp;
 };
 
 struct vhost_net {
@@ -338,6 +342,11 @@ static bool vhost_sock_zcopy(struct socket *sock)
		sock_flag(sock->sk, SOCK_ZEROCOPY);
 }
 
+static bool vhost_sock_xdp(struct socket *sock)
+{
+	return sock_flag(sock->sk, SOCK_XDP);
+}
+
 /* In case of DMA done not in order in lower device driver for some reason.
  * upend_idx is used to track end of used idx, done_idx is used to track head
  * of used idx. Once lower device DMA done contiguously, we will signal KVM
@@ -444,10 +453,37 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
	nvq->done_idx = 0;
 }
 
+static void vhost_tx_batch(struct vhost_net *net,
+			   struct vhost_net_virtqueue *nvq,
+			   struct socket *sock,
+			   struct msghdr *msghdr)
+{
+	struct tun_msg_ctl ctl = {
+		.type = TUN_MSG_PTR,
+		.num = nvq->batched_xdp,
+		.ptr = nvq->xdp,
+	};
+	int err;
+
+	if (nvq->batched_xdp == 0)
+		goto signal_used;
+
+	msghdr->msg_control = &ctl;
+	err = sock->ops->sendmsg(sock, msghdr, 0);
+	if (unlikely(err < 0)) {
+		vq_err(&nvq->vq, "Fail to batch sending packets\n");
+		return;
+	}
+
+signal_used:
+	vhost_net_signal_used(nvq);
+	nvq->batched_xdp = 0;
+}
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
				    struct vhost_net_virtqueue *nvq,
				    unsigned int *out_num, unsigned int *in_num,
-				    bool *busyloop_intr)
+				    struct msghdr *msghdr, bool *busyloop_intr)
 {
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned long uninitialized_var(endtime);
@@ -455,8 +491,9 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
			      out_num, in_num, NULL, NULL);
 
	if (r == vq->num && vq->busyloop_timeout) {
+		/* Flush batched packets first */
		if (!vhost_sock_zcopy(vq->private_data))
-			vhost_net_signal_used(nvq);
+			vhost_tx_batch(net, nvq, vq->private_data, msghdr);
		preempt_disable();
		endtime = busy_clock() + vq->busyloop_timeout;
		while (vhost_can_busy_poll(endtime)) {
@@ -512,7 +549,7 @@ static int get_tx_bufs(struct vhost_net *net,
	struct vhost_virtqueue *vq = &nvq->vq;
	int ret;
 
-	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, busyloop_intr);
+	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
 
	if (ret < 0 || ret == vq->num)
		return ret;
@@ -540,6 +577,80 @@ static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
	       !vhost_vq_avail_empty(vq->dev, vq);
 }
 
+#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
+static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
+			       struct iov_iter *from)
+{
+	struct vhost_virtqueue *vq = &nvq->vq;
+	struct socket *sock = vq->private_data;
+	struct page_frag *alloc_frag = &current->task_frag;
+	struct virtio_net_hdr *gso;
+	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
+	struct tun_xdp_hdr *hdr;
+	size_t len = iov_iter_count(from);
+	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
+	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
+	int sock_hlen = nvq->sock_hlen;
+	void *buf;
+	int copied;
+
+	if (unlikely(len < nvq->sock_hlen))
+		return -EFAULT;
+
+	if (SKB_DATA_ALIGN(len + pad) +
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+		return -ENOSPC;
+
+	buflen += SKB_DATA_ALIGN(len + pad);
+	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+		return -ENOMEM;
+
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset +
+				     offsetof(struct tun_xdp_hdr, gso),
+				     sock_hlen, from);
+	if (copied != sock_hlen)
+		return -EFAULT;
+
+	hdr = buf;
+	gso = &hdr->gso;
+
+	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+	    vhost16_to_cpu(vq, gso->csum_start) +
+	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
+	    vhost16_to_cpu(vq, gso->hdr_len)) {
+		gso->hdr_len = cpu_to_vhost16(vq,
+			       vhost16_to_cpu(vq, gso->csum_start) +
+			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
+
+		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
+			return -EINVAL;
+	}
+
+	len -= sock_hlen;
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset + pad,
+				     len, from);
+	if (copied != len)
+		return -EFAULT;
+
+	xdp->data_hard_start = buf;
+	xdp->data = buf + pad;
+	xdp->data_end = xdp->data + len;
+	hdr->buflen = buflen;
+
+	get_page(alloc_frag->page);
+	alloc_frag->offset += buflen;
+
+	++nvq->batched_xdp;
+
+	return 0;
+}
+
 static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
 {
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
@@ -556,10 +667,14 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
	size_t len, total_len = 0;
	int err;
	int sent_pkts = 0;
+	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
 
	for (;;) {
		bool busyloop_intr = false;
 
+		if (nvq->done_idx == VHOST_NET_BATCH)
+			vhost_tx_batch(net, nvq, sock, &msg);
+
		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
@@ -577,14 +692,34 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
			break;
		}
 
-		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
-		vq->heads[nvq->done_idx].len = 0;
-
		total_len += len;
-		if (tx_can_batch(vq, total_len))
-			msg.msg_flags |= MSG_MORE;
-		else
-			msg.msg_flags &= ~MSG_MORE;
+
+		/* For simplicity, TX batching is only enabled if
+		 * sndbuf is unlimited.
+		 */
+		if (sock_can_batch) {
+			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
+			if (!err) {
+				goto done;
+			} else if (unlikely(err != -ENOSPC)) {
+				vhost_tx_batch(net, nvq, sock, &msg);
+				vhost_discard_vq_desc(vq, 1);
+				vhost_net_enable_vq(net, vq);
+				break;
+			}
+
+			/* We can't build XDP buff, go for single
+			 * packet path but let's flush batched
+			 * packets.
+			 */
+			vhost_tx_batch(net, nvq, sock, &msg);
+			msg.msg_control = NULL;
+		} else {
+			if (tx_can_batch(vq, total_len))
+				msg.msg_flags |= MSG_MORE;
+			else
+				msg.msg_flags &= ~MSG_MORE;
+		}
 
		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(sock, &msg, len);
@@ -596,15 +731,17 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
		if (err != len)
			pr_debug("Truncated TX packet: len %d != %zd\n",
				 err, len);
-		if (++nvq->done_idx >= VHOST_NET_BATCH)
-			vhost_net_signal_used(nvq);
+done:
+		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
+		vq->heads[nvq->done_idx].len = 0;
+		++nvq->done_idx;
		if (vhost_exceeds_weight(++sent_pkts, total_len)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}
 
-	vhost_net_signal_used(nvq);
+	vhost_tx_batch(net, nvq, sock, &msg);
 }
 
 static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
@@ -620,6 +757,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
+	struct tun_msg_ctl ctl;
	size_t len, total_len = 0;
	int err;
	struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -664,8 +802,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			refcount_set(&ubuf->refcnt, 1);
-			msg.msg_control = ubuf;
-			msg.msg_controllen = sizeof(ubuf);
+			msg.msg_control = &ctl;
+			ctl.type = TUN_MSG_UBUF;
+			ctl.ptr = ubuf;
+			msg.msg_controllen = sizeof(ctl);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
@@ -1078,6 +1218,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
	struct vhost_dev *dev;
	struct vhost_virtqueue **vqs;
	void **queue;
+	struct xdp_buff *xdp;
	int i;
 
	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
@@ -1098,6 +1239,15 @@ static int vhost_net_open(struct inode *inode, struct file *f)
	}
	n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;
 
+	xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
+	if (!xdp) {
+		kfree(vqs);
+		kvfree(n);
+		kfree(queue);
+		return -ENOMEM;
+	}
+	n->vqs[VHOST_NET_VQ_TX].xdp = xdp;
+
	dev = &n->dev;
	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
@@ -1108,6 +1257,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
		n->vqs[i].ubuf_info = NULL;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].done_idx = 0;
+		n->vqs[i].batched_xdp = 0;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		n->vqs[i].rx_ring = NULL;
@@ -1191,6 +1341,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
+	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
	kfree(n->dev.vqs);
	kvfree(n);
	return 0;