Sfoglia il codice sorgente

Merge branch 'RDS-zerocopy-support'

Sowmini Varadhan says:

====================
RDS: zerocopy support

This is version 3 of the series, following up on review comments for
 http://patchwork.ozlabs.org/project/netdev/list/?series=28530

Review comments addressed
Patch 4
  - fix fragile use of skb->cb[], do not set ee_code incorrectly.
Patch 5:
  - remove needless bzero of skb->cb[], consolidate err cleanup

A brief overview of this feature follows.

This patch series provides support for MSG_ZERCOCOPY
on a PF_RDS socket based on the APIs and infrastructure added
by Commit f214f915e7db ("tcp: enable MSG_ZEROCOPY")

For single threaded rds-stress testing using rds-tcp with the
ixgbe driver using 1M message sizes (-a 1M -q 1M) preliminary
results show that  there is a significant reduction in latency: about
90 usec with zerocopy, compared with 200 usec without zerocopy.

This patchset modifies the above for zerocopy in the following manner.
- if the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
- if the SO_ZEROCOPY  socket option has been set on the PF_RDS socket,
application pages sent down with rds_sendmsg are pinned. The pinning
uses the accounting infrastructure added by a91dbff551a6 ("sock: ulimit
on MSG_ZEROCOPY pages"). The message is unpinned when all references
to the message go down to 0, and the message is freed by rds_message_purge.

A multithreaded application using this infrastructure must send down
a unique 32 bit cookie as ancillary data with each sendmsg invocation.
The format of this ancillary data is described in Patch 5 of the series.
The cookie is passed up to the application on the sk_error_queue when
the message is unpinned, indicating to the application that it is now
safe to free/reuse the message buffer. The details of the completion
notification are provided in Patch 4 of this series.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller 7 anni fa
parent
commit
80c6d2b8d8

+ 3 - 0
include/linux/skbuff.h

@@ -466,6 +466,9 @@ struct ubuf_info {
 
 #define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
 
+int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
+void mm_unaccount_pinned_pages(struct mmpin *mmp);
+
 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
 struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 					struct ubuf_info *uarg);

+ 2 - 0
include/uapi/linux/errqueue.h

@@ -20,11 +20,13 @@ struct sock_extended_err {
 #define SO_EE_ORIGIN_ICMP6	3
 #define SO_EE_ORIGIN_TXSTATUS	4
 #define SO_EE_ORIGIN_ZEROCOPY	5
+#define SO_EE_ORIGIN_ZCOOKIE	6
 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
 
 #define SO_EE_OFFENDER(ee)	((struct sockaddr*)((ee)+1))
 
 #define SO_EE_CODE_ZEROCOPY_COPIED	1
+#define	SO_EE_ORIGIN_MAX_ZCOOKIES	8
 
 /**
  *	struct scm_timestamping - timestamps exposed through cmsg

+ 1 - 0
include/uapi/linux/rds.h

@@ -103,6 +103,7 @@
 #define RDS_CMSG_MASKED_ATOMIC_FADD	8
 #define RDS_CMSG_MASKED_ATOMIC_CSWP	9
 #define RDS_CMSG_RXPATH_LATENCY		11
+#define	RDS_CMSG_ZCOPY_COOKIE		12
 
 #define RDS_INFO_FIRST			10000
 #define RDS_INFO_COUNTERS		10000

+ 4 - 2
net/core/skbuff.c

@@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
-static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
 {
 	unsigned long max_pg, num_pg, new_pg, old_pg;
 	struct user_struct *user;
@@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
 
-static void mm_unaccount_pinned_pages(struct mmpin *mmp)
+void mm_unaccount_pinned_pages(struct mmpin *mmp)
 {
 	if (mmp->user) {
 		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
 		free_uid(mmp->user);
 	}
 }
+EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
 
 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 {

+ 14 - 11
net/core/sock.c

@@ -1049,18 +1049,21 @@ set_rcvbuf:
 		break;
 
 	case SO_ZEROCOPY:
-		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
+			if (sk->sk_protocol != IPPROTO_TCP)
+				ret = -ENOTSUPP;
+			else if (sk->sk_state != TCP_CLOSE)
+				ret = -EBUSY;
+		} else if (sk->sk_family != PF_RDS) {
 			ret = -ENOTSUPP;
-		else if (sk->sk_protocol != IPPROTO_TCP)
-			ret = -ENOTSUPP;
-		else if (sk->sk_state != TCP_CLOSE)
-			ret = -EBUSY;
-		else if (val < 0 || val > 1)
-			ret = -EINVAL;
-		else
-			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
-		break;
-
+		}
+		if (!ret) {
+			if (val < 0 || val > 1)
+				ret = -EINVAL;
+			else
+				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+			break;
+		}
 	default:
 		ret = -ENOPROTOOPT;
 		break;

+ 2 - 0
net/rds/af_rds.c

@@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
 		mask |= (EPOLLIN | EPOLLRDNORM);
 	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
 		mask |= (EPOLLOUT | EPOLLWRNORM);
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		mask |= POLLERR;
 	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 
 	/* clear state any time we wake a seen-congested socket */

+ 128 - 4
net/rds/message.c

@@ -33,6 +33,9 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/errqueue.h>
 
 #include "rds.h"
 
@@ -53,20 +56,92 @@ void rds_message_addref(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_addref);
 
+static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
+{
+	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+	int ncookies;
+	u32 *ptr;
+
+	if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE)
+		return false;
+	ncookies = serr->ee.ee_data;
+	if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES)
+		return false;
+	ptr = skb_put(skb, sizeof(u32));
+	*ptr = cookie;
+	serr->ee.ee_data = ++ncookies;
+	return true;
+}
+
+static void rds_rm_zerocopy_callback(struct rds_sock *rs,
+				     struct rds_znotifier *znotif)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	struct sk_buff *skb, *tail;
+	struct sock_exterr_skb *serr;
+	unsigned long flags;
+	struct sk_buff_head *q;
+	u32 cookie = znotif->z_cookie;
+
+	q = &sk->sk_error_queue;
+	spin_lock_irqsave(&q->lock, flags);
+	tail = skb_peek_tail(q);
+
+	if (tail && skb_zcookie_add(tail, cookie)) {
+		spin_unlock_irqrestore(&q->lock, flags);
+		mm_unaccount_pinned_pages(&znotif->z_mmp);
+		consume_skb(rds_skb_from_znotifier(znotif));
+		sk->sk_error_report(sk);
+		return;
+	}
+
+	skb = rds_skb_from_znotifier(znotif);
+	serr = SKB_EXT_ERR(skb);
+	memset(&serr->ee, 0, sizeof(serr->ee));
+	serr->ee.ee_errno = 0;
+	serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE;
+	serr->ee.ee_info = 0;
+	WARN_ON(!skb_zcookie_add(skb, cookie));
+
+	__skb_queue_tail(q, skb);
+
+	spin_unlock_irqrestore(&q->lock, flags);
+	sk->sk_error_report(sk);
+
+	mm_unaccount_pinned_pages(&znotif->z_mmp);
+}
+
 /*
  * This relies on dma_map_sg() not touching sg[].page during merging.
  */
 static void rds_message_purge(struct rds_message *rm)
 {
-	unsigned long i;
+	unsigned long i, flags;
+	bool zcopy = false;
 
 	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
 		return;
 
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
+	if (rm->m_rs) {
+		struct rds_sock *rs = rm->m_rs;
+
+		if (rm->data.op_mmp_znotifier) {
+			zcopy = true;
+			rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
+			rm->data.op_mmp_znotifier = NULL;
+		}
+		sock_put(rds_rs_to_sk(rs));
+		rm->m_rs = NULL;
+	}
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
 	for (i = 0; i < rm->data.op_nents; i++) {
-		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
 		/* XXX will have to put_page for page refs */
-		__free_page(sg_page(&rm->data.op_sg[i]));
+		if (!zcopy)
+			__free_page(sg_page(&rm->data.op_sg[i]));
+		else
+			put_page(sg_page(&rm->data.op_sg[i]));
 	}
 	rm->data.op_nents = 0;
 
@@ -266,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	return rm;
 }
 
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
+			       bool zcopy)
 {
 	unsigned long to_copy, nbytes;
 	unsigned long sg_off;
 	struct scatterlist *sg;
 	int ret = 0;
+	int length = iov_iter_count(from);
 
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
 
@@ -281,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
 	sg = rm->data.op_sg;
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
+	if (zcopy) {
+		int total_copied = 0;
+		struct sk_buff *skb;
+
+		skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32),
+				GFP_KERNEL);
+		if (!skb)
+			return -ENOMEM;
+		rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
+		if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
+					    length)) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		while (iov_iter_count(from)) {
+			struct page *pages;
+			size_t start;
+			ssize_t copied;
+
+			copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
+						    1, &start);
+			if (copied < 0) {
+				struct mmpin *mmp;
+				int i;
+
+				for (i = 0; i < rm->data.op_nents; i++)
+					put_page(sg_page(&rm->data.op_sg[i]));
+				mmp = &rm->data.op_mmp_znotifier->z_mmp;
+				mm_unaccount_pinned_pages(mmp);
+				ret = -EFAULT;
+				goto err;
+			}
+			total_copied += copied;
+			iov_iter_advance(from, copied);
+			length -= copied;
+			sg_set_page(sg, pages, copied, start);
+			rm->data.op_nents++;
+			sg++;
+		}
+		WARN_ON_ONCE(length != 0);
+		return ret;
+err:
+		consume_skb(skb);
+		rm->data.op_mmp_znotifier = NULL;
+		return ret;
+	} /* zcopy */
+
 	while (iov_iter_count(from)) {
 		if (!sg_page(sg)) {
 			ret = rds_page_remainder_alloc(sg, iov_iter_count(from),

+ 16 - 1
net/rds/rds.h

@@ -356,6 +356,19 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
 #define RDS_MSG_PAGEVEC		7
 #define RDS_MSG_FLUSH		8
 
+struct rds_znotifier {
+	struct list_head	z_list;
+	struct mmpin		z_mmp;
+	u32			z_cookie;
+};
+
+#define	RDS_ZCOPY_SKB(__skb)	((struct rds_znotifier *)&((__skb)->cb[0]))
+
+static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z)
+{
+	return container_of((void *)z, struct sk_buff, cb);
+}
+
 struct rds_message {
 	refcount_t		m_refcount;
 	struct list_head	m_sock_item;
@@ -436,6 +449,7 @@ struct rds_message {
 			unsigned int		op_count;
 			unsigned int		op_dmasg;
 			unsigned int		op_dmaoff;
+			struct rds_znotifier	*op_mmp_znotifier;
 			struct scatterlist	*op_sg;
 		} data;
 	};
@@ -771,7 +785,8 @@ rds_conn_connecting(struct rds_connection *conn)
 /* message.c */
 struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
 struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from);
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
+			       bool zcopy);
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
 				 __be16 dport, u64 seq);

+ 2 - 0
net/rds/recv.c

@@ -594,6 +594,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
 	if (msg_flags & MSG_OOB)
 		goto out;
+	if (msg_flags & MSG_ERRQUEUE)
+		return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);
 
 	while (1) {
 		/* If there are pending notifications, do those - and nothing else */

+ 39 - 12
net/rds/send.c

@@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status)
 				rm->rdma.op_notifier = NULL;
 			}
 			was_on_sock = 1;
-			rm->m_rs = NULL;
 		}
 		spin_unlock(&rs->rs_lock);
 
@@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 		 */
 		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
 			spin_unlock_irqrestore(&cp->cp_lock, flags);
-			spin_lock_irqsave(&rm->m_rs_lock, flags);
-			rm->m_rs = NULL;
-			spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 			continue;
 		}
 		list_del_init(&rm->m_conn_item);
@@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
 		spin_unlock(&rs->rs_lock);
 
-		rm->m_rs = NULL;
 		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
 		rds_message_put(rm);
@@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
 		spin_unlock(&rs->rs_lock);
 
-		rm->m_rs = NULL;
 		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
 		rds_message_put(rm);
@@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
 		list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
 		set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
 		rds_message_addref(rm);
+		sock_hold(rds_rs_to_sk(rs));
 		rm->m_rs = rs;
 
 		/* The code ordering is a little weird, but we're
@@ -880,12 +875,13 @@ out:
  * rds_message is getting to be quite complicated, and we'd like to allocate
  * it all in one go. This figures out how big it needs to be up front.
  */
-static int rds_rm_size(struct msghdr *msg, int data_len)
+static int rds_rm_size(struct msghdr *msg, int num_sgs)
 {
 	struct cmsghdr *cmsg;
 	int size = 0;
 	int cmsg_groups = 0;
 	int retval;
+	bool zcopy_cookie = false;
 
 	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
@@ -904,6 +900,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 
 			break;
 
+		case RDS_CMSG_ZCOPY_COOKIE:
+			zcopy_cookie = true;
 		case RDS_CMSG_RDMA_DEST:
 		case RDS_CMSG_RDMA_MAP:
 			cmsg_groups |= 2;
@@ -924,7 +922,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 
 	}
 
-	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+	if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie)
+		return -EINVAL;
+
+	size += num_sgs * sizeof(struct scatterlist);
 
 	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
 	if (cmsg_groups == 3)
@@ -933,6 +934,18 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 	return size;
 }
 
+static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	u32 *cookie;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)))
+		return -EINVAL;
+	cookie = CMSG_DATA(cmsg);
+	rm->data.op_mmp_znotifier->z_cookie = *cookie;
+	return 0;
+}
+
 static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			 struct msghdr *msg, int *allocated_mr)
 {
@@ -975,6 +988,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			ret = rds_cmsg_atomic(rs, rm, cmsg);
 			break;
 
+		case RDS_CMSG_ZCOPY_COOKIE:
+			ret = rds_cmsg_zcopy(rs, rm, cmsg);
+			break;
+
 		default:
 			return -EINVAL;
 		}
@@ -1045,10 +1062,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	long timeo = sock_sndtimeo(sk, nonblock);
 	struct rds_conn_path *cpath;
 	size_t total_payload_len = payload_len, rdma_payload_len = 0;
+	bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
+		      sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
+	int num_sgs = ceil(payload_len, PAGE_SIZE);
 
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
-	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -1092,8 +1112,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 		goto out;
 	}
 
+	if (zcopy) {
+		if (rs->rs_transport->t_type != RDS_TRANS_TCP) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+		num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
+	}
 	/* size of rm including all sgs */
-	ret = rds_rm_size(msg, payload_len);
+	ret = rds_rm_size(msg, num_sgs);
 	if (ret < 0)
 		goto out;
 
@@ -1105,12 +1132,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
 	/* Attach data to the rm */
 	if (payload_len) {
-		rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+		rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
 		if (!rm->data.op_sg) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		ret = rds_message_copy_from_user(rm, &msg->msg_iter);
+		ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
 		if (ret)
 			goto out;
 	}

+ 128 - 5
tools/testing/selftests/net/msg_zerocopy.c

@@ -14,6 +14,9 @@
  * - SOCK_DGRAM
  * - SOCK_RAW
  *
+ * PF_RDS
+ * - SOCK_SEQPACKET
+ *
  * Start this program on two connected hosts, one in send mode and
  * the other with option '-r' to put it in receiver mode.
  *
@@ -53,6 +56,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
+#include <linux/rds.h>
 
 #ifndef SO_EE_ORIGIN_ZEROCOPY
 #define SO_EE_ORIGIN_ZEROCOPY		5
@@ -164,17 +168,39 @@ static int do_accept(int fd)
 	return fd;
 }
 
-static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
+static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
+{
+	struct cmsghdr *cm;
+
+	if (!msg->msg_control)
+		error(1, errno, "NULL cookie");
+	cm = (void *)msg->msg_control;
+	cm->cmsg_len = CMSG_LEN(sizeof(cookie));
+	cm->cmsg_level = SOL_RDS;
+	cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
+	memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
+}
+
+static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
 {
 	int ret, len, i, flags;
+	static uint32_t cookie;
+	char ckbuf[CMSG_SPACE(sizeof(cookie))];
 
 	len = 0;
 	for (i = 0; i < msg->msg_iovlen; i++)
 		len += msg->msg_iov[i].iov_len;
 
 	flags = MSG_DONTWAIT;
-	if (do_zerocopy)
+	if (do_zerocopy) {
 		flags |= MSG_ZEROCOPY;
+		if (domain == PF_RDS) {
+			memset(&msg->msg_control, 0, sizeof(msg->msg_control));
+			msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
+			msg->msg_control = (struct cmsghdr *)ckbuf;
+			add_zcopy_cookie(msg, ++cookie);
+		}
+	}
 
 	ret = sendmsg(fd, msg, flags);
 	if (ret == -1 && errno == EAGAIN)
@@ -190,6 +216,10 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy)
 		if (do_zerocopy && ret)
 			expected_completions++;
 	}
+	if (do_zerocopy && domain == PF_RDS) {
+		msg->msg_control = NULL;
+		msg->msg_controllen = 0;
+	}
 
 	return true;
 }
@@ -216,7 +246,9 @@ static void do_sendmsg_corked(int fd, struct msghdr *msg)
 		msg->msg_iov[0].iov_len = payload_len + extra_len;
 		extra_len = 0;
 
-		do_sendmsg(fd, msg, do_zerocopy);
+		do_sendmsg(fd, msg, do_zerocopy,
+			   (cfg_dst_addr.ss_family == AF_INET ?
+			    PF_INET : PF_INET6));
 	}
 
 	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
@@ -300,13 +332,38 @@ static int do_setup_tx(int domain, int type, int protocol)
 	if (cfg_zerocopy)
 		do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
 
-	if (domain != PF_PACKET)
+	if (domain != PF_PACKET && domain != PF_RDS)
 		if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
 			error(1, errno, "connect");
 
+	if (domain == PF_RDS) {
+		if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
+			error(1, errno, "bind");
+	}
+
 	return fd;
 }
 
+static int do_process_zerocopy_cookies(struct sock_extended_err *serr,
+				       uint32_t *ckbuf, size_t nbytes)
+{
+	int ncookies, i;
+
+	if (serr->ee_errno != 0)
+		error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
+	ncookies = serr->ee_data;
+	if (ncookies > SO_EE_ORIGIN_MAX_ZCOOKIES)
+		error(1, 0, "Returned %d cookies, max expected %d\n",
+		      ncookies, SO_EE_ORIGIN_MAX_ZCOOKIES);
+	if (nbytes != ncookies * sizeof(uint32_t))
+		error(1, 0, "Expected %d cookies, got %ld\n",
+		      ncookies, nbytes/sizeof(uint32_t));
+	for (i = 0; i < ncookies; i++)
+		if (cfg_verbose >= 2)
+			fprintf(stderr, "%d\n", ckbuf[i]);
+	return ncookies;
+}
+
 static bool do_recv_completion(int fd)
 {
 	struct sock_extended_err *serr;
@@ -315,10 +372,17 @@ static bool do_recv_completion(int fd)
 	uint32_t hi, lo, range;
 	int ret, zerocopy;
 	char control[100];
+	uint32_t ckbuf[SO_EE_ORIGIN_MAX_ZCOOKIES];
+	struct iovec iov;
 
 	msg.msg_control = control;
 	msg.msg_controllen = sizeof(control);
 
+	iov.iov_base = ckbuf;
+	iov.iov_len = (SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(ckbuf[0]));
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
 	ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
 	if (ret == -1 && errno == EAGAIN)
 		return false;
@@ -337,6 +401,11 @@ static bool do_recv_completion(int fd)
 		      cm->cmsg_level, cm->cmsg_type);
 
 	serr = (void *) CMSG_DATA(cm);
+
+	if (serr->ee_origin == SO_EE_ORIGIN_ZCOOKIE) {
+		completions += do_process_zerocopy_cookies(serr, ckbuf, ret);
+		return true;
+	}
 	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
 		error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
 	if (serr->ee_errno != 0)
@@ -444,6 +513,13 @@ static void do_tx(int domain, int type, int protocol)
 		msg.msg_iovlen++;
 	}
 
+	if (domain == PF_RDS) {
+		msg.msg_name = &cfg_dst_addr;
+		msg.msg_namelen =  (cfg_dst_addr.ss_family == AF_INET ?
+				    sizeof(struct sockaddr_in) :
+				    sizeof(struct sockaddr_in6));
+	}
+
 	iov[2].iov_base = payload;
 	iov[2].iov_len = cfg_payload_len;
 	msg.msg_iovlen++;
@@ -454,7 +530,7 @@ static void do_tx(int domain, int type, int protocol)
 		if (cfg_cork)
 			do_sendmsg_corked(fd, &msg);
 		else
-			do_sendmsg(fd, &msg, cfg_zerocopy);
+			do_sendmsg(fd, &msg, cfg_zerocopy, domain);
 
 		while (!do_poll(fd, POLLOUT)) {
 			if (cfg_zerocopy)
@@ -555,6 +631,40 @@ static void do_flush_datagram(int fd, int type)
 	bytes += cfg_payload_len;
 }
 
+
+static void do_recvmsg(int fd)
+{
+	int ret, off = 0;
+	char *buf;
+	struct iovec iov;
+	struct msghdr msg;
+	struct sockaddr_storage din;
+
+	buf = calloc(cfg_payload_len, sizeof(char));
+	iov.iov_base = buf;
+	iov.iov_len = cfg_payload_len;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_name = &din;
+	msg.msg_namelen = sizeof(din);
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	ret = recvmsg(fd, &msg, MSG_TRUNC);
+
+	if (ret == -1)
+		error(1, errno, "recv");
+	if (ret != cfg_payload_len)
+		error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
+
+	if (memcmp(buf + off, payload, ret))
+		error(1, 0, "recv: data mismatch");
+
+	free(buf);
+	packets++;
+	bytes += cfg_payload_len;
+}
+
 static void do_rx(int domain, int type, int protocol)
 {
 	uint64_t tstop;
@@ -566,6 +676,8 @@ static void do_rx(int domain, int type, int protocol)
 	do {
 		if (type == SOCK_STREAM)
 			do_flush_tcp(fd);
+		else if (domain == PF_RDS)
+			do_recvmsg(fd);
 		else
 			do_flush_datagram(fd, type);
 
@@ -610,6 +722,7 @@ static void parse_opts(int argc, char **argv)
 				    40 /* max tcp options */;
 	int c;
 	char *daddr = NULL, *saddr = NULL;
+	char *cfg_test;
 
 	cfg_payload_len = max_payload_len;
 
@@ -667,6 +780,14 @@ static void parse_opts(int argc, char **argv)
 			break;
 		}
 	}
+
+	cfg_test = argv[argc - 1];
+	if (strcmp(cfg_test, "rds") == 0) {
+		if (!daddr)
+			error(1, 0, "-D <server addr> required for PF_RDS\n");
+		if (!cfg_rx && !saddr)
+			error(1, 0, "-S <client addr> required for PF_RDS\n");
+	}
 	setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
 	setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
 
@@ -699,6 +820,8 @@ int main(int argc, char **argv)
 		do_test(cfg_family, SOCK_STREAM, 0);
 	else if (!strcmp(cfg_test, "udp"))
 		do_test(cfg_family, SOCK_DGRAM, 0);
+	else if (!strcmp(cfg_test, "rds"))
+		do_test(PF_RDS, SOCK_SEQPACKET, 0);
 	else
 		error(1, 0, "unknown cfg_test %s", cfg_test);