Browse Source

Merge branch 'net-smc-IPv6-support'

Ursula Braun says:

====================
net/smc: IPv6 support

these smc patches for the net-next tree add IPv6 support.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller 7 years ago
parent
commit
b8124b53d1
4 changed files with 235 additions and 67 deletions
  1. 51 27
      net/smc/af_smc.c
  2. 3 1
      net/smc/smc.h
  3. 169 34
      net/smc/smc_clc.c
  4. 12 5
      net/smc/smc_clc.h

+ 51 - 27
net/smc/af_smc.c

@@ -7,12 +7,11 @@
  *  applicable with RoCE-cards only
  *
  *  Initial restrictions:
- *    - IPv6 support postponed
  *    - support for alternate links postponed
  *    - partial support for non-blocking sockets only
  *    - support for urgent data postponed
  *
- *  Copyright IBM Corp. 2016
+ *  Copyright IBM Corp. 2016, 2018
  *
  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
  *              based on prototype from Frank Blaschka
@@ -64,6 +63,10 @@ static struct smc_hashinfo smc_v4_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
 };
 
+static struct smc_hashinfo smc_v6_hashinfo = {
+	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
+};
+
 int smc_hash_sk(struct sock *sk)
 {
 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
@@ -103,6 +106,18 @@ struct proto smc_proto = {
 };
 EXPORT_SYMBOL_GPL(smc_proto);
 
+struct proto smc_proto6 = {
+	.name		= "SMC6",
+	.owner		= THIS_MODULE,
+	.keepalive	= smc_set_keepalive,
+	.hash		= smc_hash_sk,
+	.unhash		= smc_unhash_sk,
+	.obj_size	= sizeof(struct smc_sock),
+	.h.smc_hash	= &smc_v6_hashinfo,
+	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto6);
+
 static int smc_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -159,19 +174,22 @@ static void smc_destruct(struct sock *sk)
 	sk_refcnt_debug_dec(sk);
 }
 
-static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
+				   int protocol)
 {
 	struct smc_sock *smc;
+	struct proto *prot;
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
+	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
+	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
 	if (!sk)
 		return NULL;
 
 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
 	sk->sk_state = SMC_INIT;
 	sk->sk_destruct = smc_destruct;
-	sk->sk_protocol = SMCPROTO_SMC;
+	sk->sk_protocol = protocol;
 	smc = smc_sk(sk);
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 	INIT_LIST_HEAD(&smc->accept_q);
@@ -198,10 +216,13 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 
 	rc = -EAFNOSUPPORT;
+	if (addr->sin_family != AF_INET &&
+	    addr->sin_family != AF_INET6 &&
+	    addr->sin_family != AF_UNSPEC)
+		goto out;
 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
-	if ((addr->sin_family != AF_INET) &&
-	    ((addr->sin_family != AF_UNSPEC) ||
-	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
+	if (addr->sin_family == AF_UNSPEC &&
+	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
 		goto out;
 
 	lock_sock(sk);
@@ -529,7 +550,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 	/* separate smc parameter checking to be safe */
 	if (alen < sizeof(addr->sa_family))
 		goto out_err;
-	if (addr->sa_family != AF_INET)
+	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
 		goto out_err;
 
 	lock_sock(sk);
@@ -571,7 +592,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 	int rc;
 
 	release_sock(lsk);
-	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
+	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
 	if (!new_sk) {
 		rc = -ENOMEM;
 		lsk->sk_err = ENOMEM;
@@ -767,8 +788,6 @@ static void smc_listen_work(struct work_struct *work)
 	struct smc_link *link;
 	int reason_code = 0;
 	int rc = 0;
-	__be32 subnet;
-	u8 prefix_len;
 	u8 ibport;
 
 	/* check if peer is smc capable */
@@ -803,17 +822,11 @@ static void smc_listen_work(struct work_struct *work)
 		goto decline_rdma;
 	}
 
-	/* determine subnet and mask from internal TCP socket */
-	rc = smc_clc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
-	if (rc) {
-		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
-		goto decline_rdma;
-	}
-
 	pclc = (struct smc_clc_msg_proposal *)&buf;
 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
-	if (pclc_prfx->outgoing_subnet != subnet ||
-	    pclc_prfx->prefix_len != prefix_len) {
+
+	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
+	if (rc) {
 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 		goto decline_rdma;
 	}
@@ -1375,6 +1388,7 @@ static const struct proto_ops smc_sock_ops = {
 static int smc_create(struct net *net, struct socket *sock, int protocol,
 		      int kern)
 {
+	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
 	struct smc_sock *smc;
 	struct sock *sk;
 	int rc;
@@ -1384,20 +1398,20 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
 		goto out;
 
 	rc = -EPROTONOSUPPORT;
-	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
+	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
 		goto out;
 
 	rc = -ENOBUFS;
 	sock->ops = &smc_sock_ops;
-	sk = smc_sock_alloc(net, sock);
+	sk = smc_sock_alloc(net, sock, protocol);
 	if (!sk)
 		goto out;
 
 	/* create internal TCP socket for CLC handshake and fallback */
 	smc = smc_sk(sk);
 	smc->use_fallback = false; /* assume rdma capability first */
-	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
-			      IPPROTO_TCP, &smc->clcsock);
+	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
+			      &smc->clcsock);
 	if (rc) {
 		sk_common_release(sk);
 		goto out;
@@ -1437,16 +1451,23 @@ static int __init smc_init(void)
 
 	rc = proto_register(&smc_proto, 1);
 	if (rc) {
-		pr_err("%s: proto_register fails with %d\n", __func__, rc);
+		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
 		goto out_pnet;
 	}
 
+	rc = proto_register(&smc_proto6, 1);
+	if (rc) {
+		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
+		goto out_proto;
+	}
+
 	rc = sock_register(&smc_sock_family_ops);
 	if (rc) {
 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
-		goto out_proto;
+		goto out_proto6;
 	}
 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
+	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
 
 	rc = smc_ib_register_client();
 	if (rc) {
@@ -1459,6 +1480,8 @@ static int __init smc_init(void)
 
 out_sock:
 	sock_unregister(PF_SMC);
+out_proto6:
+	proto_unregister(&smc_proto6);
 out_proto:
 	proto_unregister(&smc_proto);
 out_pnet:
@@ -1483,6 +1506,7 @@ static void __exit smc_exit(void)
 	static_branch_disable(&tcp_have_smc);
 	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
+	proto_unregister(&smc_proto6);
 	proto_unregister(&smc_proto);
 	smc_pnet_exit();
 }

+ 3 - 1
net/smc/smc.h

@@ -18,11 +18,13 @@
 
 #include "smc_ib.h"
 
-#define SMCPROTO_SMC		0	/* SMC protocol */
+#define SMCPROTO_SMC		0	/* SMC protocol, IPv4 */
+#define SMCPROTO_SMC6		1	/* SMC protocol, IPv6 */
 
 #define SMC_MAX_PORTS		2	/* Max # of ports */
 
 extern struct proto smc_proto;
+extern struct proto smc_proto6;
 
 #ifdef ATOMIC64_INIT
 #define KERNEL_HAS_ATOMIC64

+ 169 - 34
net/smc/smc_clc.c

@@ -5,7 +5,7 @@
  *  CLC (connection layer control) handshake over initial TCP socket to
  *  prepare for RDMA traffic
  *
- *  Copyright IBM Corp. 2016
+ *  Copyright IBM Corp. 2016, 2018
  *
  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
  */
@@ -15,6 +15,7 @@
 #include <linux/if_ether.h>
 #include <linux/sched/signal.h>
 
+#include <net/addrconf.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 
@@ -74,15 +75,67 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
 	return true;
 }
 
-/* determine subnet and mask of internal TCP socket */
-int smc_clc_netinfo_by_tcpsk(struct socket *clcsock,
-			     __be32 *subnet, u8 *prefix_len)
+/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */
+static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
+				 struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dst->dev);
+
+	if (!in_dev)
+		return -ENODEV;
+	for_ifa(in_dev) {
+		if (!inet_ifa_match(ipv4, ifa))
+			continue;
+		prop->prefix_len = inet_mask_len(ifa->ifa_mask);
+		prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask;
+		/* prop->ipv6_prefixes_cnt = 0; already done by memset before */
+		return 0;
+	} endfor_ifa(in_dev);
+	return -ENOENT;
+}
+
+/* fill CLC proposal msg with ipv6 prefixes from device */
+static int smc_clc_prfx_set6_rcu(struct dst_entry *dst,
+				 struct smc_clc_msg_proposal_prefix *prop,
+				 struct smc_clc_ipv6_prefix *ipv6_prfx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet6_dev *in6_dev = __in6_dev_get(dst->dev);
+	struct inet6_ifaddr *ifa;
+	int cnt = 0;
+
+	if (!in6_dev)
+		return -ENODEV;
+	/* use a maximum of 8 IPv6 prefixes from device */
+	list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
+		if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
+			continue;
+		ipv6_addr_prefix(&ipv6_prfx[cnt].prefix,
+				 &ifa->addr, ifa->prefix_len);
+		ipv6_prfx[cnt].prefix_len = ifa->prefix_len;
+		cnt++;
+		if (cnt == SMC_CLC_MAX_V6_PREFIX)
+			break;
+	}
+	prop->ipv6_prefixes_cnt = cnt;
+	if (cnt)
+		return 0;
+#endif
+	return -ENOENT;
+}
+
+/* retrieve and set prefixes in CLC proposal msg */
+static int smc_clc_prfx_set(struct socket *clcsock,
+			    struct smc_clc_msg_proposal_prefix *prop,
+			    struct smc_clc_ipv6_prefix *ipv6_prfx)
 {
 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
-	struct in_device *in_dev;
-	struct sockaddr_in addr;
+	struct sockaddr_storage addrs;
+	struct sockaddr_in6 *addr6;
+	struct sockaddr_in *addr;
 	int rc = -ENOENT;
 
+	memset(prop, 0, sizeof(*prop));
 	if (!dst) {
 		rc = -ENOTCONN;
 		goto out;
@@ -91,22 +144,97 @@ int smc_clc_netinfo_by_tcpsk(struct socket *clcsock,
 		rc = -ENODEV;
 		goto out_rel;
 	}
-
 	/* get address to which the internal TCP socket is bound */
-	kernel_getsockname(clcsock, (struct sockaddr *)&addr);
-	/* analyze IPv4 specific data of net_device belonging to TCP socket */
+	kernel_getsockname(clcsock, (struct sockaddr *)&addrs);
+	/* analyze IP specific data of net_device belonging to TCP socket */
+	addr6 = (struct sockaddr_in6 *)&addrs;
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(dst->dev);
+	if (addrs.ss_family == PF_INET) {
+		/* IPv4 */
+		addr = (struct sockaddr_in *)&addrs;
+		rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop);
+	} else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) {
+		/* mapped IPv4 address - peer is IPv4 only */
+		rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3],
+					   prop);
+	} else {
+		/* IPv6 */
+		rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx);
+	}
+	rcu_read_unlock();
+out_rel:
+	dst_release(dst);
+out:
+	return rc;
+}
+
+/* match ipv4 addrs of dev against addr in CLC proposal */
+static int smc_clc_prfx_match4_rcu(struct net_device *dev,
+				   struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+	if (!in_dev)
+		return -ENODEV;
 	for_ifa(in_dev) {
-		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
-			continue;
-		*prefix_len = inet_mask_len(ifa->ifa_mask);
-		*subnet = ifa->ifa_address & ifa->ifa_mask;
-		rc = 0;
-		break;
+		if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) &&
+		    inet_ifa_match(prop->outgoing_subnet, ifa))
+			return 0;
 	} endfor_ifa(in_dev);
-	rcu_read_unlock();
 
+	return -ENOENT;
+}
+
+/* match ipv6 addrs of dev against addrs in CLC proposal */
+static int smc_clc_prfx_match6_rcu(struct net_device *dev,
+				   struct smc_clc_msg_proposal_prefix *prop)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet6_dev *in6_dev = __in6_dev_get(dev);
+	struct smc_clc_ipv6_prefix *ipv6_prfx;
+	struct inet6_ifaddr *ifa;
+	int i, max;
+
+	if (!in6_dev)
+		return -ENODEV;
+	/* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */
+	ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop));
+	max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX);
+	list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
+		if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
+			continue;
+		for (i = 0; i < max; i++) {
+			if (ifa->prefix_len == ipv6_prfx[i].prefix_len &&
+			    ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix,
+					      ifa->prefix_len))
+				return 0;
+		}
+	}
+#endif
+	return -ENOENT;
+}
+
+/* check if proposed prefixes match one of our device prefixes */
+int smc_clc_prfx_match(struct socket *clcsock,
+		       struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	int rc;
+
+	if (!dst) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	if (!dst->dev) {
+		rc = -ENODEV;
+		goto out_rel;
+	}
+	rcu_read_lock();
+	if (!prop->ipv6_prefixes_cnt)
+		rc = smc_clc_prfx_match4_rcu(dst->dev, prop);
+	else
+		rc = smc_clc_prfx_match6_rcu(dst->dev, prop);
+	rcu_read_unlock();
 out_rel:
 	dst_release(dst);
 out:
@@ -232,16 +360,24 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 			  struct smc_ib_device *smcibdev,
 			  u8 ibport)
 {
+	struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
 	struct smc_clc_msg_proposal_prefix pclc_prfx;
 	struct smc_clc_msg_proposal pclc;
 	struct smc_clc_msg_trail trl;
+	int len, i, plen, rc;
 	int reason_code = 0;
-	struct kvec vec[3];
+	struct kvec vec[4];
 	struct msghdr msg;
-	int len, plen, rc;
+
+	/* retrieve ip prefixes for CLC proposal msg */
+	rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx);
+	if (rc)
+		return SMC_CLC_DECL_CNFERR; /* configuration error */
 
 	/* send SMC Proposal CLC message */
-	plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl);
+	plen = sizeof(pclc) + sizeof(pclc_prfx) +
+	       (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) +
+	       sizeof(trl);
 	memset(&pclc, 0, sizeof(pclc));
 	memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	pclc.hdr.type = SMC_CLC_PROPOSAL;
@@ -252,23 +388,22 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
 	pclc.iparea_offset = htons(0);
 
-	memset(&pclc_prfx, 0, sizeof(pclc_prfx));
-	/* determine subnet and mask from internal TCP socket */
-	rc = smc_clc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet,
-				      &pclc_prfx.prefix_len);
-	if (rc)
-		return SMC_CLC_DECL_CNFERR; /* configuration error */
-	pclc_prfx.ipv6_prefixes_cnt = 0;
 	memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	memset(&msg, 0, sizeof(msg));
-	vec[0].iov_base = &pclc;
-	vec[0].iov_len = sizeof(pclc);
-	vec[1].iov_base = &pclc_prfx;
-	vec[1].iov_len = sizeof(pclc_prfx);
-	vec[2].iov_base = &trl;
-	vec[2].iov_len = sizeof(trl);
+	i = 0;
+	vec[i].iov_base = &pclc;
+	vec[i++].iov_len = sizeof(pclc);
+	vec[i].iov_base = &pclc_prfx;
+	vec[i++].iov_len = sizeof(pclc_prfx);
+	if (pclc_prfx.ipv6_prefixes_cnt > 0) {
+		vec[i].iov_base = &ipv6_prfx[0];
+		vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt *
+				   sizeof(ipv6_prfx[0]);
+	}
+	vec[i].iov_base = &trl;
+	vec[i++].iov_len = sizeof(trl);
 	/* due to the few bytes needed for clc-handshake this cannot block */
-	len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen);
+	len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
 	if (len < sizeof(pclc)) {
 		if (len >= 0) {
 			reason_code = -ENETUNREACH;

+ 12 - 5
net/smc/smc_clc.h

@@ -60,10 +60,15 @@ struct smc_clc_msg_local {	/* header2 of clc messages */
 	u8 mac[6];		/* mac of ib_device port */
 };
 
+#define SMC_CLC_MAX_V6_PREFIX	8
+
+/* Struct would be 4 byte aligned, but it is used in an array that is sent
+ * to peers and must conform to RFC7609, hence we need to use packed here.
+ */
 struct smc_clc_ipv6_prefix {
-	u8 prefix[4];
+	struct in6_addr prefix;
 	u8 prefix_len;
-} __packed;
+} __packed;			/* format defined in RFC7609 */
 
 struct smc_clc_msg_proposal_prefix {	/* prefix part of clc proposal message*/
 	__be32 outgoing_subnet;	/* subnet mask */
@@ -79,9 +84,11 @@ struct smc_clc_msg_proposal {	/* clc proposal message sent by Linux */
 } __aligned(4);
 
 #define SMC_CLC_PROPOSAL_MAX_OFFSET	0x28
-#define SMC_CLC_PROPOSAL_MAX_PREFIX	(8 * sizeof(struct smc_clc_ipv6_prefix))
+#define SMC_CLC_PROPOSAL_MAX_PREFIX	(SMC_CLC_MAX_V6_PREFIX * \
+					 sizeof(struct smc_clc_ipv6_prefix))
 #define SMC_CLC_MAX_LEN		(sizeof(struct smc_clc_msg_proposal) + \
 				 SMC_CLC_PROPOSAL_MAX_OFFSET + \
+				 sizeof(struct smc_clc_msg_proposal_prefix) + \
 				 SMC_CLC_PROPOSAL_MAX_PREFIX + \
 				 sizeof(struct smc_clc_msg_trail))
 
@@ -122,8 +129,8 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
 	       ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
 }
 
-int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
-			     u8 *prefix_len);
+int smc_clc_prfx_match(struct socket *clcsock,
+		       struct smc_clc_msg_proposal_prefix *prop);
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		     u8 expected_type);
 int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);