Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
 "This has been a smaller cycle with many of the commits being smallish
  code fixes and improvements across the drivers.

   - Driver updates for bnxt_re, cxgb4, hfi1, hns, mlx5, nes, qedr, and
     rxe

   - Memory window support in hns

   - mlx5 user API 'flow mutate/steering' allows accessing the full
     packet mangling and matching machinery from user space

   - Support inter-working with verbs API calls in the 'devx' mlx5 user
     API, and provide options to use devx with less privilege

   - Modernize the use of sysfs and the device interface to use attribute
     groups and cdev properly for uverbs, and clean up some of the core
     code's device list management

   - More progress on net namespaces for RDMA devices

   - Consolidate driver BAR mmapping support into core code helpers and
     rework how RDMA holds pointers to mm_struct for get_user_pages
     cases

   - First pass to use 'dev_name' instead of ib_device->name

   - Device renaming for RDMA devices"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (242 commits)
  IB/mlx5: Add support for extended atomic operations
  RDMA/core: Fix comment for hw stats init for port == 0
  RDMA/core: Refactor ib_register_device() function
  RDMA/core: Fix unwinding flow in case of error to register device
  ib_srp: Remove WARN_ON in srp_terminate_io()
  IB/mlx5: Allow scatter to CQE without global signaled WRs
  IB/mlx5: Verify that driver supports user flags
  IB/mlx5: Support scatter to CQE for DC transport type
  RDMA/drivers: Use core provided API for registering device attributes
  RDMA/core: Allow existing drivers to set one sysfs group per device
  IB/rxe: Remove unnecessary enum values
  RDMA/umad: Use kernel API to allocate umad indexes
  RDMA/uverbs: Use kernel API to allocate uverbs indexes
  RDMA/core: Increase total number of RDMA ports across all devices
  IB/mlx4: Add port and TID to MAD debug print
  IB/mlx4: Enable debug print of SMPs
  RDMA/core: Rename ports_parent to ports_kobj
  RDMA/core: Do not expose unsupported counters
  IB/mlx4: Refer to the device kobject instead of ports_parent
  RDMA/nldev: Allow IB device rename through RDMA netlink
  ...
Linus Torvalds 6 years ago
parent
commit
da19a102ce
100 changed files with 4242 additions and 3445 deletions
  1. + 18 - 0  Documentation/ABI/testing/sysfs-class-net
  2. + 1 - 0  drivers/infiniband/Kconfig
  3. + 252 - 154  drivers/infiniband/core/addr.c
  4. + 55 - 24  drivers/infiniband/core/cache.c
  5. + 6 - 3  drivers/infiniband/core/cm.c
  6. + 177 - 74  drivers/infiniband/core/cma.c
  7. + 1 - 1  drivers/infiniband/core/cma_configfs.c
  8. + 10 - 2  drivers/infiniband/core/core_priv.h
  9. + 7 - 3  drivers/infiniband/core/cq.c
  10. + 169 - 95  drivers/infiniband/core/device.c
  11. + 3 - 2  drivers/infiniband/core/fmr_pool.c
  12. + 1 - 1  drivers/infiniband/core/iwcm.c
  13. + 43 - 37  drivers/infiniband/core/mad.c
  14. + 1 - 1  drivers/infiniband/core/mad_priv.h
  15. + 2 - 2  drivers/infiniband/core/netlink.c
  16. + 36 - 1  drivers/infiniband/core/nldev.c
  17. + 13 - 43  drivers/infiniband/core/rdma_core.c
  18. + 1 - 0  drivers/infiniband/core/rdma_core.h
  19. + 24 - 6  drivers/infiniband/core/restrack.c
  20. + 3 - 5  drivers/infiniband/core/sa.h
  21. + 11 - 59  drivers/infiniband/core/sa_query.c
  22. + 3 - 4  drivers/infiniband/core/security.c
  23. + 47 - 54  drivers/infiniband/core/sysfs.c
  24. + 66 - 59  drivers/infiniband/core/umem.c
  25. + 286 - 335  drivers/infiniband/core/umem_odp.c
  26. + 6 - 7  drivers/infiniband/core/user_mad.c
  27. + 6 - 9  drivers/infiniband/core/uverbs.h
  28. + 12 - 31  drivers/infiniband/core/uverbs_cmd.c
  29. + 137 - 3  drivers/infiniband/core/uverbs_ioctl.c
  30. + 276 - 64  drivers/infiniband/core/uverbs_main.c
  31. + 2 - 5  drivers/infiniband/core/uverbs_std_types_flow_action.c
  32. + 12 - 0  drivers/infiniband/core/uverbs_uapi.c
  33. + 10 - 9  drivers/infiniband/core/verbs.c
  34. + 2 - 1  drivers/infiniband/hw/bnxt_re/bnxt_re.h
  35. + 10 - 1  drivers/infiniband/hw/bnxt_re/hw_counters.c
  36. + 3 - 0  drivers/infiniband/hw/bnxt_re/hw_counters.h
  37. + 2 - 2  drivers/infiniband/hw/bnxt_re/ib_verbs.c
  38. + 65 - 60  drivers/infiniband/hw/bnxt_re/main.c
  39. + 54 - 80  drivers/infiniband/hw/bnxt_re/qplib_fp.c
  40. + 49 - 39  drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
  41. + 4 - 0  drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
  42. + 14 - 15  drivers/infiniband/hw/bnxt_re/qplib_res.c
  43. + 45 - 32  drivers/infiniband/hw/bnxt_re/qplib_sp.c
  44. + 10 - 0  drivers/infiniband/hw/bnxt_re/qplib_sp.h
  45. + 5 - 0  drivers/infiniband/hw/bnxt_re/roce_hsi.h
  46. + 20 - 35  drivers/infiniband/hw/cxgb3/iwch_provider.c
  47. + 1 - 2  drivers/infiniband/hw/cxgb4/cm.c
  48. + 1 - 1  drivers/infiniband/hw/cxgb4/cq.c
  49. + 20 - 30  drivers/infiniband/hw/cxgb4/provider.c
  50. + 5 - 5  drivers/infiniband/hw/cxgb4/qp.c
  51. + 36 - 6  drivers/infiniband/hw/hfi1/Makefile
  52. + 2 - 2  drivers/infiniband/hw/hfi1/affinity.c
  53. + 134 - 352  drivers/infiniband/hw/hfi1/chip.c
  54. + 48 - 23  drivers/infiniband/hw/hfi1/chip.h
  55. + 4 - 0  drivers/infiniband/hw/hfi1/chip_registers.h
  56. + 3 - 1  drivers/infiniband/hw/hfi1/file_ops.c
  57. + 20 - 28  drivers/infiniband/hw/hfi1/hfi.h
  58. + 67 - 46  drivers/infiniband/hw/hfi1/init.c
  59. + 94 - 0  drivers/infiniband/hw/hfi1/iowait.c
  60. + 134 - 58  drivers/infiniband/hw/hfi1/iowait.h
  61. + 2 - 2  drivers/infiniband/hw/hfi1/mad.c
  62. + 363 - 0  drivers/infiniband/hw/hfi1/msix.c
  63. + 64 - 0  drivers/infiniband/hw/hfi1/msix.h
  64. + 26 - 48  drivers/infiniband/hw/hfi1/pcie.c
  65. + 0 - 8  drivers/infiniband/hw/hfi1/pio.c
  66. + 73 - 27  drivers/infiniband/hw/hfi1/qp.c
  67. + 17 - 14  drivers/infiniband/hw/hfi1/qp.h
  68. + 13 - 11  drivers/infiniband/hw/hfi1/rc.c
  69. + 9 - 373  drivers/infiniband/hw/hfi1/ruc.c
  70. + 22 - 34  drivers/infiniband/hw/hfi1/sdma.c
  71. + 6 - 15  drivers/infiniband/hw/hfi1/sdma.h
  72. + 32 - 37  drivers/infiniband/hw/hfi1/sysfs.c
  73. + 2 - 1  drivers/infiniband/hw/hfi1/trace.h
  74. + 54 - 0  drivers/infiniband/hw/hfi1/trace_iowait.h
  75. + 7 - 7  drivers/infiniband/hw/hfi1/uc.c
  76. + 11 - 11  drivers/infiniband/hw/hfi1/ud.c
  77. + 64 - 73  drivers/infiniband/hw/hfi1/user_sdma.c
  78. + 9 - 11  drivers/infiniband/hw/hfi1/user_sdma.h
  79. + 19 - 232  drivers/infiniband/hw/hfi1/verbs.c
  80. + 5 - 30  drivers/infiniband/hw/hfi1/verbs.h
  81. + 4 - 7  drivers/infiniband/hw/hfi1/verbs_txreq.h
  82. + 6 - 6  drivers/infiniband/hw/hfi1/vnic_main.c
  83. + 12 - 9  drivers/infiniband/hw/hfi1/vnic_sdma.c
  84. + 1 - 0  drivers/infiniband/hw/hns/Kconfig
  85. + 5 - 1  drivers/infiniband/hw/hns/hns_roce_ah.c
  86. + 37 - 8  drivers/infiniband/hw/hns/hns_roce_device.h
  87. + 2 - 2  drivers/infiniband/hw/hns/hns_roce_hw_v1.c
  88. + 394 - 235  drivers/infiniband/hw/hns/hns_roce_hw_v2.c
  89. + 72 - 24  drivers/infiniband/hw/hns/hns_roce_hw_v2.h
  90. + 40 - 83  drivers/infiniband/hw/hns/hns_roce_main.c
  91. + 204 - 8  drivers/infiniband/hw/hns/hns_roce_mr.c
  92. + 36 - 5  drivers/infiniband/hw/hns/hns_roce_qp.c
  93. + 1 - 1  drivers/infiniband/hw/i40iw/i40iw_cm.c
  94. + 24 - 49  drivers/infiniband/hw/i40iw/i40iw_verbs.c
  95. + 1 - 0  drivers/infiniband/hw/mlx4/Kconfig
  96. + 11 - 9  drivers/infiniband/hw/mlx4/mad.c
  97. + 43 - 139  drivers/infiniband/hw/mlx4/main.c
  98. + 1 - 1  drivers/infiniband/hw/mlx4/mcg.c
  99. + 0 - 5  drivers/infiniband/hw/mlx4/mlx4_ib.h
  100. + 1 - 7  drivers/infiniband/hw/mlx4/qp.c

+ 18 - 0
Documentation/ABI/testing/sysfs-class-net

@@ -91,6 +91,24 @@ Description:
 		stacked (e.g: VLAN interfaces) but still have the same MAC
 		address as their parent device.
 
+What:		/sys/class/net/<iface>/dev_port
+Date:		February 2014
+KernelVersion:	3.15
+Contact:	netdev@vger.kernel.org
+Description:
+		Indicates the port number of this network device, formatted
+		as a decimal value. Some NICs have multiple independent ports
+		on the same PCI bus, device and function. This attribute allows
+		userspace to distinguish the respective interfaces.
+
+		Note: some device drivers started to use 'dev_id' for this
+		purpose since long before 3.15 and have not adopted the new
+		attribute ever since. To query the port number, some tools look
+		exclusively at 'dev_port', while others only consult 'dev_id'.
+		If a network device has multiple client adapter ports as
+		described in the previous paragraph and does not set this
+		attribute to its port number, it's a kernel bug.
+
 What:		/sys/class/net/<iface>/dormant
 Date:		March 2006
 KernelVersion:	2.6.17

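A minimal userspace sketch of the contract described above: read dev_port for an interface and fall back to dev_id for older drivers that never adopted the new attribute. The interface name "eth0" and the fallback policy are assumptions for illustration, not part of this patch.

/* Sketch: query the port number of a multi-port NIC via sysfs.
 * dev_port (decimal) is preferred; dev_id is the legacy fallback.
 */
#include <stdio.h>

static int read_net_attr(const char *iface, const char *attr, long *val)
{
	char path[256];
	FILE *f;
	int ok;

	snprintf(path, sizeof(path), "/sys/class/net/%s/%s", iface, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;
	ok = (fscanf(f, "%li", val) == 1);	/* %li also accepts the hex-formatted dev_id */
	fclose(f);
	return ok ? 0 : -1;
}

int main(void)
{
	long port;

	if (read_net_attr("eth0", "dev_port", &port) &&
	    read_net_attr("eth0", "dev_id", &port))
		return 1;
	printf("port number: %ld\n", port);
	return 0;
}
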
+ 1 - 0
drivers/infiniband/Kconfig

@@ -26,6 +26,7 @@ config INFINIBAND_USER_MAD
 config INFINIBAND_USER_ACCESS
 	tristate "InfiniBand userspace access (verbs and CM)"
 	select ANON_INODES
+	depends on MMU
 	---help---
 	  Userspace InfiniBand access support.  This enables the
 	  kernel side of userspace verbs and the userspace

+ 252 - 154
drivers/infiniband/core/addr.c

@@ -45,6 +45,7 @@
 #include <net/addrconf.h>
 #include <net/ip6_route.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_sa.h>
 #include <rdma/ib.h>
 #include <rdma/rdma_netlink.h>
 #include <net/netlink.h>
@@ -61,6 +62,7 @@ struct addr_req {
 			 struct rdma_dev_addr *addr, void *context);
 	unsigned long timeout;
 	struct delayed_work work;
+	bool resolve_by_gid_attr;	/* Consider gid attr in resolve phase */
 	int status;
 	u32 seq;
 };
@@ -219,60 +221,75 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr)
 }
 EXPORT_SYMBOL(rdma_addr_size_kss);
 
-void rdma_copy_addr(struct rdma_dev_addr *dev_addr,
-		    const struct net_device *dev,
-		    const unsigned char *dst_dev_addr)
+/**
+ * rdma_copy_src_l2_addr - Copy netdevice source addresses
+ * @dev_addr:	Destination address pointer where to copy the addresses
+ * @dev:	Netdevice whose source addresses to copy
+ *
+ * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice.
+ * This includes unicast address, broadcast address, device type and
+ * interface index.
+ */
+void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+			   const struct net_device *dev)
 {
 	dev_addr->dev_type = dev->type;
 	memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
 	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
-	if (dst_dev_addr)
-		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
 	dev_addr->bound_dev_if = dev->ifindex;
 }
-EXPORT_SYMBOL(rdma_copy_addr);
+EXPORT_SYMBOL(rdma_copy_src_l2_addr);
 
-int rdma_translate_ip(const struct sockaddr *addr,
-		      struct rdma_dev_addr *dev_addr)
+static struct net_device *
+rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in)
 {
-	struct net_device *dev;
+	struct net_device *dev = NULL;
+	int ret = -EADDRNOTAVAIL;
 
-	if (dev_addr->bound_dev_if) {
-		dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
-		if (!dev)
-			return -ENODEV;
-		rdma_copy_addr(dev_addr, dev, NULL);
-		dev_put(dev);
-		return 0;
-	}
-
-	switch (addr->sa_family) {
+	switch (src_in->sa_family) {
 	case AF_INET:
-		dev = ip_dev_find(dev_addr->net,
-			((const struct sockaddr_in *)addr)->sin_addr.s_addr);
-
-		if (!dev)
-			return -EADDRNOTAVAIL;
-
-		rdma_copy_addr(dev_addr, dev, NULL);
-		dev_put(dev);
+		dev = __ip_dev_find(net,
+				    ((const struct sockaddr_in *)src_in)->sin_addr.s_addr,
+				    false);
+		if (dev)
+			ret = 0;
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		rcu_read_lock();
-		for_each_netdev_rcu(dev_addr->net, dev) {
-			if (ipv6_chk_addr(dev_addr->net,
-					  &((const struct sockaddr_in6 *)addr)->sin6_addr,
+		for_each_netdev_rcu(net, dev) {
+			if (ipv6_chk_addr(net,
+					  &((const struct sockaddr_in6 *)src_in)->sin6_addr,
 					  dev, 1)) {
-				rdma_copy_addr(dev_addr, dev, NULL);
+				ret = 0;
 				break;
 			}
 		}
-		rcu_read_unlock();
 		break;
 #endif
 	}
-	return 0;
+	return ret ? ERR_PTR(ret) : dev;
+}
+
+int rdma_translate_ip(const struct sockaddr *addr,
+		      struct rdma_dev_addr *dev_addr)
+{
+	struct net_device *dev;
+
+	if (dev_addr->bound_dev_if) {
+		dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+		if (!dev)
+			return -ENODEV;
+		rdma_copy_src_l2_addr(dev_addr, dev);
+		dev_put(dev);
+		return 0;
+	}
+
+	rcu_read_lock();
+	dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr);
+	if (!IS_ERR(dev))
+		rdma_copy_src_l2_addr(dev_addr, dev);
+	rcu_read_unlock();
+	return PTR_ERR_OR_ZERO(dev);
 }
 EXPORT_SYMBOL(rdma_translate_ip);
 
@@ -295,15 +312,12 @@ static void queue_req(struct addr_req *req)
 	spin_unlock_bh(&lock);
 }
 
-static int ib_nl_fetch_ha(const struct dst_entry *dst,
-			  struct rdma_dev_addr *dev_addr,
+static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr,
 			  const void *daddr, u32 seq, u16 family)
 {
-	if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
+	if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
 		return -EADDRNOTAVAIL;
 
-	/* We fill in what we can, the response will fill the rest */
-	rdma_copy_addr(dev_addr, dst->dev, NULL);
 	return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
 }
 
@@ -322,7 +336,7 @@ static int dst_fetch_ha(const struct dst_entry *dst,
 		neigh_event_send(n, NULL);
 		ret = -ENODATA;
 	} else {
-		rdma_copy_addr(dev_addr, dst->dev, n->ha);
+		memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN);
 	}
 
 	neigh_release(n);
@@ -356,18 +370,22 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
 		(const void *)&dst_in6->sin6_addr;
 	sa_family_t family = dst_in->sa_family;
 
-	/* Gateway + ARPHRD_INFINIBAND -> IB router */
-	if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
-		return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
+	/* If we have a gateway in IB mode then it must be an IB network */
+	if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB)
+		return ib_nl_fetch_ha(dev_addr, daddr, seq, family);
 	else
 		return dst_fetch_ha(dst, dev_addr, daddr);
 }
 
-static int addr4_resolve(struct sockaddr_in *src_in,
-			 const struct sockaddr_in *dst_in,
+static int addr4_resolve(struct sockaddr *src_sock,
+			 const struct sockaddr *dst_sock,
 			 struct rdma_dev_addr *addr,
 			 struct rtable **prt)
 {
+	struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock;
+	const struct sockaddr_in *dst_in =
+			(const struct sockaddr_in *)dst_sock;
+
 	__be32 src_ip = src_in->sin_addr.s_addr;
 	__be32 dst_ip = dst_in->sin_addr.s_addr;
 	struct rtable *rt;
@@ -383,16 +401,8 @@ static int addr4_resolve(struct sockaddr_in *src_in,
 	if (ret)
 		return ret;
 
-	src_in->sin_family = AF_INET;
 	src_in->sin_addr.s_addr = fl4.saddr;
 
-	/* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
-	 * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
-	 * type accordingly.
-	 */
-	if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
-		addr->network = RDMA_NETWORK_IPV4;
-
 	addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
 
 	*prt = rt;
@@ -400,14 +410,16 @@ static int addr4_resolve(struct sockaddr_in *src_in,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static int addr6_resolve(struct sockaddr_in6 *src_in,
-			 const struct sockaddr_in6 *dst_in,
+static int addr6_resolve(struct sockaddr *src_sock,
+			 const struct sockaddr *dst_sock,
 			 struct rdma_dev_addr *addr,
 			 struct dst_entry **pdst)
 {
+	struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock;
+	const struct sockaddr_in6 *dst_in =
+				(const struct sockaddr_in6 *)dst_sock;
 	struct flowi6 fl6;
 	struct dst_entry *dst;
-	struct rt6_info *rt;
 	int ret;
 
 	memset(&fl6, 0, sizeof fl6);
@@ -419,19 +431,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 	if (ret < 0)
 		return ret;
 
-	rt = (struct rt6_info *)dst;
-	if (ipv6_addr_any(&src_in->sin6_addr)) {
-		src_in->sin6_family = AF_INET6;
+	if (ipv6_addr_any(&src_in->sin6_addr))
 		src_in->sin6_addr = fl6.saddr;
-	}
-
-	/* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
-	 * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
-	 * type accordingly.
-	 */
-	if (rt->rt6i_flags & RTF_GATEWAY &&
-	    ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
-		addr->network = RDMA_NETWORK_IPV6;
 
 	addr->hoplimit = ip6_dst_hoplimit(dst);
 
@@ -439,8 +440,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 	return 0;
 }
 #else
-static int addr6_resolve(struct sockaddr_in6 *src_in,
-			 const struct sockaddr_in6 *dst_in,
+static int addr6_resolve(struct sockaddr *src_sock,
+			 const struct sockaddr *dst_sock,
 			 struct rdma_dev_addr *addr,
 			 struct dst_entry **pdst)
 {
@@ -451,36 +452,110 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 static int addr_resolve_neigh(const struct dst_entry *dst,
 			      const struct sockaddr *dst_in,
 			      struct rdma_dev_addr *addr,
+			      unsigned int ndev_flags,
 			      u32 seq)
 {
-	if (dst->dev->flags & IFF_LOOPBACK) {
-		int ret;
+	int ret = 0;
 
-		ret = rdma_translate_ip(dst_in, addr);
-		if (!ret)
-			memcpy(addr->dst_dev_addr, addr->src_dev_addr,
-			       MAX_ADDR_LEN);
+	if (ndev_flags & IFF_LOOPBACK) {
+		memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+	} else {
+		if (!(ndev_flags & IFF_NOARP)) {
+			/* If the device doesn't do ARP internally */
+			ret = fetch_ha(dst, addr, dst_in, seq);
+		}
+	}
+	return ret;
+}
 
-		return ret;
+static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+			    const struct sockaddr *dst_in,
+			    const struct dst_entry *dst,
+			    const struct net_device *ndev)
+{
+	int ret = 0;
+
+	if (dst->dev->flags & IFF_LOOPBACK)
+		ret = rdma_translate_ip(dst_in, dev_addr);
+	else
+		rdma_copy_src_l2_addr(dev_addr, dst->dev);
+
+	/*
+	 * If there's a gateway and type of device not ARPHRD_INFINIBAND,
+	 * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
+	 * network type accordingly.
+	 */
+	if (has_gateway(dst, dst_in->sa_family) &&
+	    ndev->type != ARPHRD_INFINIBAND)
+		dev_addr->network = dst_in->sa_family == AF_INET ?
+						RDMA_NETWORK_IPV4 :
+						RDMA_NETWORK_IPV6;
+	else
+		dev_addr->network = RDMA_NETWORK_IB;
+
+	return ret;
+}
+
+static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
+				 unsigned int *ndev_flags,
+				 const struct sockaddr *dst_in,
+				 const struct dst_entry *dst)
+{
+	struct net_device *ndev = READ_ONCE(dst->dev);
+
+	*ndev_flags = ndev->flags;
+	/* A physical device must be the RDMA device to use */
+	if (ndev->flags & IFF_LOOPBACK) {
+		/*
+		 * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or
+		 * loopback IP address. So if route is resolved to loopback
+		 * interface, translate that to a real ndev based on non
+		 * loopback IP address.
+		 */
+		ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in);
+		if (IS_ERR(ndev))
+			return -ENODEV;
 	}
 
-	/* If the device doesn't do ARP internally */
-	if (!(dst->dev->flags & IFF_NOARP))
-		return fetch_ha(dst, addr, dst_in, seq);
+	return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
+}
+
+static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)
+{
+	struct net_device *ndev;
 
-	rdma_copy_addr(addr, dst->dev, NULL);
+	ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr);
+	if (IS_ERR(ndev))
+		return PTR_ERR(ndev);
 
+	/*
+	 * Since we are holding the rcu, reading net and ifindex
+	 * are safe without any additional reference; because
+	 * change_net_namespace() in net/core/dev.c does rcu sync
+	 * after it changes the state to IFF_DOWN and before
+	 * updating netdev fields {net, ifindex}.
+	 */
+	addr->net = dev_net(ndev);
+	addr->bound_dev_if = ndev->ifindex;
 	return 0;
 }
 
+static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr)
+{
+	addr->net = &init_net;
+	addr->bound_dev_if = 0;
+}
+
 static int addr_resolve(struct sockaddr *src_in,
 			const struct sockaddr *dst_in,
 			struct rdma_dev_addr *addr,
 			bool resolve_neigh,
+			bool resolve_by_gid_attr,
 			u32 seq)
 {
-	struct net_device *ndev;
-	struct dst_entry *dst;
+	struct dst_entry *dst = NULL;
+	unsigned int ndev_flags = 0;
+	struct rtable *rt = NULL;
 	int ret;
 
 	if (!addr->net) {
@@ -488,58 +563,55 @@ static int addr_resolve(struct sockaddr *src_in,
 		return -EINVAL;
 	}
 
-	if (src_in->sa_family == AF_INET) {
-		struct rtable *rt = NULL;
-		const struct sockaddr_in *dst_in4 =
-			(const struct sockaddr_in *)dst_in;
-
-		ret = addr4_resolve((struct sockaddr_in *)src_in,
-				    dst_in4, addr, &rt);
-		if (ret)
-			return ret;
-
-		if (resolve_neigh)
-			ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);
-
-		if (addr->bound_dev_if) {
-			ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
-		} else {
-			ndev = rt->dst.dev;
-			dev_hold(ndev);
+	rcu_read_lock();
+	if (resolve_by_gid_attr) {
+		if (!addr->sgid_attr) {
+			rcu_read_unlock();
+			pr_warn_ratelimited("%s: missing gid_attr\n", __func__);
+			return -EINVAL;
 		}
-
-		ip_rt_put(rt);
-	} else {
-		const struct sockaddr_in6 *dst_in6 =
-			(const struct sockaddr_in6 *)dst_in;
-
-		ret = addr6_resolve((struct sockaddr_in6 *)src_in,
-				    dst_in6, addr,
-				    &dst);
-		if (ret)
+		/*
+		 * If the request is for a specific gid attribute of the
+		 * rdma_dev_addr, derive net from the netdevice of the
+		 * GID attribute.
+		 */
+		ret = set_addr_netns_by_gid_rcu(addr);
+		if (ret) {
+			rcu_read_unlock();
 			return ret;
-
-		if (resolve_neigh)
-			ret = addr_resolve_neigh(dst, dst_in, addr, seq);
-
-		if (addr->bound_dev_if) {
-			ndev = dev_get_by_index(addr->net, addr->bound_dev_if);
-		} else {
-			ndev = dst->dev;
-			dev_hold(ndev);
 		}
-
-		dst_release(dst);
 	}
-
-	if (ndev) {
-		if (ndev->flags & IFF_LOOPBACK)
-			ret = rdma_translate_ip(dst_in, addr);
-		else
-			addr->bound_dev_if = ndev->ifindex;
-		dev_put(ndev);
+	if (src_in->sa_family == AF_INET) {
+		ret = addr4_resolve(src_in, dst_in, addr, &rt);
+		dst = &rt->dst;
+	} else {
+		ret = addr6_resolve(src_in, dst_in, addr, &dst);
 	}
+	if (ret) {
+		rcu_read_unlock();
+		goto done;
+	}
+	ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst);
+	rcu_read_unlock();
+
+	/*
+	 * Resolve neighbor destination address if requested and
+	 * only if src addr translation didn't fail.
+	 */
+	if (!ret && resolve_neigh)
+		ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
 
+	if (src_in->sa_family == AF_INET)
+		ip_rt_put(rt);
+	else
+		dst_release(dst);
+done:
+	/*
+	 * Clear the addr net to go back to its original state, only if it was
+	 * derived from GID attribute in this context.
+	 */
+	if (resolve_by_gid_attr)
+		rdma_addr_set_net_defaults(addr);
 	return ret;
 }
 
@@ -554,7 +626,8 @@ static void process_one_req(struct work_struct *_work)
 		src_in = (struct sockaddr *)&req->src_addr;
 		dst_in = (struct sockaddr *)&req->dst_addr;
 		req->status = addr_resolve(src_in, dst_in, req->addr,
-					   true, req->seq);
+					   true, req->resolve_by_gid_attr,
+					   req->seq);
 		if (req->status && time_after_eq(jiffies, req->timeout)) {
 			req->status = -ETIMEDOUT;
 		} else if (req->status == -ENODATA) {
@@ -586,10 +659,10 @@ static void process_one_req(struct work_struct *_work)
 }
 
 int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
-		    struct rdma_dev_addr *addr, int timeout_ms,
+		    struct rdma_dev_addr *addr, unsigned long timeout_ms,
 		    void (*callback)(int status, struct sockaddr *src_addr,
 				     struct rdma_dev_addr *addr, void *context),
-		    void *context)
+		    bool resolve_by_gid_attr, void *context)
 {
 	struct sockaddr *src_in, *dst_in;
 	struct addr_req *req;
@@ -617,10 +690,12 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
 	req->addr = addr;
 	req->callback = callback;
 	req->context = context;
+	req->resolve_by_gid_attr = resolve_by_gid_attr;
 	INIT_DELAYED_WORK(&req->work, process_one_req);
 	req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
 
-	req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
+	req->status = addr_resolve(src_in, dst_in, addr, true,
+				   req->resolve_by_gid_attr, req->seq);
 	switch (req->status) {
 	case 0:
 		req->timeout = jiffies;
@@ -641,25 +716,53 @@ err:
 }
 EXPORT_SYMBOL(rdma_resolve_ip);
 
-int rdma_resolve_ip_route(struct sockaddr *src_addr,
-			  const struct sockaddr *dst_addr,
-			  struct rdma_dev_addr *addr)
+int roce_resolve_route_from_path(struct sa_path_rec *rec,
+				 const struct ib_gid_attr *attr)
 {
-	struct sockaddr_storage ssrc_addr = {};
-	struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr;
+	union {
+		struct sockaddr     _sockaddr;
+		struct sockaddr_in  _sockaddr_in;
+		struct sockaddr_in6 _sockaddr_in6;
+	} sgid, dgid;
+	struct rdma_dev_addr dev_addr = {};
+	int ret;
 
-	if (src_addr) {
-		if (src_addr->sa_family != dst_addr->sa_family)
-			return -EINVAL;
+	if (rec->roce.route_resolved)
+		return 0;
 
-		memcpy(src_in, src_addr, rdma_addr_size(src_addr));
-	} else {
-		src_in->sa_family = dst_addr->sa_family;
-	}
+	rdma_gid2ip(&sgid._sockaddr, &rec->sgid);
+	rdma_gid2ip(&dgid._sockaddr, &rec->dgid);
+
+	if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family)
+		return -EINVAL;
+
+	if (!attr || !attr->ndev)
+		return -EINVAL;
+
+	dev_addr.net = &init_net;
+	dev_addr.sgid_attr = attr;
+
+	ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr,
+			   &dev_addr, false, true, 0);
+	if (ret)
+		return ret;
+
+	if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
+	     dev_addr.network == RDMA_NETWORK_IPV6) &&
+	    rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2)
+		return -EINVAL;
 
-	return addr_resolve(src_in, dst_addr, addr, false, 0);
+	rec->roce.route_resolved = true;
+	return 0;
 }
 
+/**
+ * rdma_addr_cancel - Cancel resolve ip request
+ * @addr:	Pointer to address structure given previously
+ *		during rdma_resolve_ip().
+ * rdma_addr_cancel() is synchronous function which cancels any pending
+ * request if there is any.
+ */
 void rdma_addr_cancel(struct rdma_dev_addr *addr)
 {
 	struct addr_req *req, *temp_req;
@@ -687,11 +790,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr)
 	 * guarentees no work is running and none will be started.
 	 */
 	cancel_delayed_work_sync(&found->work);
-
-	if (found->callback)
-		found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr,
-			      found->addr, found->context);
-
 	kfree(found);
 }
 EXPORT_SYMBOL(rdma_addr_cancel);
@@ -710,7 +808,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr,
 
 int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
 				 const union ib_gid *dgid,
-				 u8 *dmac, const struct net_device *ndev,
+				 u8 *dmac, const struct ib_gid_attr *sgid_attr,
 				 int *hoplimit)
 {
 	struct rdma_dev_addr dev_addr;
@@ -726,12 +824,12 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
 	rdma_gid2ip(&dgid_addr._sockaddr, dgid);
 
 	memset(&dev_addr, 0, sizeof(dev_addr));
-	dev_addr.bound_dev_if = ndev->ifindex;
 	dev_addr.net = &init_net;
+	dev_addr.sgid_attr = sgid_attr;
 
 	init_completion(&ctx.comp);
 	ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr,
-			      &dev_addr, 1000, resolve_cb, &ctx);
+			      &dev_addr, 1000, resolve_cb, true, &ctx);
 	if (ret)
 		return ret;
 

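For context, a condensed sketch of a caller of the reworked rdma_resolve_ip(), whose prototype now takes the timeout as unsigned long plus a resolve_by_gid_attr flag. It is modeled on rdma_addr_find_l2_eth_by_grh()/resolve_cb() in the hunk above; the wrapper and callback names here are made up for illustration.

/* Sketch modeled on rdma_addr_find_l2_eth_by_grh() above: resolve the
 * destination L2 address synchronously by waiting for the async callback.
 */
#include <linux/completion.h>
#include <net/net_namespace.h>
#include <rdma/ib_addr.h>

struct example_resolve_ctx {
	struct completion comp;
	int status;
};

static void example_resolve_cb(int status, struct sockaddr *src_addr,
			       struct rdma_dev_addr *addr, void *context)
{
	struct example_resolve_ctx *ctx = context;

	ctx->status = status;
	complete(&ctx->comp);
}

static int example_resolve_by_gid_attr(struct sockaddr *src,
					struct sockaddr *dst,
					const struct ib_gid_attr *sgid_attr)
{
	struct rdma_dev_addr dev_addr = {};
	struct example_resolve_ctx ctx;
	int ret;

	dev_addr.net = &init_net;
	dev_addr.sgid_attr = sgid_attr;	/* consumed because the flag below is true */

	init_completion(&ctx.comp);
	ret = rdma_resolve_ip(src, dst, &dev_addr, 1000 /* ms */,
			      example_resolve_cb,
			      true /* resolve_by_gid_attr */, &ctx);
	if (ret)
		return ret;

	wait_for_completion(&ctx.comp);
	return ctx.status;
}
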
+ 55 - 24
drivers/infiniband/core/cache.c

@@ -212,9 +212,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
 	u8 port_num = entry->attr.port_num;
 	struct ib_gid_table *table = rdma_gid_table(device, port_num);
 
-	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 device->name, port_num, entry->attr.index,
-		 entry->attr.gid.raw);
+	dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__,
+		port_num, entry->attr.index, entry->attr.gid.raw);
 
 	if (rdma_cap_roce_gid_table(device, port_num) &&
 	    entry->state != GID_TABLE_ENTRY_INVALID)
@@ -289,9 +288,9 @@ static void store_gid_entry(struct ib_gid_table *table,
 {
 	entry->state = GID_TABLE_ENTRY_VALID;
 
-	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 entry->attr.device->name, entry->attr.port_num,
-		 entry->attr.index, entry->attr.gid.raw);
+	dev_dbg(&entry->attr.device->dev, "%s port=%d index=%d gid %pI6\n",
+		__func__, entry->attr.port_num, entry->attr.index,
+		entry->attr.gid.raw);
 
 	lockdep_assert_held(&table->lock);
 	write_lock_irq(&table->rwlock);
@@ -320,17 +319,16 @@ static int add_roce_gid(struct ib_gid_table_entry *entry)
 	int ret;
 
 	if (!attr->ndev) {
-		pr_err("%s NULL netdev device=%s port=%d index=%d\n",
-		       __func__, attr->device->name, attr->port_num,
-		       attr->index);
+		dev_err(&attr->device->dev, "%s NULL netdev port=%d index=%d\n",
+			__func__, attr->port_num, attr->index);
 		return -EINVAL;
 	}
 	if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
 		ret = attr->device->add_gid(attr, &entry->context);
 		if (ret) {
-			pr_err("%s GID add failed device=%s port=%d index=%d\n",
-			       __func__, attr->device->name, attr->port_num,
-			       attr->index);
+			dev_err(&attr->device->dev,
+				"%s GID add failed port=%d index=%d\n",
+				__func__, attr->port_num, attr->index);
 			return ret;
 		}
 	}
@@ -353,9 +351,8 @@ static void del_gid(struct ib_device *ib_dev, u8 port,
 
 	lockdep_assert_held(&table->lock);
 
-	pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
-		 ib_dev->name, port, ix,
-		 table->data_vec[ix]->attr.gid.raw);
+	dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port,
+		ix, table->data_vec[ix]->attr.gid.raw);
 
 	write_lock_irq(&table->rwlock);
 	entry = table->data_vec[ix];
@@ -782,9 +779,9 @@ static void release_gid_table(struct ib_device *device, u8 port,
 		if (is_gid_entry_free(table->data_vec[i]))
 			continue;
 		if (kref_read(&table->data_vec[i]->kref) > 1) {
-			pr_err("GID entry ref leak for %s (index %d) ref=%d\n",
-			       device->name, i,
-			       kref_read(&table->data_vec[i]->kref));
+			dev_err(&device->dev,
+				"GID entry ref leak for index %d ref=%d\n", i,
+				kref_read(&table->data_vec[i]->kref));
 			leak = true;
 		}
 	}
@@ -1252,6 +1249,39 @@ void rdma_hold_gid_attr(const struct ib_gid_attr *attr)
 }
 EXPORT_SYMBOL(rdma_hold_gid_attr);
 
+/**
+ * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice
+ * which must be in UP state.
+ *
+ * @attr:Pointer to the GID attribute
+ *
+ * Returns pointer to netdevice if the netdevice was attached to GID and
+ * netdevice is in UP state. Caller must hold RCU lock as this API
+ * reads the netdev flags which can change while netdevice migrates to
+ * different net namespace. Returns ERR_PTR with error code otherwise.
+ *
+ */
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr)
+{
+	struct ib_gid_table_entry *entry =
+			container_of(attr, struct ib_gid_table_entry, attr);
+	struct ib_device *device = entry->attr.device;
+	struct net_device *ndev = ERR_PTR(-ENODEV);
+	u8 port_num = entry->attr.port_num;
+	struct ib_gid_table *table;
+	unsigned long flags;
+	bool valid;
+
+	table = rdma_gid_table(device, port_num);
+
+	read_lock_irqsave(&table->rwlock, flags);
+	valid = is_gid_entry_valid(table->data_vec[attr->index]);
+	if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP))
+		ndev = attr->ndev;
+	read_unlock_irqrestore(&table->rwlock, flags);
+	return ndev;
+}
+
 static int config_non_roce_gid_cache(struct ib_device *device,
 				     u8 port, int gid_tbl_len)
 {
@@ -1270,8 +1300,9 @@ static int config_non_roce_gid_cache(struct ib_device *device,
 			continue;
 		ret = device->query_gid(device, port, i, &gid_attr.gid);
 		if (ret) {
-			pr_warn("query_gid failed (%d) for %s (index %d)\n",
-				ret, device->name, i);
+			dev_warn(&device->dev,
+				 "query_gid failed (%d) for index %d\n", ret,
+				 i);
 			goto err;
 		}
 		gid_attr.index = i;
@@ -1300,8 +1331,7 @@ static void ib_cache_update(struct ib_device *device,
 
 	ret = ib_query_port(device, port, tprops);
 	if (ret) {
-		pr_warn("ib_query_port failed (%d) for %s\n",
-			ret, device->name);
+		dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret);
 		goto err;
 	}
 
@@ -1323,8 +1353,9 @@ static void ib_cache_update(struct ib_device *device,
 	for (i = 0; i < pkey_cache->table_len; ++i) {
 		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
 		if (ret) {
-			pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n",
-				ret, device->name, i);
+			dev_warn(&device->dev,
+				 "ib_query_pkey failed (%d) for index %d\n",
+				 ret, i);
 			goto err;
 		}
 	}

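A usage sketch for the new rdma_read_gid_attr_ndev_rcu() helper, mirroring set_addr_netns_by_gid_rcu() in the addr.c hunk earlier: the call and any dereference of the returned netdevice stay inside the RCU read-side section. The function name below is illustrative only.

/* Sketch: derive net namespace and ifindex from a GID attribute's netdev.
 * Mirrors set_addr_netns_by_gid_rcu() in drivers/infiniband/core/addr.c.
 */
#include <linux/netdevice.h>
#include <rdma/ib_addr.h>
#include "core_priv.h"	/* declares rdma_read_gid_attr_ndev_rcu() */

static int example_addr_from_gid(struct rdma_dev_addr *addr)
{
	struct net_device *ndev;
	int ret = 0;

	rcu_read_lock();
	ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr);
	if (IS_ERR(ndev)) {
		ret = PTR_ERR(ndev);
	} else {
		addr->net = dev_net(ndev);
		addr->bound_dev_if = ndev->ifindex;
	}
	rcu_read_unlock();
	return ret;
}
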
+ 6 - 3
drivers/infiniband/core/cm.c

@@ -3292,8 +3292,11 @@ static int cm_lap_handler(struct cm_work *work)
 	if (ret)
 		goto unlock;
 
-	cm_init_av_by_path(param->alternate_path, NULL, &cm_id_priv->alt_av,
-			   cm_id_priv);
+	ret = cm_init_av_by_path(param->alternate_path, NULL,
+				 &cm_id_priv->alt_av, cm_id_priv);
+	if (ret)
+		goto unlock;
+
 	cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
 	cm_id_priv->tid = lap_msg->hdr.tid;
 	ret = atomic_inc_and_test(&cm_id_priv->work_count);
@@ -4367,7 +4370,7 @@ static void cm_add_one(struct ib_device *ib_device)
 	cm_dev->going_down = 0;
 	cm_dev->device = device_create(&cm_class, &ib_device->dev,
 				       MKDEV(0, 0), NULL,
-				       "%s", ib_device->name);
+				       "%s", dev_name(&ib_device->dev));
 	if (IS_ERR(cm_dev->device)) {
 		kfree(cm_dev);
 		return;

+ 177 - 74
drivers/infiniband/core/cma.c

@@ -639,13 +639,21 @@ static void cma_bind_sgid_attr(struct rdma_id_private *id_priv,
 	id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr;
 }
 
-static int cma_acquire_dev(struct rdma_id_private *id_priv,
-			   const struct rdma_id_private *listen_id_priv)
+/**
+ * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute
+ * based on source ip address.
+ * @id_priv:	cm_id which should be bound to cma device
+ *
+ * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute
+ * based on source IP address. It returns 0 on success or error code otherwise.
+ * It is applicable to active and passive side cm_id.
+ */
+static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
 {
 	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
 	const struct ib_gid_attr *sgid_attr;
-	struct cma_device *cma_dev;
 	union ib_gid gid, iboe_gid, *gidp;
+	struct cma_device *cma_dev;
 	enum ib_gid_type gid_type;
 	int ret = -ENODEV;
 	u8 port;
@@ -654,41 +662,125 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
 	    id_priv->id.ps == RDMA_PS_IPOIB)
 		return -EINVAL;
 
-	mutex_lock(&lock);
 	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
 		    &iboe_gid);
 
 	memcpy(&gid, dev_addr->src_dev_addr +
-	       rdma_addr_gid_offset(dev_addr), sizeof gid);
-
-	if (listen_id_priv) {
-		cma_dev = listen_id_priv->cma_dev;
-		port = listen_id_priv->id.port_num;
-		gidp = rdma_protocol_roce(cma_dev->device, port) ?
-		       &iboe_gid : &gid;
-		gid_type = listen_id_priv->gid_type;
-		sgid_attr = cma_validate_port(cma_dev->device, port,
-					      gid_type, gidp, id_priv);
-		if (!IS_ERR(sgid_attr)) {
-			id_priv->id.port_num = port;
-			cma_bind_sgid_attr(id_priv, sgid_attr);
-			ret = 0;
-			goto out;
+	       rdma_addr_gid_offset(dev_addr), sizeof(gid));
+
+	mutex_lock(&lock);
+	list_for_each_entry(cma_dev, &dev_list, list) {
+		for (port = rdma_start_port(cma_dev->device);
+		     port <= rdma_end_port(cma_dev->device); port++) {
+			gidp = rdma_protocol_roce(cma_dev->device, port) ?
+			       &iboe_gid : &gid;
+			gid_type = cma_dev->default_gid_type[port - 1];
+			sgid_attr = cma_validate_port(cma_dev->device, port,
+						      gid_type, gidp, id_priv);
+			if (!IS_ERR(sgid_attr)) {
+				id_priv->id.port_num = port;
+				cma_bind_sgid_attr(id_priv, sgid_attr);
+				cma_attach_to_dev(id_priv, cma_dev);
+				ret = 0;
+				goto out;
+			}
 		}
 	}
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+/**
+ * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute
+ * @id_priv:		cm id to bind to cma device
+ * @listen_id_priv:	listener cm id to match against
+ * @req:		Pointer to req structure containaining incoming
+ *			request information
+ * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when
+ * rdma device matches for listen_id and incoming request. It also verifies
+ * that a GID table entry is present for the source address.
+ * Returns 0 on success, or returns error code otherwise.
+ */
+static int cma_ib_acquire_dev(struct rdma_id_private *id_priv,
+			      const struct rdma_id_private *listen_id_priv,
+			      struct cma_req_info *req)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	const struct ib_gid_attr *sgid_attr;
+	enum ib_gid_type gid_type;
+	union ib_gid gid;
+
+	if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
+	    id_priv->id.ps == RDMA_PS_IPOIB)
+		return -EINVAL;
+
+	if (rdma_protocol_roce(req->device, req->port))
+		rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+			    &gid);
+	else
+		memcpy(&gid, dev_addr->src_dev_addr +
+		       rdma_addr_gid_offset(dev_addr), sizeof(gid));
+
+	gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1];
+	sgid_attr = cma_validate_port(req->device, req->port,
+				      gid_type, &gid, id_priv);
+	if (IS_ERR(sgid_attr))
+		return PTR_ERR(sgid_attr);
+
+	id_priv->id.port_num = req->port;
+	cma_bind_sgid_attr(id_priv, sgid_attr);
+	/* Need to acquire lock to protect against reader
+	 * of cma_dev->id_list such as cma_netdev_callback() and
+	 * cma_process_remove().
+	 */
+	mutex_lock(&lock);
+	cma_attach_to_dev(id_priv, listen_id_priv->cma_dev);
+	mutex_unlock(&lock);
+	return 0;
+}
+
+static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
+			      const struct rdma_id_private *listen_id_priv)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	const struct ib_gid_attr *sgid_attr;
+	struct cma_device *cma_dev;
+	enum ib_gid_type gid_type;
+	int ret = -ENODEV;
+	union ib_gid gid;
+	u8 port;
+
+	if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
+	    id_priv->id.ps == RDMA_PS_IPOIB)
+		return -EINVAL;
+
+	memcpy(&gid, dev_addr->src_dev_addr +
+	       rdma_addr_gid_offset(dev_addr), sizeof(gid));
+
+	mutex_lock(&lock);
+
+	cma_dev = listen_id_priv->cma_dev;
+	port = listen_id_priv->id.port_num;
+	gid_type = listen_id_priv->gid_type;
+	sgid_attr = cma_validate_port(cma_dev->device, port,
+				      gid_type, &gid, id_priv);
+	if (!IS_ERR(sgid_attr)) {
+		id_priv->id.port_num = port;
+		cma_bind_sgid_attr(id_priv, sgid_attr);
+		ret = 0;
+		goto out;
+	}
 
 	list_for_each_entry(cma_dev, &dev_list, list) {
 		for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
-			if (listen_id_priv &&
-			    listen_id_priv->cma_dev == cma_dev &&
+			if (listen_id_priv->cma_dev == cma_dev &&
 			    listen_id_priv->id.port_num == port)
 				continue;
 
-			gidp = rdma_protocol_roce(cma_dev->device, port) ?
-			       &iboe_gid : &gid;
 			gid_type = cma_dev->default_gid_type[port - 1];
 			sgid_attr = cma_validate_port(cma_dev->device, port,
-						      gid_type, gidp, id_priv);
+						      gid_type, &gid, id_priv);
 			if (!IS_ERR(sgid_attr)) {
 				id_priv->id.port_num = port;
 				cma_bind_sgid_attr(id_priv, sgid_attr);
@@ -785,10 +877,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net,
 	if (!id_priv)
 		return ERR_PTR(-ENOMEM);
 
-	if (caller)
-		id_priv->res.kern_name = caller;
-	else
-		rdma_restrack_set_task(&id_priv->res, current);
+	rdma_restrack_set_task(&id_priv->res, caller);
 	id_priv->res.type = RDMA_RESTRACK_CM_ID;
 	id_priv->state = RDMA_CM_IDLE;
 	id_priv->id.context = context;
@@ -1462,17 +1551,34 @@ static bool cma_protocol_roce(const struct rdma_cm_id *id)
 	return rdma_protocol_roce(device, port_num);
 }
 
+static bool cma_is_req_ipv6_ll(const struct cma_req_info *req)
+{
+	const struct sockaddr *daddr =
+			(const struct sockaddr *)&req->listen_addr_storage;
+	const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+
+	/* Returns true if the req is for IPv6 link local */
+	return (daddr->sa_family == AF_INET6 &&
+		(ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL));
+}
+
 static bool cma_match_net_dev(const struct rdma_cm_id *id,
 			      const struct net_device *net_dev,
-			      u8 port_num)
+			      const struct cma_req_info *req)
 {
 	const struct rdma_addr *addr = &id->route.addr;
 
 	if (!net_dev)
 		/* This request is an AF_IB request */
-		return (!id->port_num || id->port_num == port_num) &&
+		return (!id->port_num || id->port_num == req->port) &&
 		       (addr->src_addr.ss_family == AF_IB);
 
+	/*
+	 * If the request is not for IPv6 link local, allow matching
+	 * request to any netdevice of the one or multiport rdma device.
+	 */
+	if (!cma_is_req_ipv6_ll(req))
+		return true;
 	/*
 	 * Net namespaces must match, and if the listner is listening
 	 * on a specific netdevice than netdevice must match as well.
@@ -1500,13 +1606,14 @@ static struct rdma_id_private *cma_find_listener(
 	hlist_for_each_entry(id_priv, &bind_list->owners, node) {
 		if (cma_match_private_data(id_priv, ib_event->private_data)) {
 			if (id_priv->id.device == cm_id->device &&
-			    cma_match_net_dev(&id_priv->id, net_dev, req->port))
+			    cma_match_net_dev(&id_priv->id, net_dev, req))
 				return id_priv;
 			list_for_each_entry(id_priv_dev,
 					    &id_priv->listen_list,
 					    listen_list) {
 				if (id_priv_dev->id.device == cm_id->device &&
-				    cma_match_net_dev(&id_priv_dev->id, net_dev, req->port))
+				    cma_match_net_dev(&id_priv_dev->id,
+						      net_dev, req))
 					return id_priv_dev;
 			}
 		}
@@ -1518,18 +1625,18 @@ static struct rdma_id_private *cma_find_listener(
 static struct rdma_id_private *
 cma_ib_id_from_event(struct ib_cm_id *cm_id,
 		     const struct ib_cm_event *ib_event,
+		     struct cma_req_info *req,
 		     struct net_device **net_dev)
 {
-	struct cma_req_info req;
 	struct rdma_bind_list *bind_list;
 	struct rdma_id_private *id_priv;
 	int err;
 
-	err = cma_save_req_info(ib_event, &req);
+	err = cma_save_req_info(ib_event, req);
 	if (err)
 		return ERR_PTR(err);
 
-	*net_dev = cma_get_net_dev(ib_event, &req);
+	*net_dev = cma_get_net_dev(ib_event, req);
 	if (IS_ERR(*net_dev)) {
 		if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
 			/* Assuming the protocol is AF_IB */
@@ -1567,17 +1674,17 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id,
 		}
 
 		if (!validate_net_dev(*net_dev,
-				 (struct sockaddr *)&req.listen_addr_storage,
-				 (struct sockaddr *)&req.src_addr_storage)) {
+				 (struct sockaddr *)&req->listen_addr_storage,
+				 (struct sockaddr *)&req->src_addr_storage)) {
 			id_priv = ERR_PTR(-EHOSTUNREACH);
 			goto err;
 		}
 	}
 
 	bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net,
-				rdma_ps_from_service_id(req.service_id),
-				cma_port_from_service_id(req.service_id));
-	id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+				rdma_ps_from_service_id(req->service_id),
+				cma_port_from_service_id(req->service_id));
+	id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev);
 err:
 	rcu_read_unlock();
 	if (IS_ERR(id_priv) && *net_dev) {
@@ -1710,8 +1817,8 @@ void rdma_destroy_id(struct rdma_cm_id *id)
 	mutex_lock(&id_priv->handler_mutex);
 	mutex_unlock(&id_priv->handler_mutex);
 
+	rdma_restrack_del(&id_priv->res);
 	if (id_priv->cma_dev) {
-		rdma_restrack_del(&id_priv->res);
 		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
 			if (id_priv->cm_id.ib)
 				ib_destroy_cm_id(id_priv->cm_id.ib);
@@ -1902,7 +2009,7 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id,
 		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
 
 	if (net_dev) {
-		rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
+		rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev);
 	} else {
 		if (!cma_protocol_roce(listen_id) &&
 		    cma_any_addr(cma_src_addr(id_priv))) {
@@ -1952,7 +2059,7 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id,
 		goto err;
 
 	if (net_dev) {
-		rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
+		rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev);
 	} else {
 		if (!cma_any_addr(cma_src_addr(id_priv))) {
 			ret = cma_translate_addr(cma_src_addr(id_priv),
@@ -1999,11 +2106,12 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
 {
 	struct rdma_id_private *listen_id, *conn_id = NULL;
 	struct rdma_cm_event event = {};
+	struct cma_req_info req = {};
 	struct net_device *net_dev;
 	u8 offset;
 	int ret;
 
-	listen_id = cma_ib_id_from_event(cm_id, ib_event, &net_dev);
+	listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev);
 	if (IS_ERR(listen_id))
 		return PTR_ERR(listen_id);
 
@@ -2036,7 +2144,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
 	}
 
 	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
-	ret = cma_acquire_dev(conn_id, listen_id);
+	ret = cma_ib_acquire_dev(conn_id, listen_id, &req);
 	if (ret)
 		goto err2;
 
@@ -2232,7 +2340,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 		goto out;
 	}
 
-	ret = cma_acquire_dev(conn_id, listen_id);
+	ret = cma_iw_acquire_dev(conn_id, listen_id);
 	if (ret) {
 		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
@@ -2354,8 +2462,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
 
 	ret = rdma_listen(id, id_priv->backlog);
 	if (ret)
-		pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n",
-			ret, cma_dev->device->name);
+		dev_warn(&cma_dev->device->dev,
+			 "RDMA CMA: cma_listen_on_dev, error %d\n", ret);
 }
 
 static void cma_listen_on_all(struct rdma_id_private *id_priv)
@@ -2402,8 +2510,8 @@ static void cma_query_handler(int status, struct sa_path_rec *path_rec,
 	queue_work(cma_wq, &work->work);
 }
 
-static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
-			      struct cma_work *work)
+static int cma_query_ib_route(struct rdma_id_private *id_priv,
+			      unsigned long timeout_ms, struct cma_work *work)
 {
 	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
 	struct sa_path_rec path_rec;
@@ -2521,7 +2629,8 @@ static void cma_init_resolve_addr_work(struct cma_work *work,
 	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
 }
 
-static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv,
+				unsigned long timeout_ms)
 {
 	struct rdma_route *route = &id_priv->id.route;
 	struct cma_work *work;
@@ -2643,7 +2752,7 @@ err:
 }
 EXPORT_SYMBOL(rdma_set_ib_path);
 
-static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv)
 {
 	struct cma_work *work;
 
@@ -2744,7 +2853,7 @@ err1:
 	return ret;
 }
 
-int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
 {
 	struct rdma_id_private *id_priv;
 	int ret;
@@ -2759,7 +2868,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
 	else if (rdma_protocol_roce(id->device, id->port_num))
 		ret = cma_resolve_iboe_route(id_priv);
 	else if (rdma_protocol_iwarp(id->device, id->port_num))
-		ret = cma_resolve_iw_route(id_priv, timeout_ms);
+		ret = cma_resolve_iw_route(id_priv);
 	else
 		ret = -ENOSYS;
 
@@ -2862,7 +2971,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 
 	memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr));
 	if (!status && !id_priv->cma_dev) {
-		status = cma_acquire_dev(id_priv, NULL);
+		status = cma_acquire_dev_by_src_ip(id_priv);
 		if (status)
 			pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n",
 					     status);
@@ -2882,13 +2991,11 @@ static void addr_handler(int status, struct sockaddr *src_addr,
 	if (id_priv->id.event_handler(&id_priv->id, &event)) {
 		cma_exch(id_priv, RDMA_CM_DESTROYING);
 		mutex_unlock(&id_priv->handler_mutex);
-		cma_deref_id(id_priv);
 		rdma_destroy_id(&id_priv->id);
 		return;
 	}
 out:
 	mutex_unlock(&id_priv->handler_mutex);
-	cma_deref_id(id_priv);
 }
 
 static int cma_resolve_loopback(struct rdma_id_private *id_priv)
@@ -2966,7 +3073,7 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 }
 
 int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
-		      const struct sockaddr *dst_addr, int timeout_ms)
+		      const struct sockaddr *dst_addr, unsigned long timeout_ms)
 {
 	struct rdma_id_private *id_priv;
 	int ret;
@@ -2985,16 +3092,16 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 		return -EINVAL;
 
 	memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
-	atomic_inc(&id_priv->refcount);
 	if (cma_any_addr(dst_addr)) {
 		ret = cma_resolve_loopback(id_priv);
 	} else {
 		if (dst_addr->sa_family == AF_IB) {
 			ret = cma_resolve_ib_addr(id_priv);
 		} else {
-			ret = rdma_resolve_ip(cma_src_addr(id_priv),
-					      dst_addr, &id->route.addr.dev_addr,
-					      timeout_ms, addr_handler, id_priv);
+			ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr,
+					      &id->route.addr.dev_addr,
+					      timeout_ms, addr_handler,
+					      false, id_priv);
 		}
 	}
 	if (ret)
@@ -3003,7 +3110,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 	return 0;
 err:
 	cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
-	cma_deref_id(id_priv);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_resolve_addr);
@@ -3414,7 +3520,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
 		if (ret)
 			goto err1;
 
-		ret = cma_acquire_dev(id_priv, NULL);
+		ret = cma_acquire_dev_by_src_ip(id_priv);
 		if (ret)
 			goto err1;
 	}
@@ -3439,10 +3545,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
 
 	return 0;
 err2:
-	if (id_priv->cma_dev) {
-		rdma_restrack_del(&id_priv->res);
+	rdma_restrack_del(&id_priv->res);
+	if (id_priv->cma_dev)
 		cma_release_dev(id_priv);
-	}
 err1:
 	cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
 	return ret;
@@ -3839,10 +3944,7 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param,
 
 	id_priv = container_of(id, struct rdma_id_private, id);
 
-	if (caller)
-		id_priv->res.kern_name = caller;
-	else
-		rdma_restrack_set_task(&id_priv->res, current);
+	rdma_restrack_set_task(&id_priv->res, caller);
 
 	if (!cma_comp(id_priv, RDMA_CM_CONNECT))
 		return -EINVAL;
@@ -4087,9 +4189,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
 	    (!ib_sa_sendonly_fullmem_support(&sa_client,
 					     id_priv->id.device,
 					     id_priv->id.port_num))) {
-		pr_warn("RDMA CM: %s port %u Unable to multicast join\n"
-			"RDMA CM: SM doesn't support Send Only Full Member option\n",
-			id_priv->id.device->name, id_priv->id.port_num);
+		dev_warn(
+			&id_priv->id.device->dev,
+			"RDMA CM: port %u Unable to multicast join: SM doesn't support Send Only Full Member option\n",
+			id_priv->id.port_num);
 		return -EOPNOTSUPP;
 	}
 

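The rdma_resolve_addr()/rdma_resolve_route() prototype change above only widens timeout_ms from int to unsigned long; a minimal ULP-side sketch (the cm_id and sockaddrs are assumed to be set up elsewhere, and resolution completes asynchronously via the cm_id's event handler) looks the same apart from the type:

/* Sketch: start address resolution, then resolve the route once the
 * RDMA_CM_EVENT_ADDR_RESOLVED event arrives on the cm_id's handler.
 */
#include <rdma/rdma_cm.h>

static int example_start_resolve(struct rdma_cm_id *id,
				 struct sockaddr *src,
				 struct sockaddr *dst)
{
	unsigned long timeout_ms = 2000;	/* arbitrary example value */

	return rdma_resolve_addr(id, src, dst, timeout_ms);
}

/* Typically called from the event handler on RDMA_CM_EVENT_ADDR_RESOLVED. */
static int example_on_addr_resolved(struct rdma_cm_id *id)
{
	return rdma_resolve_route(id, 2000 /* ms */);
}
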
+ 1 - 1
drivers/infiniband/core/cma_configfs.c

@@ -65,7 +65,7 @@ static struct cma_dev_port_group *to_dev_port_group(struct config_item *item)
 
 static bool filter_by_name(struct ib_device *ib_dev, void *cookie)
 {
-	return !strcmp(ib_dev->name, cookie);
+	return !strcmp(dev_name(&ib_dev->dev), cookie);
 }
 
 static int cma_configfs_params_get(struct config_item *item,

+ 10 - 2
drivers/infiniband/core/core_priv.h

@@ -44,7 +44,7 @@
 #include "mad_priv.h"
 
 /* Total number of ports combined across all struct ib_devices's */
-#define RDMA_MAX_PORTS 1024
+#define RDMA_MAX_PORTS 8192
 
 struct pkey_index_qp_list {
 	struct list_head    pkey_index_list;
@@ -87,6 +87,7 @@ int  ib_device_register_sysfs(struct ib_device *device,
 			      int (*port_callback)(struct ib_device *,
 						   u8, struct kobject *));
 void ib_device_unregister_sysfs(struct ib_device *device);
+int ib_device_rename(struct ib_device *ibdev, const char *name);
 
 typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
 	      struct net_device *idev, void *cookie);
@@ -338,7 +339,14 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr,
 
 int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
 				 const union ib_gid *dgid,
-				 u8 *dmac, const struct net_device *ndev,
+				 u8 *dmac, const struct ib_gid_attr *sgid_attr,
 				 int *hoplimit);
+void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+			   const struct net_device *dev);
 
+struct sa_path_rec;
+int roce_resolve_route_from_path(struct sa_path_rec *rec,
+				 const struct ib_gid_attr *attr);
+
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
 #endif /* _CORE_PRIV_H */

+ 7 - 3
drivers/infiniband/core/cq.c

@@ -112,12 +112,12 @@ static void ib_cq_poll_work(struct work_struct *work)
 				    IB_POLL_BATCH);
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
-		queue_work(ib_comp_wq, &cq->work);
+		queue_work(cq->comp_wq, &cq->work);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 {
-	queue_work(ib_comp_wq, &cq->work);
+	queue_work(cq->comp_wq, &cq->work);
 }
 
 /**
@@ -161,7 +161,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
 		goto out_destroy_cq;
 
 	cq->res.type = RDMA_RESTRACK_CQ;
-	cq->res.kern_name = caller;
+	rdma_restrack_set_task(&cq->res, caller);
 	rdma_restrack_add(&cq->res);
 
 	switch (cq->poll_ctx) {
@@ -175,9 +175,12 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 		break;
 	case IB_POLL_WORKQUEUE:
+	case IB_POLL_UNBOUND_WORKQUEUE:
 		cq->comp_handler = ib_cq_completion_workqueue;
 		INIT_WORK(&cq->work, ib_cq_poll_work);
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+		cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
+				ib_comp_wq : ib_comp_unbound_wq;
 		break;
 	default:
 		ret = -EINVAL;
@@ -213,6 +216,7 @@ void ib_free_cq(struct ib_cq *cq)
 		irq_poll_disable(&cq->iop);
 		break;
 	case IB_POLL_WORKQUEUE:
+	case IB_POLL_UNBOUND_WORKQUEUE:
 		cancel_work_sync(&cq->work);
 		break;
 	default:

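With the IB_POLL_UNBOUND_WORKQUEUE context added above, completion work is queued on the new ib_comp_unbound_wq instead of ib_comp_wq; a minimal ULP-side sketch simply selects the new poll context at CQ allocation time (the CQ depth and completion vector below are arbitrary example values):

/* Sketch: allocate a CQ polled from the unbound completion workqueue. */
#include <rdma/ib_verbs.h>

static struct ib_cq *example_alloc_unbound_cq(struct ib_device *dev,
					      void *priv)
{
	return ib_alloc_cq(dev, priv, 128 /* nr_cqe */, 0 /* comp_vector */,
			   IB_POLL_UNBOUND_WORKQUEUE);
}

Freeing with ib_free_cq() is unchanged; the switch in __ib_alloc_cq() above routes the work item to the matching workqueue.
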
+ 169 - 95
drivers/infiniband/core/device.c

@@ -61,6 +61,7 @@ struct ib_client_data {
 };
 
 struct workqueue_struct *ib_comp_wq;
+struct workqueue_struct *ib_comp_unbound_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
@@ -122,8 +123,9 @@ static int ib_device_check_mandatory(struct ib_device *device)
 
 	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
 		if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
-			pr_warn("Device %s is missing mandatory function %s\n",
-				device->name, mandatory_table[i].name);
+			dev_warn(&device->dev,
+				 "Device is missing mandatory function %s\n",
+				 mandatory_table[i].name);
 			return -EINVAL;
 		}
 	}
@@ -163,16 +165,40 @@ static struct ib_device *__ib_device_get_by_name(const char *name)
 	struct ib_device *device;
 
 	list_for_each_entry(device, &device_list, core_list)
-		if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+		if (!strcmp(name, dev_name(&device->dev)))
 			return device;
 
 	return NULL;
 }
 
-static int alloc_name(char *name)
+int ib_device_rename(struct ib_device *ibdev, const char *name)
+{
+	struct ib_device *device;
+	int ret = 0;
+
+	if (!strcmp(name, dev_name(&ibdev->dev)))
+		return ret;
+
+	mutex_lock(&device_mutex);
+	list_for_each_entry(device, &device_list, core_list) {
+		if (!strcmp(name, dev_name(&device->dev))) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+
+	ret = device_rename(&ibdev->dev, name);
+	if (ret)
+		goto out;
+	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+out:
+	mutex_unlock(&device_mutex);
+	return ret;
+}
+
+static int alloc_name(struct ib_device *ibdev, const char *name)
 {
 	unsigned long *inuse;
-	char buf[IB_DEVICE_NAME_MAX];
 	struct ib_device *device;
 	int i;
 
@@ -181,24 +207,21 @@ static int alloc_name(char *name)
 		return -ENOMEM;
 
 	list_for_each_entry(device, &device_list, core_list) {
-		if (!sscanf(device->name, name, &i))
+		char buf[IB_DEVICE_NAME_MAX];
+
+		if (sscanf(dev_name(&device->dev), name, &i) != 1)
 			continue;
 		if (i < 0 || i >= PAGE_SIZE * 8)
 			continue;
 		snprintf(buf, sizeof buf, name, i);
-		if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+		if (!strcmp(buf, dev_name(&device->dev)))
 			set_bit(i, inuse);
 	}
 
 	i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
 	free_page((unsigned long) inuse);
-	snprintf(buf, sizeof buf, name, i);
 
-	if (__ib_device_get_by_name(buf))
-		return -ENFILE;
-
-	strlcpy(name, buf, IB_DEVICE_NAME_MAX);
-	return 0;
+	return dev_set_name(&ibdev->dev, name, i);
 }
 
 static void ib_device_release(struct device *device)
@@ -221,9 +244,7 @@ static void ib_device_release(struct device *device)
 static int ib_device_uevent(struct device *device,
 			    struct kobj_uevent_env *env)
 {
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-	if (add_uevent_var(env, "NAME=%s", dev->name))
+	if (add_uevent_var(env, "NAME=%s", dev_name(device)))
 		return -ENOMEM;
 
 	/*
@@ -269,7 +290,7 @@ struct ib_device *ib_alloc_device(size_t size)
 
 	INIT_LIST_HEAD(&device->event_handler_list);
 	spin_lock_init(&device->event_handler_lock);
-	spin_lock_init(&device->client_data_lock);
+	rwlock_init(&device->client_data_lock);
 	INIT_LIST_HEAD(&device->client_data_list);
 	INIT_LIST_HEAD(&device->port_list);
 
@@ -285,6 +306,7 @@ EXPORT_SYMBOL(ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
+	WARN_ON(!list_empty(&device->client_data_list));
 	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
 		device->reg_state != IB_DEV_UNINITIALIZED);
 	rdma_restrack_clean(&device->res);
@@ -295,9 +317,8 @@ EXPORT_SYMBOL(ib_dealloc_device);
 static int add_client_context(struct ib_device *device, struct ib_client *client)
 {
 	struct ib_client_data *context;
-	unsigned long flags;
 
-	context = kmalloc(sizeof *context, GFP_KERNEL);
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
 	if (!context)
 		return -ENOMEM;
 
@@ -306,9 +327,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
 	context->going_down = false;
 
 	down_write(&lists_rwsem);
-	spin_lock_irqsave(&device->client_data_lock, flags);
+	write_lock_irq(&device->client_data_lock);
 	list_add(&context->list, &device->client_data_list);
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	write_unlock_irq(&device->client_data_lock);
 	up_write(&lists_rwsem);
 
 	return 0;
@@ -444,22 +465,8 @@ static u32 __dev_new_index(void)
 	}
 }
 
-/**
- * ib_register_device - Register an IB device with IB core
- * @device:Device to register
- *
- * Low-level drivers use ib_register_device() to register their
- * devices with the IB core.  All registered clients will receive a
- * callback for each device that is added. @device must be allocated
- * with ib_alloc_device().
- */
-int ib_register_device(struct ib_device *device,
-		       int (*port_callback)(struct ib_device *,
-					    u8, struct kobject *))
+static void setup_dma_device(struct ib_device *device)
 {
-	int ret;
-	struct ib_client *client;
-	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
 	struct device *parent = device->dev.parent;
 
 	WARN_ON_ONCE(device->dma_device);
@@ -491,56 +498,113 @@ int ib_register_device(struct ib_device *device,
 		WARN_ON_ONCE(!parent);
 		device->dma_device = parent;
 	}
+}
 
-	mutex_lock(&device_mutex);
+static void cleanup_device(struct ib_device *device)
+{
+	ib_cache_cleanup_one(device);
+	ib_cache_release_one(device);
+	kfree(device->port_pkey_list);
+	kfree(device->port_immutable);
+}
 
-	if (strchr(device->name, '%')) {
-		ret = alloc_name(device->name);
-		if (ret)
-			goto out;
-	}
+static int setup_device(struct ib_device *device)
+{
+	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+	int ret;
 
-	if (ib_device_check_mandatory(device)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	ret = ib_device_check_mandatory(device);
+	if (ret)
+		return ret;
 
 	ret = read_port_immutable(device);
 	if (ret) {
-		pr_warn("Couldn't create per port immutable data %s\n",
-			device->name);
-		goto out;
+		dev_warn(&device->dev,
+			 "Couldn't create per port immutable data\n");
+		return ret;
 	}
 
-	ret = setup_port_pkey_list(device);
+	memset(&device->attrs, 0, sizeof(device->attrs));
+	ret = device->query_device(device, &device->attrs, &uhw);
 	if (ret) {
-		pr_warn("Couldn't create per port_pkey_list\n");
-		goto out;
+		dev_warn(&device->dev,
+			 "Couldn't query the device attributes\n");
+		goto port_cleanup;
 	}
 
-	ret = ib_cache_setup_one(device);
+	ret = setup_port_pkey_list(device);
 	if (ret) {
-		pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n");
+		dev_warn(&device->dev, "Couldn't create per port_pkey_list\n");
 		goto port_cleanup;
 	}
 
-	ret = ib_device_register_rdmacg(device);
+	ret = ib_cache_setup_one(device);
 	if (ret) {
-		pr_warn("Couldn't register device with rdma cgroup\n");
-		goto cache_cleanup;
+		dev_warn(&device->dev,
+			 "Couldn't set up InfiniBand P_Key/GID cache\n");
+		goto pkey_cleanup;
+	}
+	return 0;
+
+pkey_cleanup:
+	kfree(device->port_pkey_list);
+port_cleanup:
+	kfree(device->port_immutable);
+	return ret;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device: Device to register
+ * @name: unique device name; a printf-style pattern such as "%d" is
+ *        expanded by the core to a free index
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core.  All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device, const char *name,
+		       int (*port_callback)(struct ib_device *, u8,
+					    struct kobject *))
+{
+	int ret;
+	struct ib_client *client;
+
+	setup_dma_device(device);
+
+	mutex_lock(&device_mutex);
+
+	if (strchr(name, '%')) {
+		ret = alloc_name(device, name);
+		if (ret)
+			goto out;
+	} else {
+		ret = dev_set_name(&device->dev, name);
+		if (ret)
+			goto out;
+	}
+	if (__ib_device_get_by_name(dev_name(&device->dev))) {
+		ret = -ENFILE;
+		goto out;
 	}
+	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
 
-	memset(&device->attrs, 0, sizeof(device->attrs));
-	ret = device->query_device(device, &device->attrs, &uhw);
+	ret = setup_device(device);
+	if (ret)
+		goto out;
+
+	device->index = __dev_new_index();
+
+	ret = ib_device_register_rdmacg(device);
 	if (ret) {
-		pr_warn("Couldn't query the device attributes\n");
-		goto cg_cleanup;
+		dev_warn(&device->dev,
+			 "Couldn't register device with rdma cgroup\n");
+		goto dev_cleanup;
 	}
 
 	ret = ib_device_register_sysfs(device, port_callback);
 	if (ret) {
-		pr_warn("Couldn't register device %s with driver model\n",
-			device->name);
+		dev_warn(&device->dev,
+			 "Couldn't register device with driver model\n");
 		goto cg_cleanup;
 	}
 
@@ -550,7 +614,6 @@ int ib_register_device(struct ib_device *device,
 		if (!add_client_context(device, client) && client->add)
 			client->add(device);
 
-	device->index = __dev_new_index();
 	down_write(&lists_rwsem);
 	list_add_tail(&device->core_list, &device_list);
 	up_write(&lists_rwsem);
@@ -559,11 +622,8 @@ int ib_register_device(struct ib_device *device,
 
 cg_cleanup:
 	ib_device_unregister_rdmacg(device);
-cache_cleanup:
-	ib_cache_cleanup_one(device);
-	ib_cache_release_one(device);
-port_cleanup:
-	kfree(device->port_immutable);
+dev_cleanup:
+	cleanup_device(device);
 out:
 	mutex_unlock(&device_mutex);
 	return ret;
@@ -585,21 +645,20 @@ void ib_unregister_device(struct ib_device *device)
 
 	down_write(&lists_rwsem);
 	list_del(&device->core_list);
-	spin_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+	write_lock_irq(&device->client_data_lock);
+	list_for_each_entry(context, &device->client_data_list, list)
 		context->going_down = true;
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	write_unlock_irq(&device->client_data_lock);
 	downgrade_write(&lists_rwsem);
 
-	list_for_each_entry_safe(context, tmp, &device->client_data_list,
-				 list) {
+	list_for_each_entry(context, &device->client_data_list, list) {
 		if (context->client->remove)
 			context->client->remove(device, context->data);
 	}
 	up_read(&lists_rwsem);
 
-	ib_device_unregister_rdmacg(device);
 	ib_device_unregister_sysfs(device);
+	ib_device_unregister_rdmacg(device);
 
 	mutex_unlock(&device_mutex);
 
@@ -609,10 +668,13 @@ void ib_unregister_device(struct ib_device *device)
 	kfree(device->port_pkey_list);
 
 	down_write(&lists_rwsem);
-	spin_lock_irqsave(&device->client_data_lock, flags);
-	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+	write_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list,
+				 list) {
+		list_del(&context->list);
 		kfree(context);
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	}
+	write_unlock_irqrestore(&device->client_data_lock, flags);
 	up_write(&lists_rwsem);
 
 	device->reg_state = IB_DEV_UNREGISTERED;
@@ -662,9 +724,8 @@ EXPORT_SYMBOL(ib_register_client);
  */
 void ib_unregister_client(struct ib_client *client)
 {
-	struct ib_client_data *context, *tmp;
+	struct ib_client_data *context;
 	struct ib_device *device;
-	unsigned long flags;
 
 	mutex_lock(&device_mutex);
 
@@ -676,14 +737,14 @@ void ib_unregister_client(struct ib_client *client)
 		struct ib_client_data *found_context = NULL;
 
 		down_write(&lists_rwsem);
-		spin_lock_irqsave(&device->client_data_lock, flags);
-		list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+		write_lock_irq(&device->client_data_lock);
+		list_for_each_entry(context, &device->client_data_list, list)
 			if (context->client == client) {
 				context->going_down = true;
 				found_context = context;
 				break;
 			}
-		spin_unlock_irqrestore(&device->client_data_lock, flags);
+		write_unlock_irq(&device->client_data_lock);
 		up_write(&lists_rwsem);
 
 		if (client->remove)
@@ -691,17 +752,18 @@ void ib_unregister_client(struct ib_client *client)
 					       found_context->data : NULL);
 
 		if (!found_context) {
-			pr_warn("No client context found for %s/%s\n",
-				device->name, client->name);
+			dev_warn(&device->dev,
+				 "No client context found for %s\n",
+				 client->name);
 			continue;
 		}
 
 		down_write(&lists_rwsem);
-		spin_lock_irqsave(&device->client_data_lock, flags);
+		write_lock_irq(&device->client_data_lock);
 		list_del(&found_context->list);
-		kfree(found_context);
-		spin_unlock_irqrestore(&device->client_data_lock, flags);
+		write_unlock_irq(&device->client_data_lock);
 		up_write(&lists_rwsem);
+		kfree(found_context);
 	}
 
 	mutex_unlock(&device_mutex);
@@ -722,13 +784,13 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
 	void *ret = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&device->client_data_lock, flags);
+	read_lock_irqsave(&device->client_data_lock, flags);
 	list_for_each_entry(context, &device->client_data_list, list)
 		if (context->client == client) {
 			ret = context->data;
 			break;
 		}
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	read_unlock_irqrestore(&device->client_data_lock, flags);
 
 	return ret;
 }
@@ -749,18 +811,18 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client,
 	struct ib_client_data *context;
 	unsigned long flags;
 
-	spin_lock_irqsave(&device->client_data_lock, flags);
+	write_lock_irqsave(&device->client_data_lock, flags);
 	list_for_each_entry(context, &device->client_data_list, list)
 		if (context->client == client) {
 			context->data = data;
 			goto out;
 		}
 
-	pr_warn("No client context found for %s/%s\n",
-		device->name, client->name);
+	dev_warn(&device->dev, "No client context found for %s\n",
+		 client->name);
 
 out:
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	write_unlock_irqrestore(&device->client_data_lock, flags);
 }
 EXPORT_SYMBOL(ib_set_client_data);
 
@@ -1166,10 +1228,19 @@ static int __init ib_core_init(void)
 		goto err;
 	}
 
+	ib_comp_unbound_wq =
+		alloc_workqueue("ib-comp-unb-wq",
+				WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
+				WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
+	if (!ib_comp_unbound_wq) {
+		ret = -ENOMEM;
+		goto err_comp;
+	}
+
 	ret = class_register(&ib_class);
 	if (ret) {
 		pr_warn("Couldn't create InfiniBand device class\n");
-		goto err_comp;
+		goto err_comp_unbound;
 	}
 
 	ret = rdma_nl_init();
@@ -1218,6 +1289,8 @@ err_ibnl:
 	rdma_nl_exit();
 err_sysfs:
 	class_unregister(&ib_class);
+err_comp_unbound:
+	destroy_workqueue(ib_comp_unbound_wq);
 err_comp:
 	destroy_workqueue(ib_comp_wq);
 err:
@@ -1236,6 +1309,7 @@ static void __exit ib_core_cleanup(void)
 	addr_cleanup();
 	rdma_nl_exit();
 	class_unregister(&ib_class);
+	destroy_workqueue(ib_comp_unbound_wq);
 	destroy_workqueue(ib_comp_wq);
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
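
With this rework a driver passes the device name (optionally containing a pattern that alloc_name() expands) straight into ib_register_device() instead of filling ibdev->name first, and a registered device can later be renamed with ib_device_rename(). A minimal sketch under that assumption; the driver and name pattern are placeholders:

    #include <rdma/ib_verbs.h>

    /* Register with the reworked API: the name (here a placeholder with a
     * "%d" pattern) is handed to the core, which picks a free index and
     * calls dev_set_name() itself.
     */
    static int example_probe(struct ib_device *ibdev)
    {
            return ib_register_device(ibdev, "example%d", NULL);
    }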

+ 3 - 2
drivers/infiniband/core/fmr_pool.c

@@ -213,7 +213,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
 	device = pd->device;
 	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
 	    !device->map_phys_fmr || !device->unmap_fmr) {
-		pr_info(PFX "Device %s does not support FMRs\n", device->name);
+		dev_info(&device->dev, "Device does not support FMRs\n");
 		return ERR_PTR(-ENOSYS);
 	}
 
@@ -257,7 +257,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
 	atomic_set(&pool->flush_ser, 0);
 	init_waitqueue_head(&pool->force_wait);
 
-	pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name);
+	pool->worker =
+		kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev));
 	if (IS_ERR(pool->worker)) {
 		pr_warn(PFX "couldn't start cleanup kthread worker\n");
 		ret = PTR_ERR(pool->worker);

+ 1 - 1
drivers/infiniband/core/iwcm.c

@@ -509,7 +509,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
 	cm_id->m_local_addr = cm_id->local_addr;
 	cm_id->m_remote_addr = cm_id->remote_addr;
 
-	memcpy(pm_reg_msg.dev_name, cm_id->device->name,
+	memcpy(pm_reg_msg.dev_name, dev_name(&cm_id->device->dev),
 	       sizeof(pm_reg_msg.dev_name));
 	memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname,
 	       sizeof(pm_reg_msg.if_name));

+ 43 - 37
drivers/infiniband/core/mad.c

@@ -220,33 +220,37 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 	int ret2, qpn;
 	u8 mgmt_class, vclass;
 
+	if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) ||
+	    (qp_type == IB_QPT_GSI && !rdma_cap_ib_cm(device, port_num)))
+		return ERR_PTR(-EPROTONOSUPPORT);
+
 	/* Validate parameters */
 	qpn = get_spl_qp_index(qp_type);
 	if (qpn == -1) {
-		dev_notice(&device->dev,
-			   "ib_register_mad_agent: invalid QP Type %d\n",
-			   qp_type);
+		dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n",
+				    __func__, qp_type);
 		goto error1;
 	}
 
 	if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
-		dev_notice(&device->dev,
-			   "ib_register_mad_agent: invalid RMPP Version %u\n",
-			   rmpp_version);
+		dev_dbg_ratelimited(&device->dev,
+				    "%s: invalid RMPP Version %u\n",
+				    __func__, rmpp_version);
 		goto error1;
 	}
 
 	/* Validate MAD registration request if supplied */
 	if (mad_reg_req) {
 		if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
-			dev_notice(&device->dev,
-				   "ib_register_mad_agent: invalid Class Version %u\n",
-				   mad_reg_req->mgmt_class_version);
+			dev_dbg_ratelimited(&device->dev,
+					    "%s: invalid Class Version %u\n",
+					    __func__,
+					    mad_reg_req->mgmt_class_version);
 			goto error1;
 		}
 		if (!recv_handler) {
-			dev_notice(&device->dev,
-				   "ib_register_mad_agent: no recv_handler\n");
+			dev_dbg_ratelimited(&device->dev,
+					    "%s: no recv_handler\n", __func__);
 			goto error1;
 		}
 		if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
@@ -256,9 +260,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 			 */
 			if (mad_reg_req->mgmt_class !=
 			    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: Invalid Mgmt Class 0x%x\n",
+					__func__, mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		} else if (mad_reg_req->mgmt_class == 0) {
@@ -266,8 +270,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 			 * Class 0 is reserved in IBA and is used for
 			 * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
 			 */
-			dev_notice(&device->dev,
-				   "ib_register_mad_agent: Invalid Mgmt Class 0\n");
+			dev_dbg_ratelimited(&device->dev,
+					    "%s: Invalid Mgmt Class 0\n",
+					    __func__);
 			goto error1;
 		} else if (is_vendor_class(mad_reg_req->mgmt_class)) {
 			/*
@@ -275,18 +280,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 			 * ensure supplied OUI is not zero
 			 */
 			if (!is_vendor_oui(mad_reg_req->oui)) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: No OUI specified for class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: No OUI specified for class 0x%x\n",
+					__func__,
+					mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		}
 		/* Make sure class supplied is consistent with RMPP */
 		if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
 			if (rmpp_version) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: RMPP version for non-RMPP class 0x%x\n",
+					__func__, mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		}
@@ -297,9 +303,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 					IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
 			    (mad_reg_req->mgmt_class !=
 					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: Invalid SM QP type: class 0x%x\n",
+					__func__, mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		} else {
@@ -307,9 +313,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 					IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
 			    (mad_reg_req->mgmt_class ==
 					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
-				dev_notice(&device->dev,
-					   "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n",
-					   mad_reg_req->mgmt_class);
+				dev_dbg_ratelimited(&device->dev,
+					"%s: Invalid GS QP type: class 0x%x\n",
+					__func__, mad_reg_req->mgmt_class);
 				goto error1;
 			}
 		}
@@ -324,18 +330,18 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 	/* Validate device and port */
 	port_priv = ib_get_mad_port(device, port_num);
 	if (!port_priv) {
-		dev_notice(&device->dev,
-			   "ib_register_mad_agent: Invalid port %d\n",
-			   port_num);
+		dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n",
+				    __func__, port_num);
 		ret = ERR_PTR(-ENODEV);
 		goto error1;
 	}
 
-	/* Verify the QP requested is supported.  For example, Ethernet devices
-	 * will not have QP0 */
+	/* Verify the QP requested is supported. For example, Ethernet devices
+	 * will not have QP0.
+	 */
 	if (!port_priv->qp_info[qpn].qp) {
-		dev_notice(&device->dev,
-			   "ib_register_mad_agent: QP %d not supported\n", qpn);
+		dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n",
+				    __func__, qpn);
 		ret = ERR_PTR(-EPROTONOSUPPORT);
 		goto error1;
 	}
@@ -2408,7 +2414,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
 }
 
 void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
-			  int timeout_ms)
+			  unsigned long timeout_ms)
 {
 	mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
 	wait_for_response(mad_send_wr);
@@ -3183,7 +3189,7 @@ static int ib_mad_port_open(struct ib_device *device,
 		cq_size *= 2;
 
 	port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
-			IB_POLL_WORKQUEUE);
+			IB_POLL_UNBOUND_WORKQUEUE);
 	if (IS_ERR(port_priv->cq)) {
 		dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
 		ret = PTR_ERR(port_priv->cq);
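
ib_register_mad_agent() now bails out early with -EPROTONOSUPPORT when the port lacks the capability behind the requested special QP. A small sketch of the same gate as a hypothetical caller-side check (device, port and QP type are placeholders):

    #include <rdma/ib_verbs.h>

    /* The capability gate the new code enforces: SMI agents need
     * rdma_cap_ib_smi(), GSI agents need rdma_cap_ib_cm().
     */
    static bool example_port_has_special_qp(struct ib_device *dev, u8 port_num,
                                            enum ib_qp_type qp_type)
    {
            if (qp_type == IB_QPT_SMI)
                    return rdma_cap_ib_smi(dev, port_num);
            return rdma_cap_ib_cm(dev, port_num);
    }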

+ 1 - 1
drivers/infiniband/core/mad_priv.h

@@ -221,6 +221,6 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
 void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
 
 void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
-			  int timeout_ms);
+			  unsigned long timeout_ms);
 
 #endif	/* __IB_MAD_PRIV_H__ */

+ 2 - 2
drivers/infiniband/core/netlink.c

@@ -47,9 +47,9 @@ static struct {
 	const struct rdma_nl_cbs   *cb_table;
 } rdma_nl_types[RDMA_NL_NUM_CLIENTS];
 
-int rdma_nl_chk_listeners(unsigned int group)
+bool rdma_nl_chk_listeners(unsigned int group)
 {
-	return (netlink_has_listeners(nls, group)) ? 0 : -1;
+	return netlink_has_listeners(nls, group);
 }
 EXPORT_SYMBOL(rdma_nl_chk_listeners);
 

+ 36 - 1
drivers/infiniband/core/nldev.c

@@ -179,7 +179,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device)
 {
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
 		return -EMSGSIZE;
-	if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name))
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME,
+			   dev_name(&device->dev)))
 		return -EMSGSIZE;
 
 	return 0;
@@ -645,6 +646,36 @@ err:
 	return err;
 }
 
+static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+			  extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) {
+		char name[IB_DEVICE_NAME_MAX] = {};
+
+		nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+			    IB_DEVICE_NAME_MAX);
+		err = ib_device_rename(device, name);
+	}
+
+	put_device(&device->dev);
+	return err;
+}
+
 static int _nldev_get_dumpit(struct ib_device *device,
 			     struct sk_buff *skb,
 			     struct netlink_callback *cb,
@@ -1077,6 +1108,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_get_doit,
 		.dump = nldev_get_dumpit,
 	},
+	[RDMA_NLDEV_CMD_SET] = {
+		.doit = nldev_set_doit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 	[RDMA_NLDEV_CMD_PORT_GET] = {
 		.doit = nldev_port_get_doit,
 		.dump = nldev_port_get_dumpit,
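
RDMA_NLDEV_CMD_SET is an admin-only write command: given RDMA_NLDEV_ATTR_DEV_INDEX plus RDMA_NLDEV_ATTR_DEV_NAME it renames the device through ib_device_rename() above. From user space this is the path behind device renaming in the iproute2 rdma tool (assuming a version with rename support), e.g. `rdma dev set mlx5_0 name rdma0`, with illustrative device names.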

+ 13 - 43
drivers/infiniband/core/rdma_core.c

@@ -794,44 +794,6 @@ void uverbs_close_fd(struct file *f)
 	uverbs_uobject_put(uobj);
 }
 
-static void ufile_disassociate_ucontext(struct ib_ucontext *ibcontext)
-{
-	struct ib_device *ib_dev = ibcontext->device;
-	struct task_struct *owning_process  = NULL;
-	struct mm_struct   *owning_mm       = NULL;
-
-	owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
-	if (!owning_process)
-		return;
-
-	owning_mm = get_task_mm(owning_process);
-	if (!owning_mm) {
-		pr_info("no mm, disassociate ucontext is pending task termination\n");
-		while (1) {
-			put_task_struct(owning_process);
-			usleep_range(1000, 2000);
-			owning_process = get_pid_task(ibcontext->tgid,
-						      PIDTYPE_PID);
-			if (!owning_process ||
-			    owning_process->state == TASK_DEAD) {
-				pr_info("disassociate ucontext done, task was terminated\n");
-				/* in case task was dead need to release the
-				 * task struct.
-				 */
-				if (owning_process)
-					put_task_struct(owning_process);
-				return;
-			}
-		}
-	}
-
-	down_write(&owning_mm->mmap_sem);
-	ib_dev->disassociate_ucontext(ibcontext);
-	up_write(&owning_mm->mmap_sem);
-	mmput(owning_mm);
-	put_task_struct(owning_process);
-}
-
 /*
  * Drop the ucontext off the ufile and completely disconnect it from the
  * ib_device
@@ -840,20 +802,28 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
 				   enum rdma_remove_reason reason)
 {
 	struct ib_ucontext *ucontext = ufile->ucontext;
+	struct ib_device *ib_dev = ucontext->device;
 	int ret;
 
-	if (reason == RDMA_REMOVE_DRIVER_REMOVE)
-		ufile_disassociate_ucontext(ucontext);
+	/*
+	 * If we are closing the FD then the user mmap VMAs must have
+	 * already been destroyed as they hold on to the filep, otherwise
+	 * they need to be zap'd.
+	 */
+	if (reason == RDMA_REMOVE_DRIVER_REMOVE) {
+		uverbs_user_mmap_disassociate(ufile);
+		if (ib_dev->disassociate_ucontext)
+			ib_dev->disassociate_ucontext(ucontext);
+	}
 
-	put_pid(ucontext->tgid);
-	ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device,
+	ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev,
 			   RDMACG_RESOURCE_HCA_HANDLE);
 
 	/*
 	 * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove
 	 * the error return.
 	 */
-	ret = ucontext->device->dealloc_ucontext(ucontext);
+	ret = ib_dev->dealloc_ucontext(ucontext);
 	WARN_ON(ret);
 
 	ufile->ucontext = NULL;

+ 1 - 0
drivers/infiniband/core/rdma_core.h

@@ -160,5 +160,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi);
 void uverbs_destroy_api(struct uverbs_api *uapi);
 void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
 			      unsigned int num_attrs);
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
 
 #endif /* RDMA_CORE_H */

+ 24 - 6
drivers/infiniband/core/restrack.c

@@ -50,8 +50,7 @@ void rdma_restrack_clean(struct rdma_restrack_root *res)
 
 	dev = container_of(res, struct ib_device, res);
 	pr_err("restrack: %s", CUT_HERE);
-	pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n",
-	       dev->name);
+	dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
 	hash_for_each(res->hash, bkt, e, node) {
 		if (rdma_is_kernel_res(e)) {
 			owner = e->kern_name;
@@ -156,6 +155,21 @@ static bool res_is_user(struct rdma_restrack_entry *res)
 	}
 }
 
+void rdma_restrack_set_task(struct rdma_restrack_entry *res,
+			    const char *caller)
+{
+	if (caller) {
+		res->kern_name = caller;
+		return;
+	}
+
+	if (res->task)
+		put_task_struct(res->task);
+	get_task_struct(current);
+	res->task = current;
+}
+EXPORT_SYMBOL(rdma_restrack_set_task);
+
 void rdma_restrack_add(struct rdma_restrack_entry *res)
 {
 	struct ib_device *dev = res_to_dev(res);
@@ -168,7 +182,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res)
 
 	if (res_is_user(res)) {
 		if (!res->task)
-			rdma_restrack_set_task(res, current);
+			rdma_restrack_set_task(res, NULL);
 		res->kern_name = NULL;
 	} else {
 		set_kern_name(res);
@@ -209,7 +223,7 @@ void rdma_restrack_del(struct rdma_restrack_entry *res)
 	struct ib_device *dev;
 
 	if (!res->valid)
-		return;
+		goto out;
 
 	dev = res_to_dev(res);
 	if (!dev)
@@ -222,8 +236,12 @@ void rdma_restrack_del(struct rdma_restrack_entry *res)
 	down_write(&dev->res.rwsem);
 	hash_del(&res->node);
 	res->valid = false;
-	if (res->task)
-		put_task_struct(res->task);
 	up_write(&dev->res.rwsem);
+
+out:
+	if (res->task) {
+		put_task_struct(res->task);
+		res->task = NULL;
+	}
 }
 EXPORT_SYMBOL(rdma_restrack_del);
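
rdma_restrack_set_task() now covers both ownership cases: a non-NULL caller string marks a kernel-owned resource, while a NULL caller pins current as the owning task. A minimal sketch of the kernel-side use, mirroring the __ib_alloc_cq() hunk earlier in this series (the entry and caller are placeholders):

    #include <rdma/restrack.h>

    /* Track a kernel-owned resource: a non-NULL caller string is recorded
     * as kern_name; a NULL caller would pin current as the owning task.
     */
    static void example_track_kernel_cq(struct rdma_restrack_entry *res,
                                        const char *caller)
    {
            res->type = RDMA_RESTRACK_CQ;
            rdma_restrack_set_task(res, caller);
            rdma_restrack_add(res);
    }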

+ 3 - 5
drivers/infiniband/core/sa.h

@@ -49,16 +49,14 @@ static inline void ib_sa_client_put(struct ib_sa_client *client)
 }
 
 int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
-			     struct ib_device *device, u8 port_num,
-			     u8 method,
+			     struct ib_device *device, u8 port_num, u8 method,
 			     struct ib_sa_mcmember_rec *rec,
 			     ib_sa_comp_mask comp_mask,
-			     int timeout_ms, gfp_t gfp_mask,
+			     unsigned long timeout_ms, gfp_t gfp_mask,
 			     void (*callback)(int status,
 					      struct ib_sa_mcmember_rec *resp,
 					      void *context),
-			     void *context,
-			     struct ib_sa_query **sa_query);
+			     void *context, struct ib_sa_query **sa_query);
 
 int mcast_init(void);
 void mcast_cleanup(void);

+ 11 - 59
drivers/infiniband/core/sa_query.c

@@ -761,7 +761,7 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
 
 	/* Construct the family header first */
 	header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
-	memcpy(header->device_name, query->port->agent->device->name,
+	memcpy(header->device_name, dev_name(&query->port->agent->device->dev),
 	       LS_DEVICE_NAME_MAX);
 	header->port_num = query->port->port_num;
 
@@ -835,7 +835,6 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
 	struct sk_buff *skb = NULL;
 	struct nlmsghdr *nlh;
 	void *data;
-	int ret = 0;
 	struct ib_sa_mad *mad;
 	int len;
 
@@ -862,13 +861,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
 	/* Repair the nlmsg header length */
 	nlmsg_end(skb, nlh);
 
-	ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask);
-	if (!ret)
-		ret = len;
-	else
-		ret = 0;
-
-	return ret;
+	return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask);
 }
 
 static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask)
@@ -891,14 +884,12 @@ static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask)
 	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
 
 	ret = ib_nl_send_msg(query, gfp_mask);
-	if (ret <= 0) {
+	if (ret) {
 		ret = -EIO;
 		/* Remove the request */
 		spin_lock_irqsave(&ib_nl_request_lock, flags);
 		list_del(&query->list);
 		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
-	} else {
-		ret = 0;
 	}
 
 	return ret;
@@ -1227,46 +1218,6 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
 	return src_path_mask;
 }
 
-static int roce_resolve_route_from_path(struct sa_path_rec *rec,
-					const struct ib_gid_attr *attr)
-{
-	struct rdma_dev_addr dev_addr = {};
-	union {
-		struct sockaddr     _sockaddr;
-		struct sockaddr_in  _sockaddr_in;
-		struct sockaddr_in6 _sockaddr_in6;
-	} sgid_addr, dgid_addr;
-	int ret;
-
-	if (rec->roce.route_resolved)
-		return 0;
-	if (!attr || !attr->ndev)
-		return -EINVAL;
-
-	dev_addr.bound_dev_if = attr->ndev->ifindex;
-	/* TODO: Use net from the ib_gid_attr once it is added to it,
-	 * until than, limit itself to init_net.
-	 */
-	dev_addr.net = &init_net;
-
-	rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid);
-	rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid);
-
-	/* validate the route */
-	ret = rdma_resolve_ip_route(&sgid_addr._sockaddr,
-				    &dgid_addr._sockaddr, &dev_addr);
-	if (ret)
-		return ret;
-
-	if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
-	     dev_addr.network == RDMA_NETWORK_IPV6) &&
-	    rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2)
-		return -EINVAL;
-
-	rec->roce.route_resolved = true;
-	return 0;
-}
-
 static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num,
 				   struct sa_path_rec *rec,
 				   struct rdma_ah_attr *ah_attr,
@@ -1409,7 +1360,8 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent)
 	spin_unlock_irqrestore(&tid_lock, flags);
 }
 
-static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
+		    gfp_t gfp_mask)
 {
 	bool preload = gfpflags_allow_blocking(gfp_mask);
 	unsigned long flags;
@@ -1433,7 +1385,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
 
 	if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) &&
 	    (!(query->flags & IB_SA_QUERY_OPA))) {
-		if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) {
+		if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) {
 			if (!ib_nl_make_request(query, gfp_mask))
 				return id;
 		}
@@ -1599,7 +1551,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 		       struct ib_device *device, u8 port_num,
 		       struct sa_path_rec *rec,
 		       ib_sa_comp_mask comp_mask,
-		       int timeout_ms, gfp_t gfp_mask,
+		       unsigned long timeout_ms, gfp_t gfp_mask,
 		       void (*callback)(int status,
 					struct sa_path_rec *resp,
 					void *context),
@@ -1753,7 +1705,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
 			    struct ib_device *device, u8 port_num, u8 method,
 			    struct ib_sa_service_rec *rec,
 			    ib_sa_comp_mask comp_mask,
-			    int timeout_ms, gfp_t gfp_mask,
+			    unsigned long timeout_ms, gfp_t gfp_mask,
 			    void (*callback)(int status,
 					     struct ib_sa_service_rec *resp,
 					     void *context),
@@ -1850,7 +1802,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
 			     u8 method,
 			     struct ib_sa_mcmember_rec *rec,
 			     ib_sa_comp_mask comp_mask,
-			     int timeout_ms, gfp_t gfp_mask,
+			     unsigned long timeout_ms, gfp_t gfp_mask,
 			     void (*callback)(int status,
 					      struct ib_sa_mcmember_rec *resp,
 					      void *context),
@@ -1941,7 +1893,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
 			      struct ib_device *device, u8 port_num,
 			      struct ib_sa_guidinfo_rec *rec,
 			      ib_sa_comp_mask comp_mask, u8 method,
-			      int timeout_ms, gfp_t gfp_mask,
+			      unsigned long timeout_ms, gfp_t gfp_mask,
 			      void (*callback)(int status,
 					       struct ib_sa_guidinfo_rec *resp,
 					       void *context),
@@ -2108,7 +2060,7 @@ static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query)
 }
 
 static int ib_sa_classport_info_rec_query(struct ib_sa_port *port,
-					  int timeout_ms,
+					  unsigned long timeout_ms,
 					  void (*callback)(void *context),
 					  void *context,
 					  struct ib_sa_query **sa_query)

+ 3 - 4
drivers/infiniband/core/security.c

@@ -685,9 +685,8 @@ static int ib_mad_agent_security_change(struct notifier_block *nb,
 	if (event != LSM_POLICY_CHANGE)
 		return NOTIFY_DONE;
 
-	ag->smp_allowed = !security_ib_endport_manage_subnet(ag->security,
-							     ag->device->name,
-							     ag->port_num);
+	ag->smp_allowed = !security_ib_endport_manage_subnet(
+		ag->security, dev_name(&ag->device->dev), ag->port_num);
 
 	return NOTIFY_OK;
 }
@@ -708,7 +707,7 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
 		return 0;
 
 	ret = security_ib_endport_manage_subnet(agent->security,
-						agent->device->name,
+						dev_name(&agent->device->dev),
 						agent->port_num);
 	if (ret)
 		return ret;

+ 47 - 54
drivers/infiniband/core/sysfs.c

@@ -512,7 +512,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
 	ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data,
 			40 + offset / 8, sizeof(data));
 	if (ret < 0)
-		return sprintf(buf, "N/A (no PMA)\n");
+		return ret;
 
 	switch (width) {
 	case 4:
@@ -1036,7 +1036,7 @@ static int add_port(struct ib_device *device, int port_num,
 	p->port_num   = port_num;
 
 	ret = kobject_init_and_add(&p->kobj, &port_type,
-				   device->ports_parent,
+				   device->ports_kobj,
 				   "%d", port_num);
 	if (ret) {
 		kfree(p);
@@ -1057,10 +1057,12 @@ static int add_port(struct ib_device *device, int port_num,
 		goto err_put;
 	}
 
-	p->pma_table = get_counter_table(device, port_num);
-	ret = sysfs_create_group(&p->kobj, p->pma_table);
-	if (ret)
-		goto err_put_gid_attrs;
+	if (device->process_mad) {
+		p->pma_table = get_counter_table(device, port_num);
+		ret = sysfs_create_group(&p->kobj, p->pma_table);
+		if (ret)
+			goto err_put_gid_attrs;
+	}
 
 	p->gid_group.name  = "gids";
 	p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
@@ -1118,9 +1120,9 @@ static int add_port(struct ib_device *device, int port_num,
 	}
 
 	/*
-	 * If port == 0, it means we have only one port and the parent
-	 * device, not this port device, should be the holder of the
-	 * hw_counters
+	 * If port == 0, it means hw_counters are per device and not per
+	 * port, so the device, not this port, should be their holder.
+	 * Therefore skip the per-port counter initialization.
 	 */
 	if (device->alloc_hw_stats && port_num)
 		setup_hw_stats(device, p, port_num);
@@ -1173,7 +1175,8 @@ err_free_gid:
 	p->gid_group.attrs = NULL;
 
 err_remove_pma:
-	sysfs_remove_group(&p->kobj, p->pma_table);
+	if (p->pma_table)
+		sysfs_remove_group(&p->kobj, p->pma_table);
 
 err_put_gid_attrs:
 	kobject_put(&p->gid_attr_group->kobj);
@@ -1183,7 +1186,7 @@ err_put:
 	return ret;
 }
 
-static ssize_t show_node_type(struct device *device,
+static ssize_t node_type_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
@@ -1198,8 +1201,9 @@ static ssize_t show_node_type(struct device *device,
 	default:		  return sprintf(buf, "%d: <unknown>\n", dev->node_type);
 	}
 }
+static DEVICE_ATTR_RO(node_type);
 
-static ssize_t show_sys_image_guid(struct device *device,
+static ssize_t sys_image_guid_show(struct device *device,
 				   struct device_attribute *dev_attr, char *buf)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
@@ -1210,8 +1214,9 @@ static ssize_t show_sys_image_guid(struct device *device,
 		       be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]),
 		       be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3]));
 }
+static DEVICE_ATTR_RO(sys_image_guid);
 
-static ssize_t show_node_guid(struct device *device,
+static ssize_t node_guid_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
@@ -1222,8 +1227,9 @@ static ssize_t show_node_guid(struct device *device,
 		       be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
 		       be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
 }
+static DEVICE_ATTR_RO(node_guid);
 
-static ssize_t show_node_desc(struct device *device,
+static ssize_t node_desc_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
@@ -1231,9 +1237,9 @@ static ssize_t show_node_desc(struct device *device,
 	return sprintf(buf, "%.64s\n", dev->node_desc);
 }
 
-static ssize_t set_node_desc(struct device *device,
-			     struct device_attribute *attr,
-			     const char *buf, size_t count)
+static ssize_t node_desc_store(struct device *device,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 	struct ib_device_modify desc = {};
@@ -1249,8 +1255,9 @@ static ssize_t set_node_desc(struct device *device,
 
 	return count;
 }
+static DEVICE_ATTR_RW(node_desc);
 
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr,
 			   char *buf)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
@@ -1259,19 +1266,19 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
 	strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX);
 	return strlen(buf);
 }
+static DEVICE_ATTR_RO(fw_ver);
+
+static struct attribute *ib_dev_attrs[] = {
+	&dev_attr_node_type.attr,
+	&dev_attr_node_guid.attr,
+	&dev_attr_sys_image_guid.attr,
+	&dev_attr_fw_ver.attr,
+	&dev_attr_node_desc.attr,
+	NULL,
+};
 
-static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
-static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
-static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
-static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
-
-static struct device_attribute *ib_class_attributes[] = {
-	&dev_attr_node_type,
-	&dev_attr_sys_image_guid,
-	&dev_attr_node_guid,
-	&dev_attr_node_desc,
-	&dev_attr_fw_ver,
+static const struct attribute_group dev_attr_group = {
+	.attrs = ib_dev_attrs,
 };
 
 static void free_port_list_attributes(struct ib_device *device)
@@ -1285,7 +1292,9 @@ static void free_port_list_attributes(struct ib_device *device)
 			kfree(port->hw_stats);
 			free_hsag(&port->kobj, port->hw_stats_ag);
 		}
-		sysfs_remove_group(p, port->pma_table);
+
+		if (port->pma_table)
+			sysfs_remove_group(p, port->pma_table);
 		sysfs_remove_group(p, &port->pkey_group);
 		sysfs_remove_group(p, &port->gid_group);
 		sysfs_remove_group(&port->gid_attr_group->kobj,
@@ -1296,7 +1305,7 @@ static void free_port_list_attributes(struct ib_device *device)
 		kobject_put(p);
 	}
 
-	kobject_put(device->ports_parent);
+	kobject_put(device->ports_kobj);
 }
 
 int ib_device_register_sysfs(struct ib_device *device,
@@ -1307,23 +1316,15 @@ int ib_device_register_sysfs(struct ib_device *device,
 	int ret;
 	int i;
 
-	ret = dev_set_name(class_dev, "%s", device->name);
-	if (ret)
-		return ret;
+	device->groups[0] = &dev_attr_group;
+	class_dev->groups = device->groups;
 
 	ret = device_add(class_dev);
 	if (ret)
 		goto err;
 
-	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
-		ret = device_create_file(class_dev, ib_class_attributes[i]);
-		if (ret)
-			goto err_unregister;
-	}
-
-	device->ports_parent = kobject_create_and_add("ports",
-						      &class_dev->kobj);
-	if (!device->ports_parent) {
+	device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj);
+	if (!device->ports_kobj) {
 		ret = -ENOMEM;
 		goto err_put;
 	}
@@ -1347,20 +1348,15 @@ int ib_device_register_sysfs(struct ib_device *device,
 
 err_put:
 	free_port_list_attributes(device);
-
-err_unregister:
 	device_del(class_dev);
-
 err:
 	return ret;
 }
 
 void ib_device_unregister_sysfs(struct ib_device *device)
 {
-	int i;
-
-	/* Hold kobject until ib_dealloc_device() */
-	kobject_get(&device->dev.kobj);
+	/* Hold device until ib_dealloc_device() */
+	get_device(&device->dev);
 
 	free_port_list_attributes(device);
 
@@ -1369,8 +1365,5 @@ void ib_device_unregister_sysfs(struct ib_device *device)
 		free_hsag(&device->dev.kobj, device->hw_stats_ag);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
-		device_remove_file(&device->dev, ib_class_attributes[i]);
-
 	device_unregister(&device->dev);
 }
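
The conversion above follows the standard driver-core pattern: declare attributes with DEVICE_ATTR_*(), collect them in an attribute_group, and hang the group off the device before device_add(), so the sysfs files appear and disappear atomically with the device and the manual device_create_file()/device_remove_file() loops go away. A generic sketch of that pattern with placeholder names:

    #include <linux/device.h>

    static ssize_t example_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
    {
            return sprintf(buf, "example\n");
    }
    static DEVICE_ATTR_RO(example);

    static struct attribute *example_attrs[] = {
            &dev_attr_example.attr,
            NULL,
    };

    static const struct attribute_group example_group = {
            .attrs = example_attrs,
    };

    static const struct attribute_group *example_groups[] = {
            &example_group,
            NULL,
    };
    /* assign dev->groups = example_groups before calling device_add(dev) */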

+ 66 - 59
drivers/infiniband/core/umem.c

@@ -85,7 +85,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	struct page **page_list;
 	struct vm_area_struct **vma_list;
 	unsigned long lock_limit;
+	unsigned long new_pinned;
 	unsigned long cur_base;
+	struct mm_struct *mm;
 	unsigned long npages;
 	int ret;
 	int i;
@@ -107,25 +109,32 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	if (!can_do_mlock())
 		return ERR_PTR(-EPERM);
 
-	umem = kzalloc(sizeof *umem, GFP_KERNEL);
-	if (!umem)
-		return ERR_PTR(-ENOMEM);
+	if (access & IB_ACCESS_ON_DEMAND) {
+		umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
+		if (!umem)
+			return ERR_PTR(-ENOMEM);
+		umem->is_odp = 1;
+	} else {
+		umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+		if (!umem)
+			return ERR_PTR(-ENOMEM);
+	}
 
 	umem->context    = context;
 	umem->length     = size;
 	umem->address    = addr;
 	umem->page_shift = PAGE_SHIFT;
 	umem->writable   = ib_access_writable(access);
+	umem->owning_mm = mm = current->mm;
+	mmgrab(mm);
 
 	if (access & IB_ACCESS_ON_DEMAND) {
-		ret = ib_umem_odp_get(context, umem, access);
+		ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
 		if (ret)
 			goto umem_kfree;
 		return umem;
 	}
 
-	umem->odp_data = NULL;
-
 	/* We assume the memory is from hugetlb until proved otherwise */
 	umem->hugetlb   = 1;
 
@@ -144,25 +153,25 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 		umem->hugetlb = 0;
 
 	npages = ib_umem_num_pages(umem);
+	if (npages == 0 || npages > UINT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-	down_write(&current->mm->mmap_sem);
-	current->mm->pinned_vm += npages;
-	if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) {
-		up_write(&current->mm->mmap_sem);
+	down_write(&mm->mmap_sem);
+	if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) ||
+	    (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) {
+		up_write(&mm->mmap_sem);
 		ret = -ENOMEM;
-		goto vma;
+		goto out;
 	}
-	up_write(&current->mm->mmap_sem);
+	mm->pinned_vm = new_pinned;
+	up_write(&mm->mmap_sem);
 
 	cur_base = addr & PAGE_MASK;
 
-	if (npages == 0 || npages > UINT_MAX) {
-		ret = -EINVAL;
-		goto vma;
-	}
-
 	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
 	if (ret)
 		goto vma;
@@ -172,14 +181,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
 	sg_list_start = umem->sg_head.sgl;
 
-	down_read(&current->mm->mmap_sem);
 	while (npages) {
+		down_read(&mm->mmap_sem);
 		ret = get_user_pages_longterm(cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
 				     gup_flags, page_list, vma_list);
 		if (ret < 0) {
-			up_read(&current->mm->mmap_sem);
+			up_read(&mm->mmap_sem);
 			goto umem_release;
 		}
 
@@ -187,17 +196,20 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 		cur_base += ret * PAGE_SIZE;
 		npages   -= ret;
 
+		/* Continue to hold the mmap_sem as vma_list access
+		 * needs to be protected.
+		 */
 		for_each_sg(sg_list_start, sg, ret, i) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
 
 			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
+		up_read(&mm->mmap_sem);
 
 		/* preparing for next loop */
 		sg_list_start = sg;
 	}
-	up_read(&current->mm->mmap_sem);
 
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 				  umem->sg_head.sgl,
@@ -216,29 +228,40 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 umem_release:
 	__ib_umem_release(context->device, umem, 0);
 vma:
-	down_write(&current->mm->mmap_sem);
-	current->mm->pinned_vm -= ib_umem_num_pages(umem);
-	up_write(&current->mm->mmap_sem);
+	down_write(&mm->mmap_sem);
+	mm->pinned_vm -= ib_umem_num_pages(umem);
+	up_write(&mm->mmap_sem);
 out:
 	if (vma_list)
 		free_page((unsigned long) vma_list);
 	free_page((unsigned long) page_list);
 umem_kfree:
-	if (ret)
+	if (ret) {
+		mmdrop(umem->owning_mm);
 		kfree(umem);
+	}
 	return ret ? ERR_PTR(ret) : umem;
 }
 EXPORT_SYMBOL(ib_umem_get);
 
-static void ib_umem_account(struct work_struct *work)
+static void __ib_umem_release_tail(struct ib_umem *umem)
+{
+	mmdrop(umem->owning_mm);
+	if (umem->is_odp)
+		kfree(to_ib_umem_odp(umem));
+	else
+		kfree(umem);
+}
+
+static void ib_umem_release_defer(struct work_struct *work)
 {
 	struct ib_umem *umem = container_of(work, struct ib_umem, work);
 
-	down_write(&umem->mm->mmap_sem);
-	umem->mm->pinned_vm -= umem->diff;
-	up_write(&umem->mm->mmap_sem);
-	mmput(umem->mm);
-	kfree(umem);
+	down_write(&umem->owning_mm->mmap_sem);
+	umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
+	up_write(&umem->owning_mm->mmap_sem);
+
+	__ib_umem_release_tail(umem);
 }
 
 /**
@@ -248,52 +271,36 @@ static void ib_umem_account(struct work_struct *work)
 void ib_umem_release(struct ib_umem *umem)
 {
 	struct ib_ucontext *context = umem->context;
-	struct mm_struct *mm;
-	struct task_struct *task;
-	unsigned long diff;
 
-	if (umem->odp_data) {
-		ib_umem_odp_release(umem);
+	if (umem->is_odp) {
+		ib_umem_odp_release(to_ib_umem_odp(umem));
+		__ib_umem_release_tail(umem);
 		return;
 	}
 
 	__ib_umem_release(umem->context->device, umem, 1);
 
-	task = get_pid_task(umem->context->tgid, PIDTYPE_PID);
-	if (!task)
-		goto out;
-	mm = get_task_mm(task);
-	put_task_struct(task);
-	if (!mm)
-		goto out;
-
-	diff = ib_umem_num_pages(umem);
-
 	/*
 	 * We may be called with the mm's mmap_sem already held.  This
 	 * can happen when a userspace munmap() is the call that drops
 	 * the last reference to our file and calls our release
 	 * method.  If there are memory regions to destroy, we'll end
 	 * up here and not be able to take the mmap_sem.  In that case
-	 * we defer the vm_locked accounting to the system workqueue.
+	 * we defer the vm_locked accounting a workqueue.
 	 */
 	if (context->closing) {
-		if (!down_write_trylock(&mm->mmap_sem)) {
-			INIT_WORK(&umem->work, ib_umem_account);
-			umem->mm   = mm;
-			umem->diff = diff;
-
+		if (!down_write_trylock(&umem->owning_mm->mmap_sem)) {
+			INIT_WORK(&umem->work, ib_umem_release_defer);
 			queue_work(ib_wq, &umem->work);
 			return;
 		}
-	} else
-		down_write(&mm->mmap_sem);
+	} else {
+		down_write(&umem->owning_mm->mmap_sem);
+	}
+	umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
+	up_write(&umem->owning_mm->mmap_sem);
 
-	mm->pinned_vm -= diff;
-	up_write(&mm->mmap_sem);
-	mmput(mm);
-out:
-	kfree(umem);
+	__ib_umem_release_tail(umem);
 }
 EXPORT_SYMBOL(ib_umem_release);
 
@@ -303,7 +310,7 @@ int ib_umem_page_count(struct ib_umem *umem)
 	int n;
 	struct scatterlist *sg;
 
-	if (umem->odp_data)
+	if (umem->is_odp)
 		return ib_umem_num_pages(umem);
 
 	n = 0;
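
The pinning path now charges mm->pinned_vm with check_add_overflow() against RLIMIT_MEMLOCK instead of incrementing first and checking afterwards, and it operates on the saved umem->owning_mm rather than current->mm. A standalone sketch of that accounting step (function name and error handling are illustrative):

    #include <linux/capability.h>
    #include <linux/mm.h>
    #include <linux/overflow.h>
    #include <linux/sched/signal.h>

    /* Charge npages against RLIMIT_MEMLOCK without the old
     * increment-then-check window; fails instead of overflowing.
     */
    static int example_charge_pinned_pages(struct mm_struct *mm,
                                           unsigned long npages)
    {
            unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
            unsigned long new_pinned;

            down_write(&mm->mmap_sem);
            if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) ||
                (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) {
                    up_write(&mm->mmap_sem);
                    return -ENOMEM;
            }
            mm->pinned_vm = new_pinned;
            up_write(&mm->mmap_sem);
            return 0;
    }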

+ 286 - 335
drivers/infiniband/core/umem_odp.c

@@ -58,7 +58,7 @@ static u64 node_start(struct umem_odp_node *n)
 	struct ib_umem_odp *umem_odp =
 			container_of(n, struct ib_umem_odp, interval_tree);
 
-	return ib_umem_start(umem_odp->umem);
+	return ib_umem_start(&umem_odp->umem);
 }
 
 /* Note that the representation of the intervals in the interval tree
@@ -71,140 +71,86 @@ static u64 node_last(struct umem_odp_node *n)
 	struct ib_umem_odp *umem_odp =
 			container_of(n, struct ib_umem_odp, interval_tree);
 
-	return ib_umem_end(umem_odp->umem) - 1;
+	return ib_umem_end(&umem_odp->umem) - 1;
 }
 
 INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
 		     node_start, node_last, static, rbt_ib_umem)
 
-static void ib_umem_notifier_start_account(struct ib_umem *item)
+static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
 {
-	mutex_lock(&item->odp_data->umem_mutex);
-
-	/* Only update private counters for this umem if it has them.
-	 * Otherwise skip it. All page faults will be delayed for this umem. */
-	if (item->odp_data->mn_counters_active) {
-		int notifiers_count = item->odp_data->notifiers_count++;
-
-		if (notifiers_count == 0)
-			/* Initialize the completion object for waiting on
-			 * notifiers. Since notifier_count is zero, no one
-			 * should be waiting right now. */
-			reinit_completion(&item->odp_data->notifier_completion);
-	}
-	mutex_unlock(&item->odp_data->umem_mutex);
-}
-
-static void ib_umem_notifier_end_account(struct ib_umem *item)
-{
-	mutex_lock(&item->odp_data->umem_mutex);
-
-	/* Only update private counters for this umem if it has them.
-	 * Otherwise skip it. All page faults will be delayed for this umem. */
-	if (item->odp_data->mn_counters_active) {
+	mutex_lock(&umem_odp->umem_mutex);
+	if (umem_odp->notifiers_count++ == 0)
 		/*
-		 * This sequence increase will notify the QP page fault that
-		 * the page that is going to be mapped in the spte could have
-		 * been freed.
+		 * Initialize the completion object for waiting on
+		 * notifiers. Since notifier_count is zero, no one should be
+		 * waiting right now.
 		 */
-		++item->odp_data->notifiers_seq;
-		if (--item->odp_data->notifiers_count == 0)
-			complete_all(&item->odp_data->notifier_completion);
-	}
-	mutex_unlock(&item->odp_data->umem_mutex);
+		reinit_completion(&umem_odp->notifier_completion);
+	mutex_unlock(&umem_odp->umem_mutex);
 }
 
-/* Account for a new mmu notifier in an ib_ucontext. */
-static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
+static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
 {
-	atomic_inc(&context->notifier_count);
+	mutex_lock(&umem_odp->umem_mutex);
+	/*
+	 * This sequence increase will notify the QP page fault that the page
+	 * that is going to be mapped in the spte could have been freed.
+	 */
+	++umem_odp->notifiers_seq;
+	if (--umem_odp->notifiers_count == 0)
+		complete_all(&umem_odp->notifier_completion);
+	mutex_unlock(&umem_odp->umem_mutex);
 }
 
-/* Account for a terminating mmu notifier in an ib_ucontext.
- *
- * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
- * the function takes the semaphore itself. */
-static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
+static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
+					       u64 start, u64 end, void *cookie)
 {
-	int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
-
-	if (zero_notifiers &&
-	    !list_empty(&context->no_private_counters)) {
-		/* No currently running mmu notifiers. Now is the chance to
-		 * add private accounting to all previously added umems. */
-		struct ib_umem_odp *odp_data, *next;
-
-		/* Prevent concurrent mmu notifiers from working on the
-		 * no_private_counters list. */
-		down_write(&context->umem_rwsem);
-
-		/* Read the notifier_count again, with the umem_rwsem
-		 * semaphore taken for write. */
-		if (!atomic_read(&context->notifier_count)) {
-			list_for_each_entry_safe(odp_data, next,
-						 &context->no_private_counters,
-						 no_private_counters) {
-				mutex_lock(&odp_data->umem_mutex);
-				odp_data->mn_counters_active = true;
-				list_del(&odp_data->no_private_counters);
-				complete_all(&odp_data->notifier_completion);
-				mutex_unlock(&odp_data->umem_mutex);
-			}
-		}
-
-		up_write(&context->umem_rwsem);
-	}
-}
+	struct ib_umem *umem = &umem_odp->umem;
 
-static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
-					       u64 end, void *cookie) {
 	/*
 	 * Increase the number of notifiers running, to
 	 * prevent any further fault handling on this MR.
 	 */
-	ib_umem_notifier_start_account(item);
-	item->odp_data->dying = 1;
+	ib_umem_notifier_start_account(umem_odp);
+	umem_odp->dying = 1;
 	/* Make sure that the fact the umem is dying is out before we release
 	 * all pending page faults. */
 	smp_wmb();
-	complete_all(&item->odp_data->notifier_completion);
-	item->context->invalidate_range(item, ib_umem_start(item),
-					ib_umem_end(item));
+	complete_all(&umem_odp->notifier_completion);
+	umem->context->invalidate_range(umem_odp, ib_umem_start(umem),
+					ib_umem_end(umem));
 	return 0;
 }
 
 static void ib_umem_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
-	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
-
-	if (!context->invalidate_range)
-		return;
-
-	ib_ucontext_notifier_start_account(context);
-	down_read(&context->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
-				      ULLONG_MAX,
-				      ib_umem_notifier_release_trampoline,
-				      true,
-				      NULL);
-	up_read(&context->umem_rwsem);
+	struct ib_ucontext_per_mm *per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
+
+	down_read(&per_mm->umem_rwsem);
+	if (per_mm->active)
+		rbt_ib_umem_for_each_in_range(
+			&per_mm->umem_tree, 0, ULLONG_MAX,
+			ib_umem_notifier_release_trampoline, true, NULL);
+	up_read(&per_mm->umem_rwsem);
 }
 
-static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
+static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start,
 				      u64 end, void *cookie)
 {
 	ib_umem_notifier_start_account(item);
-	item->context->invalidate_range(item, start, start + PAGE_SIZE);
+	item->umem.context->invalidate_range(item, start, start + PAGE_SIZE);
 	ib_umem_notifier_end_account(item);
 	return 0;
 }
 
-static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
-					     u64 end, void *cookie)
+static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
+					     u64 start, u64 end, void *cookie)
 {
 	ib_umem_notifier_start_account(item);
-	item->context->invalidate_range(item, start, end);
+	item->umem.context->invalidate_range(item, start, end);
 	return 0;
 }
 
@@ -214,28 +160,30 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    unsigned long end,
 						    bool blockable)
 {
-	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
-	int ret;
-
-	if (!context->invalidate_range)
-		return 0;
+	struct ib_ucontext_per_mm *per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
 
 	if (blockable)
-		down_read(&context->umem_rwsem);
-	else if (!down_read_trylock(&context->umem_rwsem))
+		down_read(&per_mm->umem_rwsem);
+	else if (!down_read_trylock(&per_mm->umem_rwsem))
 		return -EAGAIN;
 
-	ib_ucontext_notifier_start_account(context);
-	ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
-				      end,
-				      invalidate_range_start_trampoline,
-				      blockable, NULL);
-	up_read(&context->umem_rwsem);
+	if (!per_mm->active) {
+		up_read(&per_mm->umem_rwsem);
+		/*
+		 * At this point active is permanently set and visible to this
+		 * CPU without a lock; that fact is relied on to skip the unlock
+		 * in range_end.
+		 */
+		return 0;
+	}
 
-	return ret;
+	return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
+					     invalidate_range_start_trampoline,
+					     blockable, NULL);
 }
 
-static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
+static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
 					   u64 end, void *cookie)
 {
 	ib_umem_notifier_end_account(item);
@@ -247,22 +195,16 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
 						  unsigned long start,
 						  unsigned long end)
 {
-	struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+	struct ib_ucontext_per_mm *per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
 
-	if (!context->invalidate_range)
+	if (unlikely(!per_mm->active))
 		return;
 
-	/*
-	 * TODO: we currently bail out if there is any sleepable work to be done
-	 * in ib_umem_notifier_invalidate_range_start so we shouldn't really block
-	 * here. But this is ugly and fragile.
-	 */
-	down_read(&context->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
 				      end,
 				      invalidate_range_end_trampoline, true, NULL);
-	up_read(&context->umem_rwsem);
-	ib_ucontext_notifier_end_account(context);
+	up_read(&per_mm->umem_rwsem);
 }
 
 static const struct mmu_notifier_ops ib_umem_notifiers = {
@@ -271,31 +213,158 @@ static const struct mmu_notifier_ops ib_umem_notifiers = {
 	.invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
 };
 
-struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
-				  unsigned long addr,
-				  size_t size)
+static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
 {
-	struct ib_umem *umem;
+	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
+	struct ib_umem *umem = &umem_odp->umem;
+
+	down_write(&per_mm->umem_rwsem);
+	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+		rbt_ib_umem_insert(&umem_odp->interval_tree,
+				   &per_mm->umem_tree);
+	up_write(&per_mm->umem_rwsem);
+}
+
+static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
+{
+	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
+	struct ib_umem *umem = &umem_odp->umem;
+
+	down_write(&per_mm->umem_rwsem);
+	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+		rbt_ib_umem_remove(&umem_odp->interval_tree,
+				   &per_mm->umem_tree);
+	complete_all(&umem_odp->notifier_completion);
+
+	up_write(&per_mm->umem_rwsem);
+}
+
+static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
+					       struct mm_struct *mm)
+{
+	struct ib_ucontext_per_mm *per_mm;
+	int ret;
+
+	per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
+	if (!per_mm)
+		return ERR_PTR(-ENOMEM);
+
+	per_mm->context = ctx;
+	per_mm->mm = mm;
+	per_mm->umem_tree = RB_ROOT_CACHED;
+	init_rwsem(&per_mm->umem_rwsem);
+	per_mm->active = ctx->invalidate_range;
+
+	rcu_read_lock();
+	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+	rcu_read_unlock();
+
+	WARN_ON(mm != current->mm);
+
+	per_mm->mn.ops = &ib_umem_notifiers;
+	ret = mmu_notifier_register(&per_mm->mn, per_mm->mm);
+	if (ret) {
+		dev_err(&ctx->device->dev,
+			"Failed to register mmu_notifier %d\n", ret);
+		goto out_pid;
+	}
+
+	list_add(&per_mm->ucontext_list, &ctx->per_mm_list);
+	return per_mm;
+
+out_pid:
+	put_pid(per_mm->tgid);
+	kfree(per_mm);
+	return ERR_PTR(ret);
+}
+
+static int get_per_mm(struct ib_umem_odp *umem_odp)
+{
+	struct ib_ucontext *ctx = umem_odp->umem.context;
+	struct ib_ucontext_per_mm *per_mm;
+
+	/*
+	 * Generally speaking we expect only one or two per_mm in this list,
+	 * so no reason to optimize this search today.
+	 */
+	mutex_lock(&ctx->per_mm_list_lock);
+	list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) {
+		if (per_mm->mm == umem_odp->umem.owning_mm)
+			goto found;
+	}
+
+	per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm);
+	if (IS_ERR(per_mm)) {
+		mutex_unlock(&ctx->per_mm_list_lock);
+		return PTR_ERR(per_mm);
+	}
+
+found:
+	umem_odp->per_mm = per_mm;
+	per_mm->odp_mrs_count++;
+	mutex_unlock(&ctx->per_mm_list_lock);
+
+	return 0;
+}
+
+static void free_per_mm(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu));
+}
+
+void put_per_mm(struct ib_umem_odp *umem_odp)
+{
+	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
+	struct ib_ucontext *ctx = umem_odp->umem.context;
+	bool need_free;
+
+	mutex_lock(&ctx->per_mm_list_lock);
+	umem_odp->per_mm = NULL;
+	per_mm->odp_mrs_count--;
+	need_free = per_mm->odp_mrs_count == 0;
+	if (need_free)
+		list_del(&per_mm->ucontext_list);
+	mutex_unlock(&ctx->per_mm_list_lock);
+
+	if (!need_free)
+		return;
+
+	/*
+	 * NOTE! mmu_notifier_unregister() can happen between a start/end
+	 * callback, resulting in a missing end, and thus an unbalanced
+	 * lock. This doesn't really matter to us since we are about to kfree
+	 * the memory that holds the lock, however LOCKDEP doesn't like this.
+	 */
+	down_write(&per_mm->umem_rwsem);
+	per_mm->active = false;
+	up_write(&per_mm->umem_rwsem);
+
+	WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
+	mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm);
+	put_pid(per_mm->tgid);
+	mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
+}
+
+struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm,
+				      unsigned long addr, size_t size)
+{
+	struct ib_ucontext *ctx = per_mm->context;
 	struct ib_umem_odp *odp_data;
+	struct ib_umem *umem;
 	int pages = size >> PAGE_SHIFT;
 	int ret;
 
-	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
-	if (!umem)
+	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
+	if (!odp_data)
 		return ERR_PTR(-ENOMEM);
-
-	umem->context    = context;
+	umem = &odp_data->umem;
+	umem->context    = ctx;
 	umem->length     = size;
 	umem->address    = addr;
 	umem->page_shift = PAGE_SHIFT;
 	umem->writable   = 1;
-
-	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
-	if (!odp_data) {
-		ret = -ENOMEM;
-		goto out_umem;
-	}
-	odp_data->umem = umem;
+	umem->is_odp = 1;
+	odp_data->per_mm = per_mm;
 
 	mutex_init(&odp_data->umem_mutex);
 	init_completion(&odp_data->notifier_completion);
@@ -314,39 +383,34 @@ struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
 		goto out_page_list;
 	}
 
-	down_write(&context->umem_rwsem);
-	context->odp_mrs_count++;
-	rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree);
-	if (likely(!atomic_read(&context->notifier_count)))
-		odp_data->mn_counters_active = true;
-	else
-		list_add(&odp_data->no_private_counters,
-			 &context->no_private_counters);
-	up_write(&context->umem_rwsem);
-
-	umem->odp_data = odp_data;
+	/*
+	 * Caller must ensure that the umem_odp that the per_mm came from
+	 * cannot be freed during the call to ib_alloc_odp_umem.
+	 */
+	mutex_lock(&ctx->per_mm_list_lock);
+	per_mm->odp_mrs_count++;
+	mutex_unlock(&ctx->per_mm_list_lock);
+	add_umem_to_per_mm(odp_data);
 
-	return umem;
+	return odp_data;
 
 out_page_list:
 	vfree(odp_data->page_list);
 out_odp_data:
 	kfree(odp_data);
-out_umem:
-	kfree(umem);
 	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(ib_alloc_odp_umem);
 
-int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
-		    int access)
+int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
 {
+	struct ib_umem *umem = &umem_odp->umem;
+	/*
+	 * NOTE: This must be called in a process context where umem->owning_mm
+	 * == current->mm
+	 */
+	struct mm_struct *mm = umem->owning_mm;
 	int ret_val;
-	struct pid *our_pid;
-	struct mm_struct *mm = get_task_mm(current);
-
-	if (!mm)
-		return -EINVAL;
 
 	if (access & IB_ACCESS_HUGETLB) {
 		struct vm_area_struct *vma;
@@ -366,111 +430,43 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
 		umem->hugetlb = 0;
 	}
 
-	/* Prevent creating ODP MRs in child processes */
-	rcu_read_lock();
-	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
-	rcu_read_unlock();
-	put_pid(our_pid);
-	if (context->tgid != our_pid) {
-		ret_val = -EINVAL;
-		goto out_mm;
-	}
-
-	umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
-	if (!umem->odp_data) {
-		ret_val = -ENOMEM;
-		goto out_mm;
-	}
-	umem->odp_data->umem = umem;
-
-	mutex_init(&umem->odp_data->umem_mutex);
+	mutex_init(&umem_odp->umem_mutex);
 
-	init_completion(&umem->odp_data->notifier_completion);
+	init_completion(&umem_odp->notifier_completion);
 
 	if (ib_umem_num_pages(umem)) {
-		umem->odp_data->page_list =
-			vzalloc(array_size(sizeof(*umem->odp_data->page_list),
+		umem_odp->page_list =
+			vzalloc(array_size(sizeof(*umem_odp->page_list),
 					   ib_umem_num_pages(umem)));
-		if (!umem->odp_data->page_list) {
-			ret_val = -ENOMEM;
-			goto out_odp_data;
-		}
+		if (!umem_odp->page_list)
+			return -ENOMEM;
 
-		umem->odp_data->dma_list =
-			vzalloc(array_size(sizeof(*umem->odp_data->dma_list),
+		umem_odp->dma_list =
+			vzalloc(array_size(sizeof(*umem_odp->dma_list),
 					   ib_umem_num_pages(umem)));
-		if (!umem->odp_data->dma_list) {
+		if (!umem_odp->dma_list) {
 			ret_val = -ENOMEM;
 			goto out_page_list;
 		}
 	}
 
-	/*
-	 * When using MMU notifiers, we will get a
-	 * notification before the "current" task (and MM) is
-	 * destroyed. We use the umem_rwsem semaphore to synchronize.
-	 */
-	down_write(&context->umem_rwsem);
-	context->odp_mrs_count++;
-	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
-		rbt_ib_umem_insert(&umem->odp_data->interval_tree,
-				   &context->umem_tree);
-	if (likely(!atomic_read(&context->notifier_count)) ||
-	    context->odp_mrs_count == 1)
-		umem->odp_data->mn_counters_active = true;
-	else
-		list_add(&umem->odp_data->no_private_counters,
-			 &context->no_private_counters);
-	downgrade_write(&context->umem_rwsem);
-
-	if (context->odp_mrs_count == 1) {
-		/*
-		 * Note that at this point, no MMU notifier is running
-		 * for this context!
-		 */
-		atomic_set(&context->notifier_count, 0);
-		INIT_HLIST_NODE(&context->mn.hlist);
-		context->mn.ops = &ib_umem_notifiers;
-		/*
-		 * Lock-dep detects a false positive for mmap_sem vs.
-		 * umem_rwsem, due to not grasping downgrade_write correctly.
-		 */
-		lockdep_off();
-		ret_val = mmu_notifier_register(&context->mn, mm);
-		lockdep_on();
-		if (ret_val) {
-			pr_err("Failed to register mmu_notifier %d\n", ret_val);
-			ret_val = -EBUSY;
-			goto out_mutex;
-		}
-	}
-
-	up_read(&context->umem_rwsem);
+	ret_val = get_per_mm(umem_odp);
+	if (ret_val)
+		goto out_dma_list;
+	add_umem_to_per_mm(umem_odp);
 
-	/*
-	 * Note that doing an mmput can cause a notifier for the relevant mm.
-	 * If the notifier is called while we hold the umem_rwsem, this will
-	 * cause a deadlock. Therefore, we release the reference only after we
-	 * released the semaphore.
-	 */
-	mmput(mm);
 	return 0;
 
-out_mutex:
-	up_read(&context->umem_rwsem);
-	vfree(umem->odp_data->dma_list);
+out_dma_list:
+	vfree(umem_odp->dma_list);
 out_page_list:
-	vfree(umem->odp_data->page_list);
-out_odp_data:
-	kfree(umem->odp_data);
-out_mm:
-	mmput(mm);
+	vfree(umem_odp->page_list);
 	return ret_val;
 }
 
-void ib_umem_odp_release(struct ib_umem *umem)
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
-	struct ib_ucontext *context = umem->context;
+	struct ib_umem *umem = &umem_odp->umem;
 
 	/*
 	 * Ensure that no more pages are mapped in the umem.
@@ -478,61 +474,13 @@ void ib_umem_odp_release(struct ib_umem *umem)
 	 * It is the driver's responsibility to ensure, before calling us,
 	 * that the hardware will not attempt to access the MR any more.
 	 */
-	ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
+	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
 				    ib_umem_end(umem));
 
-	down_write(&context->umem_rwsem);
-	if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
-		rbt_ib_umem_remove(&umem->odp_data->interval_tree,
-				   &context->umem_tree);
-	context->odp_mrs_count--;
-	if (!umem->odp_data->mn_counters_active) {
-		list_del(&umem->odp_data->no_private_counters);
-		complete_all(&umem->odp_data->notifier_completion);
-	}
-
-	/*
-	 * Downgrade the lock to a read lock. This ensures that the notifiers
-	 * (who lock the mutex for reading) will be able to finish, and we
-	 * will be able to enventually obtain the mmu notifiers SRCU. Note
-	 * that since we are doing it atomically, no other user could register
-	 * and unregister while we do the check.
-	 */
-	downgrade_write(&context->umem_rwsem);
-	if (!context->odp_mrs_count) {
-		struct task_struct *owning_process = NULL;
-		struct mm_struct *owning_mm        = NULL;
-
-		owning_process = get_pid_task(context->tgid,
-					      PIDTYPE_PID);
-		if (owning_process == NULL)
-			/*
-			 * The process is already dead, notifier were removed
-			 * already.
-			 */
-			goto out;
-
-		owning_mm = get_task_mm(owning_process);
-		if (owning_mm == NULL)
-			/*
-			 * The process' mm is already dead, notifier were
-			 * removed already.
-			 */
-			goto out_put_task;
-		mmu_notifier_unregister(&context->mn, owning_mm);
-
-		mmput(owning_mm);
-
-out_put_task:
-		put_task_struct(owning_process);
-	}
-out:
-	up_read(&context->umem_rwsem);
-
-	vfree(umem->odp_data->dma_list);
-	vfree(umem->odp_data->page_list);
-	kfree(umem->odp_data);
-	kfree(umem);
+	remove_umem_from_per_mm(umem_odp);
+	put_per_mm(umem_odp);
+	vfree(umem_odp->dma_list);
+	vfree(umem_odp->page_list);
 }
 
 /*
@@ -544,7 +492,7 @@ out:
  * @access_mask: access permissions needed for this page.
  * @current_seq: sequence number for synchronization with invalidations.
  *               the sequence number is taken from
- *               umem->odp_data->notifiers_seq.
+ *               umem_odp->notifiers_seq.
  *
  * The function returns -EFAULT if the DMA mapping operation fails. It returns
  * -EAGAIN if a concurrent invalidation prevents us from updating the page.
@@ -554,12 +502,13 @@ out:
  * umem.
  */
 static int ib_umem_odp_map_dma_single_page(
-		struct ib_umem *umem,
+		struct ib_umem_odp *umem_odp,
 		int page_index,
 		struct page *page,
 		u64 access_mask,
 		unsigned long current_seq)
 {
+	struct ib_umem *umem = &umem_odp->umem;
 	struct ib_device *dev = umem->context->device;
 	dma_addr_t dma_addr;
 	int stored_page = 0;
@@ -571,11 +520,11 @@ static int ib_umem_odp_map_dma_single_page(
 	 * handle case of a racing notifier. This check also allows us to bail
 	 * early if we have a notifier running in parallel with us.
 	 */
-	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
+	if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
 		ret = -EAGAIN;
 		goto out;
 	}
-	if (!(umem->odp_data->dma_list[page_index])) {
+	if (!(umem_odp->dma_list[page_index])) {
 		dma_addr = ib_dma_map_page(dev,
 					   page,
 					   0, BIT(umem->page_shift),
@@ -584,15 +533,15 @@ static int ib_umem_odp_map_dma_single_page(
 			ret = -EFAULT;
 			goto out;
 		}
-		umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
-		umem->odp_data->page_list[page_index] = page;
+		umem_odp->dma_list[page_index] = dma_addr | access_mask;
+		umem_odp->page_list[page_index] = page;
 		umem->npages++;
 		stored_page = 1;
-	} else if (umem->odp_data->page_list[page_index] == page) {
-		umem->odp_data->dma_list[page_index] |= access_mask;
+	} else if (umem_odp->page_list[page_index] == page) {
+		umem_odp->dma_list[page_index] |= access_mask;
 	} else {
 		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
-		       umem->odp_data->page_list[page_index], page);
+		       umem_odp->page_list[page_index], page);
 		/* Better remove the mapping now, to prevent any further
 		 * damage. */
 		remove_existing_mapping = 1;
@@ -605,7 +554,7 @@ out:
 
 	if (remove_existing_mapping && umem->context->invalidate_range) {
 		invalidate_page_trampoline(
-			umem,
+			umem_odp,
 			ib_umem_start(umem) + (page_index >> umem->page_shift),
 			ib_umem_start(umem) + ((page_index + 1) >>
 					       umem->page_shift),
@@ -621,7 +570,7 @@ out:
  *
  * Pins the range of pages passed in the argument, and maps them to
  * DMA addresses. The DMA addresses of the mapped pages is updated in
- * umem->odp_data->dma_list.
+ * umem_odp->dma_list.
  *
  * Returns the number of pages mapped in success, negative error code
  * for failure.
@@ -629,7 +578,7 @@ out:
  * the function from completing its task.
  * An -ENOENT error code indicates that userspace process is being terminated
  * and mm was already destroyed.
- * @umem: the umem to map and pin
+ * @umem_odp: the umem to map and pin
  * @user_virt: the address from which we need to map.
  * @bcnt: the minimal number of bytes to pin and map. The mapping might be
  *        bigger due to alignment, and may also be smaller in case of an error
@@ -639,13 +588,15 @@ out:
  *               range.
  * @current_seq: the MMU notifiers sequence value for synchronization with
  *               invalidations. The sequence number is read from
- *               umem->odp_data->notifiers_seq before calling this function
+ *               umem_odp->notifiers_seq before calling this function
  */
-int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
-			      u64 access_mask, unsigned long current_seq)
+int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
+			      u64 bcnt, u64 access_mask,
+			      unsigned long current_seq)
 {
+	struct ib_umem *umem = &umem_odp->umem;
 	struct task_struct *owning_process  = NULL;
-	struct mm_struct   *owning_mm       = NULL;
+	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
 	struct page       **local_page_list = NULL;
 	u64 page_mask, off;
 	int j, k, ret = 0, start_idx, npages = 0, page_shift;
@@ -669,15 +620,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 	user_virt = user_virt & page_mask;
 	bcnt += off; /* Charge for the first page offset as well. */
 
-	owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
-	if (owning_process == NULL) {
+	/*
+	 * owning_process is allowed to be NULL; this means the mm is somehow
+	 * living beyond the lifetime of the originating process. Presumably
+	 * mmget_not_zero will fail in this case.
+	 */
+	owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
+	if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) {
 		ret = -EINVAL;
-		goto out_no_task;
-	}
-
-	owning_mm = get_task_mm(owning_process);
-	if (owning_mm == NULL) {
-		ret = -ENOENT;
 		goto out_put_task;
 	}
 
@@ -709,7 +659,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 			break;
 
 		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
-		mutex_lock(&umem->odp_data->umem_mutex);
+		mutex_lock(&umem_odp->umem_mutex);
 		for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
 			if (user_virt & ~page_mask) {
 				p += PAGE_SIZE;
@@ -722,7 +672,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 			}
 
 			ret = ib_umem_odp_map_dma_single_page(
-					umem, k, local_page_list[j],
+					umem_odp, k, local_page_list[j],
 					access_mask, current_seq);
 			if (ret < 0)
 				break;
@@ -730,7 +680,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 			p = page_to_phys(local_page_list[j]);
 			k++;
 		}
-		mutex_unlock(&umem->odp_data->umem_mutex);
+		mutex_unlock(&umem_odp->umem_mutex);
 
 		if (ret < 0) {
 			/* Release left over pages when handling errors. */
@@ -749,16 +699,17 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
 
 	mmput(owning_mm);
 out_put_task:
-	put_task_struct(owning_process);
-out_no_task:
+	if (owning_process)
+		put_task_struct(owning_process);
 	free_page((unsigned long)local_page_list);
 	return ret;
 }
 EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
 
-void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
+void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 				 u64 bound)
 {
+	struct ib_umem *umem = &umem_odp->umem;
 	int idx;
 	u64 addr;
 	struct ib_device *dev = umem->context->device;
@@ -770,12 +721,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
 	 * faults from completion. We might be racing with other
 	 * invalidations, so we must make sure we free each page only
 	 * once. */
-	mutex_lock(&umem->odp_data->umem_mutex);
+	mutex_lock(&umem_odp->umem_mutex);
 	for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) {
 		idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
-		if (umem->odp_data->page_list[idx]) {
-			struct page *page = umem->odp_data->page_list[idx];
-			dma_addr_t dma = umem->odp_data->dma_list[idx];
+		if (umem_odp->page_list[idx]) {
+			struct page *page = umem_odp->page_list[idx];
+			dma_addr_t dma = umem_odp->dma_list[idx];
 			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
 
 			WARN_ON(!dma_addr);
@@ -798,12 +749,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
 			/* on demand pinning support */
 			if (!umem->context->invalidate_range)
 				put_page(page);
-			umem->odp_data->page_list[idx] = NULL;
-			umem->odp_data->dma_list[idx] = 0;
+			umem_odp->page_list[idx] = NULL;
+			umem_odp->dma_list[idx] = 0;
 			umem->npages--;
 		}
 	}
-	mutex_unlock(&umem->odp_data->umem_mutex);
+	mutex_unlock(&umem_odp->umem_mutex);
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
 
@@ -830,7 +781,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 			return -EAGAIN;
 		next = rbt_ib_umem_iter_next(node, start, last - 1);
 		umem = container_of(node, struct ib_umem_odp, interval_tree);
-		ret_val = cb(umem->umem, start, last, cookie) || ret_val;
+		ret_val = cb(umem, start, last, cookie) || ret_val;
 	}
 
 	return ret_val;

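The ODP rework above keeps the long-standing fault/invalidate handshake: a driver samples umem_odp->notifiers_seq, calls ib_umem_odp_map_dma_pages(), and retries when the helper reports that an invalidation raced with the mapping. Below is a hedged, illustrative sketch of that caller-side loop; fault_odp_range(), the smp_rmb() placement and the bare retry are made up for illustration (real drivers such as mlx5 typically also wait for the notifier to finish rather than spinning), only ib_umem_odp_map_dma_pages() and the notifiers_seq field come from the code above.

/* Illustrative sketch only; not part of this series. */
static int fault_odp_range(struct ib_umem_odp *umem_odp, u64 io_virt,
			   u64 bcnt, u64 access_mask)
{
	unsigned long current_seq;
	int npages;

	do {
		/* Sample the notifier sequence before mapping pages. */
		current_seq = READ_ONCE(umem_odp->notifiers_seq);
		smp_rmb();

		npages = ib_umem_odp_map_dma_pages(umem_odp, io_virt, bcnt,
						   access_mask, current_seq);
		/* -EAGAIN means an invalidation ran concurrently; try again. */
	} while (npages == -EAGAIN);

	return npages;
}
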
+ 6 - 7
drivers/infiniband/core/user_mad.c

@@ -138,7 +138,7 @@ static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
 static dev_t dynamic_umad_dev;
 static dev_t dynamic_issm_dev;
 
-static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+static DEFINE_IDA(umad_ida);
 
 static void ib_umad_add_one(struct ib_device *device);
 static void ib_umad_remove_one(struct ib_device *device, void *client_data);
@@ -1132,7 +1132,7 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
 	if (!port)
 		return -ENODEV;
 
-	return sprintf(buf, "%s\n", port->ib_dev->name);
+	return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev));
 }
 static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
 
@@ -1159,11 +1159,10 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
 	dev_t base_umad;
 	dev_t base_issm;
 
-	devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
-	if (devnum >= IB_UMAD_MAX_PORTS)
+	devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL);
+	if (devnum < 0)
 		return -1;
 	port->dev_num = devnum;
-	set_bit(devnum, dev_map);
 	if (devnum >= IB_UMAD_NUM_FIXED_MINOR) {
 		base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
 		base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
@@ -1227,7 +1226,7 @@ err_dev:
 
 err_cdev:
 	cdev_del(&port->cdev);
-	clear_bit(devnum, dev_map);
+	ida_free(&umad_ida, devnum);
 
 	return -1;
 }
@@ -1261,7 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
 	}
 
 	mutex_unlock(&port->file_mutex);
-	clear_bit(port->dev_num, dev_map);
+	ida_free(&umad_ida, port->dev_num);
 }
 
 static void ib_umad_add_one(struct ib_device *device)

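The DECLARE_BITMAP/find_first_zero_bit pair above is replaced with an IDA, the same conversion that uverbs_main.c gets further down. A minimal, hedged sketch of the pattern follows; EXAMPLE_MAX_PORTS and the wrapper names are hypothetical, while ida_alloc_max() and ida_free() are the real APIs used in the hunks above.

#include <linux/idr.h>

#define EXAMPLE_MAX_PORTS 64			/* hypothetical bound */

static DEFINE_IDA(example_ida);

/* Returns the smallest free minor in [0, EXAMPLE_MAX_PORTS - 1],
 * or a negative errno on failure. */
static int example_get_minor(void)
{
	return ida_alloc_max(&example_ida, EXAMPLE_MAX_PORTS - 1, GFP_KERNEL);
}

static void example_put_minor(int minor)
{
	ida_free(&example_ida, minor);
}
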
+ 6 - 9
drivers/infiniband/core/uverbs.h

@@ -100,13 +100,14 @@ struct ib_uverbs_device {
 	atomic_t				refcount;
 	int					num_comp_vectors;
 	struct completion			comp;
-	struct device			       *dev;
+	struct device				dev;
+	/* First group for device attributes, NULL terminated array */
+	const struct attribute_group		*groups[2];
 	struct ib_device	__rcu	       *ib_dev;
 	int					devnum;
 	struct cdev			        cdev;
 	struct rb_root				xrcd_tree;
 	struct mutex				xrcd_tree_mutex;
-	struct kobject				kobj;
 	struct srcu_struct			disassociate_srcu;
 	struct mutex				lists_mutex; /* protect lists */
 	struct list_head			uverbs_file_list;
@@ -146,7 +147,6 @@ struct ib_uverbs_file {
 	struct ib_event_handler			event_handler;
 	struct ib_uverbs_async_event_file       *async_file;
 	struct list_head			list;
-	int					is_closed;
 
 	/*
 	 * To access the uobjects list hw_destroy_rwsem must be held for write
@@ -158,6 +158,9 @@ struct ib_uverbs_file {
 	spinlock_t		uobjects_lock;
 	struct list_head	uobjects;
 
+	struct mutex umap_lock;
+	struct list_head umaps;
+
 	u64 uverbs_cmd_mask;
 	u64 uverbs_ex_cmd_mask;
 
@@ -218,12 +221,6 @@ struct ib_ucq_object {
 	u32			async_events_reported;
 };
 
-struct ib_uflow_resources;
-struct ib_uflow_object {
-	struct ib_uobject		uobject;
-	struct ib_uflow_resources	*resources;
-};
-
 extern const struct file_operations uverbs_event_fops;
 void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue);
 struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file,

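The two-slot groups[] array added to struct ib_uverbs_device relies on the driver-core convention that dev->groups is a NULL-terminated array: slot 0 holds the device attribute group and slot 1 stays NULL. A short sketch of the wiring, assuming the containing structure was zero-allocated so the terminator is already in place (ib_uverbs_add_one() in uverbs_main.c below does exactly this):

/* Sketch only; mirrors the assignments made in ib_uverbs_add_one(). */
uverbs_dev->groups[0] = &dev_attr_group;	/* ibdev and abi_version attrs */
uverbs_dev->groups[1] = NULL;			/* terminator, already zeroed */
uverbs_dev->dev.groups = uverbs_dev->groups;	/* created with the device */
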
+ 12 - 31
drivers/infiniband/core/uverbs_cmd.c

@@ -117,18 +117,12 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	/* ufile is required when some objects are released */
 	ucontext->ufile = file;
 
-	rcu_read_lock();
-	ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
-	rcu_read_unlock();
-	ucontext->closing = 0;
+	ucontext->closing = false;
 	ucontext->cleanup_retryable = false;
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	ucontext->umem_tree = RB_ROOT_CACHED;
-	init_rwsem(&ucontext->umem_rwsem);
-	ucontext->odp_mrs_count = 0;
-	INIT_LIST_HEAD(&ucontext->no_private_counters);
-
+	mutex_init(&ucontext->per_mm_list_lock);
+	INIT_LIST_HEAD(&ucontext->per_mm_list);
 	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
 		ucontext->invalidate_range = NULL;
 
@@ -172,7 +166,6 @@ err_fd:
 	put_unused_fd(resp.async_fd);
 
 err_free:
-	put_pid(ucontext->tgid);
 	ib_dev->dealloc_ucontext(ucontext);
 
 err_alloc:
@@ -2769,16 +2762,7 @@ out_put:
 	return ret ? ret : in_len;
 }
 
-struct ib_uflow_resources {
-	size_t			max;
-	size_t			num;
-	size_t			collection_num;
-	size_t			counters_num;
-	struct ib_counters	**counters;
-	struct ib_flow_action	**collection;
-};
-
-static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
+struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
 {
 	struct ib_uflow_resources *resources;
 
@@ -2808,6 +2792,7 @@ err:
 
 	return NULL;
 }
+EXPORT_SYMBOL(flow_resources_alloc);
 
 void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res)
 {
@@ -2826,10 +2811,11 @@ void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res)
 	kfree(uflow_res->counters);
 	kfree(uflow_res);
 }
+EXPORT_SYMBOL(ib_uverbs_flow_resources_free);
 
-static void flow_resources_add(struct ib_uflow_resources *uflow_res,
-			       enum ib_flow_spec_type type,
-			       void *ibobj)
+void flow_resources_add(struct ib_uflow_resources *uflow_res,
+			enum ib_flow_spec_type type,
+			void *ibobj)
 {
 	WARN_ON(uflow_res->num >= uflow_res->max);
 
@@ -2850,6 +2836,7 @@ static void flow_resources_add(struct ib_uflow_resources *uflow_res,
 
 	uflow_res->num++;
 }
+EXPORT_SYMBOL(flow_resources_add);
 
 static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile,
 				       struct ib_uverbs_flow_spec *kern_spec,
@@ -3484,7 +3471,6 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
 	struct ib_uverbs_create_flow	  cmd;
 	struct ib_uverbs_create_flow_resp resp;
 	struct ib_uobject		  *uobj;
-	struct ib_uflow_object		  *uflow;
 	struct ib_flow			  *flow_id;
 	struct ib_uverbs_flow_attr	  *kern_flow_attr;
 	struct ib_flow_attr		  *flow_attr;
@@ -3623,13 +3609,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
 		err = PTR_ERR(flow_id);
 		goto err_free;
 	}
-	atomic_inc(&qp->usecnt);
-	flow_id->qp = qp;
-	flow_id->device = qp->device;
-	flow_id->uobject = uobj;
-	uobj->object = flow_id;
-	uflow = container_of(uobj, typeof(*uflow), uobject);
-	uflow->resources = uflow_res;
+
+	ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res);
 
 	memset(&resp, 0, sizeof(resp));
 	resp.flow_handle = uobj->id;

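flow_resources_alloc(), flow_resources_add() and ib_uverbs_flow_resources_free() are exported above so that driver code can do the same bookkeeping as ib_uverbs_ex_create_flow(). A hedged sketch of the expected usage follows; the handler shape, the spec count and the flow action object are hypothetical, only the exported helpers and the ib_set_flow() call shown in the hunk above come from the patch, and IB_FLOW_SPEC_ACTION_HANDLE serves purely as an example spec type.

/* Sketch only: record every object a flow consumed so that destroying
 * the flow can release the references again. */
static int example_record_flow(struct ib_uobject *uobj, struct ib_flow *flow,
			       struct ib_qp *qp, struct ib_flow_action *action,
			       size_t num_specs)
{
	struct ib_uflow_resources *uflow_res;

	uflow_res = flow_resources_alloc(num_specs);
	if (!uflow_res)
		return -ENOMEM;

	/* One entry per spec that pinned another uobject. */
	flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE, action);

	/* Ties flow, QP and the bookkeeping to the uobject, replacing the
	 * open-coded assignments removed in the hunk above. */
	ib_set_flow(uobj, flow, qp, qp->device, uflow_res);
	return 0;
}
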
+ 137 - 3
drivers/infiniband/core/uverbs_ioctl.c

@@ -57,6 +57,7 @@ struct bundle_priv {
 	struct ib_uverbs_attr *uattrs;
 
 	DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN);
+	DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN);
 
 	/*
 	 * Must be last. bundle ends in a flex array which overlaps
@@ -143,6 +144,86 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr,
 			   0, uattr->len - len);
 }
 
+static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
+				     const struct uverbs_api_attr *attr_uapi,
+				     struct uverbs_objs_arr_attr *attr,
+				     struct ib_uverbs_attr *uattr,
+				     u32 attr_bkey)
+{
+	const struct uverbs_attr_spec *spec = &attr_uapi->spec;
+	size_t array_len;
+	u32 *idr_vals;
+	int ret = 0;
+	size_t i;
+
+	if (uattr->attr_data.reserved)
+		return -EINVAL;
+
+	if (uattr->len % sizeof(u32))
+		return -EINVAL;
+
+	array_len = uattr->len / sizeof(u32);
+	if (array_len < spec->u2.objs_arr.min_len ||
+	    array_len > spec->u2.objs_arr.max_len)
+		return -EINVAL;
+
+	attr->uobjects =
+		uverbs_alloc(&pbundle->bundle,
+			     array_size(array_len, sizeof(*attr->uobjects)));
+	if (IS_ERR(attr->uobjects))
+		return PTR_ERR(attr->uobjects);
+
+	/*
+	 * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects
+	 * to store the idr array and avoid an additional memory allocation. The
+	 * idr array is offset to the end of the uobjects array so we will be
+	 * able to read each idr and replace it with a pointer.
+	 */
+	idr_vals = (u32 *)(attr->uobjects + array_len) - array_len;
+
+	if (uattr->len > sizeof(uattr->data)) {
+		ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data),
+				     uattr->len);
+		if (ret)
+			return -EFAULT;
+	} else {
+		memcpy(idr_vals, &uattr->data, uattr->len);
+	}
+
+	for (i = 0; i != array_len; i++) {
+		attr->uobjects[i] = uverbs_get_uobject_from_file(
+			spec->u2.objs_arr.obj_type, pbundle->bundle.ufile,
+			spec->u2.objs_arr.access, idr_vals[i]);
+		if (IS_ERR(attr->uobjects[i])) {
+			ret = PTR_ERR(attr->uobjects[i]);
+			break;
+		}
+	}
+
+	attr->len = i;
+	__set_bit(attr_bkey, pbundle->spec_finalize);
+	return ret;
+}
+
+static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,
+				  struct uverbs_objs_arr_attr *attr,
+				  bool commit)
+{
+	const struct uverbs_attr_spec *spec = &attr_uapi->spec;
+	int current_ret;
+	int ret = 0;
+	size_t i;
+
+	for (i = 0; i != attr->len; i++) {
+		current_ret = uverbs_finalize_object(
+			attr->uobjects[i], spec->u2.objs_arr.access, commit);
+		if (!ret)
+			ret = current_ret;
+	}
+
+	return ret;
+}
+
 static int uverbs_process_attr(struct bundle_priv *pbundle,
 			       const struct uverbs_api_attr *attr_uapi,
 			       struct ib_uverbs_attr *uattr, u32 attr_bkey)
@@ -246,6 +327,11 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
 		}
 
 		break;
+
+	case UVERBS_ATTR_TYPE_IDRS_ARRAY:
+		return uverbs_process_idrs_array(pbundle, attr_uapi,
+						 &e->objs_arr_attr, uattr,
+						 attr_bkey);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -300,8 +386,7 @@ static int uverbs_set_attr(struct bundle_priv *pbundle,
 			return -EPROTONOSUPPORT;
 		return 0;
 	}
-	attr = srcu_dereference(
-		*slot, &pbundle->bundle.ufile->device->disassociate_srcu);
+	attr = rcu_dereference_protected(*slot, true);
 
 	/* Reject duplicate attributes from user-space */
 	if (test_bit(attr_bkey, pbundle->bundle.attr_present))
@@ -384,6 +469,7 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit)
 	unsigned int i;
 	int ret = 0;
 
+	/* fast path for simple uobjects */
 	i = -1;
 	while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len,
 				  i + 1)) < key_bitmap_len) {
@@ -397,6 +483,30 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit)
 			ret = current_ret;
 	}
 
+	i = -1;
+	while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len,
+				  i + 1)) < key_bitmap_len) {
+		struct uverbs_attr *attr = &pbundle->bundle.attrs[i];
+		const struct uverbs_api_attr *attr_uapi;
+		void __rcu **slot;
+		int current_ret;
+
+		slot = uapi_get_attr_for_method(
+			pbundle,
+			pbundle->method_key | uapi_bkey_to_key_attr(i));
+		if (WARN_ON(!slot))
+			continue;
+
+		attr_uapi = rcu_dereference_protected(*slot, true);
+
+		if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
+			current_ret = uverbs_free_idrs_array(
+				attr_uapi, &attr->objs_arr_attr, commit);
+			if (!ret)
+				ret = current_ret;
+		}
+	}
+
 	for (memblock = pbundle->allocated_mem; memblock;) {
 		struct bundle_alloc_head *tmp = memblock;
 
@@ -429,7 +539,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
 			uapi_key_ioctl_method(hdr->method_id));
 	if (unlikely(!slot))
 		return -EPROTONOSUPPORT;
-	method_elm = srcu_dereference(*slot, &ufile->device->disassociate_srcu);
+	method_elm = rcu_dereference_protected(*slot, true);
 
 	if (!method_elm->use_stack) {
 		pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);
@@ -461,6 +571,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
 	memset(pbundle->bundle.attr_present, 0,
 	       sizeof(pbundle->bundle.attr_present));
 	memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize));
+	memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize));
 
 	ret = ib_uverbs_run_method(pbundle, hdr->num_attrs);
 	destroy_ret = bundle_destroy(pbundle, ret == 0);
@@ -611,3 +722,26 @@ int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx,
 	return 0;
 }
 EXPORT_SYMBOL(uverbs_copy_to);
+
+int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle,
+		      size_t idx, s64 lower_bound, u64 upper_bound,
+		      s64  *def_val)
+{
+	const struct uverbs_attr *attr;
+
+	attr = uverbs_attr_get(attrs_bundle, idx);
+	if (IS_ERR(attr)) {
+		if ((PTR_ERR(attr) != -ENOENT) || !def_val)
+			return PTR_ERR(attr);
+
+		*to = *def_val;
+	} else {
+		*to = attr->ptr_attr.data;
+	}
+
+	if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(_uverbs_get_const);

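The comment in uverbs_process_idrs_array() above parks the user-supplied u32 ids at the tail of the pointer array and then fills the array front-to-back. This works because sizeof(void *) >= sizeof(u32): id[i] is read before slot i is written, and every id with a higher index lives at or past the end of slot i. A stand-alone plain C illustration of that in-place conversion (ordinary user-space code, not kernel code; the cast stands in for the idr lookup):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t n = 4;
	void **objs = calloc(n, sizeof(*objs));
	uint32_t *ids;
	size_t i;

	if (!objs)
		return 1;
	ids = (uint32_t *)(objs + n) - n;	/* tail of the same buffer */

	for (i = 0; i < n; i++)		/* pretend these came from copy_from_user() */
		ids[i] = 100 + (uint32_t)i;

	for (i = 0; i < n; i++) {
		uint32_t id = ids[i];	/* read before slot i is overwritten */
		objs[i] = (void *)(uintptr_t)id;	/* stand-in for the idr lookup */
	}

	for (i = 0; i < n; i++)
		printf("obj[%zu] = %u\n", i, (unsigned)(uintptr_t)objs[i]);

	free(objs);
	return 0;
}
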
+ 276 - 64
drivers/infiniband/core/uverbs_main.c

@@ -45,6 +45,7 @@
 #include <linux/cdev.h>
 #include <linux/anon_inodes.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 
 #include <linux/uaccess.h>
 
@@ -72,7 +73,7 @@ enum {
 static dev_t dynamic_uverbs_dev;
 static struct class *uverbs_class;
 
-static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+static DEFINE_IDA(uverbs_ida);
 
 static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 				     const char __user *buf, int in_len,
@@ -169,20 +170,16 @@ int uverbs_dealloc_mw(struct ib_mw *mw)
 	return ret;
 }
 
-static void ib_uverbs_release_dev(struct kobject *kobj)
+static void ib_uverbs_release_dev(struct device *device)
 {
 	struct ib_uverbs_device *dev =
-		container_of(kobj, struct ib_uverbs_device, kobj);
+			container_of(device, struct ib_uverbs_device, dev);
 
 	uverbs_destroy_api(dev->uapi);
 	cleanup_srcu_struct(&dev->disassociate_srcu);
 	kfree(dev);
 }
 
-static struct kobj_type ib_uverbs_dev_ktype = {
-	.release = ib_uverbs_release_dev,
-};
-
 static void ib_uverbs_release_async_event_file(struct kref *ref)
 {
 	struct ib_uverbs_async_event_file *file =
@@ -265,7 +262,7 @@ void ib_uverbs_release_file(struct kref *ref)
 	if (atomic_dec_and_test(&file->device->refcount))
 		ib_uverbs_comp_dev(file->device);
 
-	kobject_put(&file->device->kobj);
+	put_device(&file->device->dev);
 	kfree(file);
 }
 
@@ -816,6 +813,226 @@ out:
 	return ret;
 }
 
+/*
+ * Each time we map IO memory into user space this keeps track of the mapping.
+ * When the device is hot-unplugged we 'zap' the mmaps in user space to point
+ * to the zero page and allow the hot unplug to proceed.
+ *
+ * This is necessary for cases like PCI physical hot unplug as the actual BAR
+ * memory may vanish after this and access to it from userspace could MCE.
+ *
+ * RDMA drivers supporting disassociation must have their user space designed
+ * to cope in some way with their IO pages going to the zero page.
+ */
+struct rdma_umap_priv {
+	struct vm_area_struct *vma;
+	struct list_head list;
+};
+
+static const struct vm_operations_struct rdma_umap_ops;
+
+static void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+				struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+
+	priv->vma = vma;
+	vma->vm_private_data = priv;
+	vma->vm_ops = &rdma_umap_ops;
+
+	mutex_lock(&ufile->umap_lock);
+	list_add(&priv->list, &ufile->umaps);
+	mutex_unlock(&ufile->umap_lock);
+}
+
+/*
+ * The VMA has been dup'd, initialize the vm_private_data with a new tracking
+ * struct
+ */
+static void rdma_umap_open(struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+	struct rdma_umap_priv *opriv = vma->vm_private_data;
+	struct rdma_umap_priv *priv;
+
+	if (!opriv)
+		return;
+
+	/* We are racing with disassociation */
+	if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+		goto out_zap;
+	/*
+	 * Disassociation already completed, the VMA should already be zapped.
+	 */
+	if (!ufile->ucontext)
+		goto out_unlock;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		goto out_unlock;
+	rdma_umap_priv_init(priv, vma);
+
+	up_read(&ufile->hw_destroy_rwsem);
+	return;
+
+out_unlock:
+	up_read(&ufile->hw_destroy_rwsem);
+out_zap:
+	/*
+	 * We can't allow the VMA to be created with the actual IO pages; that
+	 * would break our API contract, and it can't be stopped at this
+	 * point, so zap it.
+	 */
+	vma->vm_private_data = NULL;
+	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+static void rdma_umap_close(struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+	struct rdma_umap_priv *priv = vma->vm_private_data;
+
+	if (!priv)
+		return;
+
+	/*
+	 * The vma holds a reference on the struct file that created it, which
+	 * in turn means that the ib_uverbs_file is guaranteed to exist at
+	 * this point.
+	 */
+	mutex_lock(&ufile->umap_lock);
+	list_del(&priv->list);
+	mutex_unlock(&ufile->umap_lock);
+	kfree(priv);
+}
+
+static const struct vm_operations_struct rdma_umap_ops = {
+	.open = rdma_umap_open,
+	.close = rdma_umap_close,
+};
+
+static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
+						 struct vm_area_struct *vma,
+						 unsigned long size)
+{
+	struct ib_uverbs_file *ufile = ucontext->ufile;
+	struct rdma_umap_priv *priv;
+
+	if (vma->vm_end - vma->vm_start != size)
+		return ERR_PTR(-EINVAL);
+
+	/* Driver is using this wrong, must be called by ib_uverbs_mmap */
+	if (WARN_ON(!vma->vm_file ||
+		    vma->vm_file->private_data != ufile))
+		return ERR_PTR(-EINVAL);
+	lockdep_assert_held(&ufile->device->disassociate_srcu);
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return ERR_PTR(-ENOMEM);
+	return priv;
+}
+
+/*
+ * Map IO memory into a process. This is to be called by drivers as part of
+ * their mmap() functions if they wish to send something like PCI-E BAR memory
+ * to userspace.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+		      unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
+
+	if (IS_ERR(priv))
+		return PTR_ERR(priv);
+
+	vma->vm_page_prot = prot;
+	if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
+		kfree(priv);
+		return -EAGAIN;
+	}
+
+	rdma_umap_priv_init(priv, vma);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/*
+ * The page case is here for a slightly different reason: the driver expects
+ * to be able to free the page it is sharing with user space when it destroys
+ * its ucontext, which means we need to zap the user space references.
+ *
+ * We could handle this differently by providing an API to allocate a shared
+ * page and then only freeing the shared page when the last ufile is
+ * destroyed.
+ */
+int rdma_user_mmap_page(struct ib_ucontext *ucontext,
+			struct vm_area_struct *vma, struct page *page,
+			unsigned long size)
+{
+	struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
+
+	if (IS_ERR(priv))
+		return PTR_ERR(priv);
+
+	if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size,
+			    vma->vm_page_prot)) {
+		kfree(priv);
+		return -EAGAIN;
+	}
+
+	rdma_umap_priv_init(priv, vma);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_page);
+
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
+{
+	struct rdma_umap_priv *priv, *next_priv;
+
+	lockdep_assert_held(&ufile->hw_destroy_rwsem);
+
+	while (1) {
+		struct mm_struct *mm = NULL;
+
+		/* Get an arbitrary mm pointer that hasn't been cleaned yet */
+		mutex_lock(&ufile->umap_lock);
+		if (!list_empty(&ufile->umaps)) {
+			mm = list_first_entry(&ufile->umaps,
+					      struct rdma_umap_priv, list)
+				     ->vma->vm_mm;
+			mmget(mm);
+		}
+		mutex_unlock(&ufile->umap_lock);
+		if (!mm)
+			return;
+
+		/*
+		 * The umap_lock is nested under mmap_sem since it is used within
+		 * the vma_ops callbacks, so we have to clean the list one mm
+		 * at a time to get the lock ordering right. Typically there
+		 * will only be one mm, so no big deal.
+		 */
+		down_write(&mm->mmap_sem);
+		mutex_lock(&ufile->umap_lock);
+		list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
+					  list) {
+			struct vm_area_struct *vma = priv->vma;
+
+			if (vma->vm_mm != mm)
+				continue;
+			list_del_init(&priv->list);
+
+			zap_vma_ptes(vma, vma->vm_start,
+				     vma->vm_end - vma->vm_start);
+			vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
+		}
+		mutex_unlock(&ufile->umap_lock);
+		up_write(&mm->mmap_sem);
+		mmput(mm);
+	}
+}
+
 /*
  * ib_uverbs_open() does not need the BKL:
  *
@@ -839,6 +1056,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	if (!atomic_inc_not_zero(&dev->refcount))
 		return -ENXIO;
 
+	get_device(&dev->dev);
 	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
 	mutex_lock(&dev->lists_mutex);
 	ib_dev = srcu_dereference(dev->ib_dev,
@@ -876,9 +1094,10 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	spin_lock_init(&file->uobjects_lock);
 	INIT_LIST_HEAD(&file->uobjects);
 	init_rwsem(&file->hw_destroy_rwsem);
+	mutex_init(&file->umap_lock);
+	INIT_LIST_HEAD(&file->umaps);
 
 	filp->private_data = file;
-	kobject_get(&dev->kobj);
 	list_add_tail(&file->list, &dev->uverbs_file_list);
 	mutex_unlock(&dev->lists_mutex);
 	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
@@ -899,6 +1118,7 @@ err:
 	if (atomic_dec_and_test(&dev->refcount))
 		ib_uverbs_comp_dev(dev);
 
+	put_device(&dev->dev);
 	return ret;
 }
 
@@ -909,10 +1129,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
 	uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE);
 
 	mutex_lock(&file->device->lists_mutex);
-	if (!file->is_closed) {
-		list_del(&file->list);
-		file->is_closed = 1;
-	}
+	list_del_init(&file->list);
 	mutex_unlock(&file->device->lists_mutex);
 
 	if (file->async_file)
@@ -951,37 +1168,34 @@ static struct ib_client uverbs_client = {
 	.remove = ib_uverbs_remove_one
 };
 
-static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+static ssize_t ibdev_show(struct device *device, struct device_attribute *attr,
 			  char *buf)
 {
+	struct ib_uverbs_device *dev =
+			container_of(device, struct ib_uverbs_device, dev);
 	int ret = -ENODEV;
 	int srcu_key;
-	struct ib_uverbs_device *dev = dev_get_drvdata(device);
 	struct ib_device *ib_dev;
 
-	if (!dev)
-		return -ENODEV;
-
 	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
 	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
 	if (ib_dev)
-		ret = sprintf(buf, "%s\n", ib_dev->name);
+		ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev));
 	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
 	return ret;
 }
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+static DEVICE_ATTR_RO(ibdev);
 
-static ssize_t show_dev_abi_version(struct device *device,
-				    struct device_attribute *attr, char *buf)
+static ssize_t abi_version_show(struct device *device,
+				struct device_attribute *attr, char *buf)
 {
-	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+	struct ib_uverbs_device *dev =
+			container_of(device, struct ib_uverbs_device, dev);
 	int ret = -ENODEV;
 	int srcu_key;
 	struct ib_device *ib_dev;
 
-	if (!dev)
-		return -ENODEV;
 	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
 	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
 	if (ib_dev)
@@ -990,7 +1204,17 @@ static ssize_t show_dev_abi_version(struct device *device,
 
 	return ret;
 }
-static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+static DEVICE_ATTR_RO(abi_version);
+
+static struct attribute *ib_dev_attrs[] = {
+	&dev_attr_abi_version.attr,
+	&dev_attr_ibdev.attr,
+	NULL,
+};
+
+static const struct attribute_group dev_attr_group = {
+	.attrs = ib_dev_attrs,
+};
 
 static CLASS_ATTR_STRING(abi_version, S_IRUGO,
 			 __stringify(IB_USER_VERBS_ABI_VERSION));
@@ -1028,65 +1252,56 @@ static void ib_uverbs_add_one(struct ib_device *device)
 		return;
 	}
 
+	device_initialize(&uverbs_dev->dev);
+	uverbs_dev->dev.class = uverbs_class;
+	uverbs_dev->dev.parent = device->dev.parent;
+	uverbs_dev->dev.release = ib_uverbs_release_dev;
+	uverbs_dev->groups[0] = &dev_attr_group;
+	uverbs_dev->dev.groups = uverbs_dev->groups;
 	atomic_set(&uverbs_dev->refcount, 1);
 	init_completion(&uverbs_dev->comp);
 	uverbs_dev->xrcd_tree = RB_ROOT;
 	mutex_init(&uverbs_dev->xrcd_tree_mutex);
-	kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
 	mutex_init(&uverbs_dev->lists_mutex);
 	INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
 	INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
+	rcu_assign_pointer(uverbs_dev->ib_dev, device);
+	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
 
-	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
-	if (devnum >= IB_UVERBS_MAX_DEVICES)
+	devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1,
+			       GFP_KERNEL);
+	if (devnum < 0)
 		goto err;
 	uverbs_dev->devnum = devnum;
-	set_bit(devnum, dev_map);
 	if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
 		base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
 	else
 		base = IB_UVERBS_BASE_DEV + devnum;
 
-	rcu_assign_pointer(uverbs_dev->ib_dev, device);
-	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
-
 	if (ib_uverbs_create_uapi(device, uverbs_dev))
 		goto err_uapi;
 
-	cdev_init(&uverbs_dev->cdev, NULL);
+	uverbs_dev->dev.devt = base;
+	dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);
+
+	cdev_init(&uverbs_dev->cdev,
+		  device->mmap ? &uverbs_mmap_fops : &uverbs_fops);
 	uverbs_dev->cdev.owner = THIS_MODULE;
-	uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
-	cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj);
-	kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
-	if (cdev_add(&uverbs_dev->cdev, base, 1))
-		goto err_cdev;
-
-	uverbs_dev->dev = device_create(uverbs_class, device->dev.parent,
-					uverbs_dev->cdev.dev, uverbs_dev,
-					"uverbs%d", uverbs_dev->devnum);
-	if (IS_ERR(uverbs_dev->dev))
-		goto err_cdev;
-
-	if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
-		goto err_class;
-	if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
-		goto err_class;
 
-	ib_set_client_data(device, &uverbs_client, uverbs_dev);
+	ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
+	if (ret)
+		goto err_uapi;
 
+	ib_set_client_data(device, &uverbs_client, uverbs_dev);
 	return;
 
-err_class:
-	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
-err_cdev:
-	cdev_del(&uverbs_dev->cdev);
 err_uapi:
-	clear_bit(devnum, dev_map);
+	ida_free(&uverbs_ida, devnum);
 err:
 	if (atomic_dec_and_test(&uverbs_dev->refcount))
 		ib_uverbs_comp_dev(uverbs_dev);
 	wait_for_completion(&uverbs_dev->comp);
-	kobject_put(&uverbs_dev->kobj);
+	put_device(&uverbs_dev->dev);
 	return;
 }
 
@@ -1107,8 +1322,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
 	while (!list_empty(&uverbs_dev->uverbs_file_list)) {
 		file = list_first_entry(&uverbs_dev->uverbs_file_list,
 					struct ib_uverbs_file, list);
-		file->is_closed = 1;
-		list_del(&file->list);
+		list_del_init(&file->list);
 		kref_get(&file->ref);
 
 		/* We must release the mutex before going ahead and calling
@@ -1156,10 +1370,8 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
 	if (!uverbs_dev)
 		return;
 
-	dev_set_drvdata(uverbs_dev->dev, NULL);
-	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
-	cdev_del(&uverbs_dev->cdev);
-	clear_bit(uverbs_dev->devnum, dev_map);
+	cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev);
+	ida_free(&uverbs_ida, uverbs_dev->devnum);
 
 	if (device->disassociate_ucontext) {
 		/* We disassociate HW resources and immediately return.
@@ -1182,7 +1394,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
 	if (wait_clients)
 		wait_for_completion(&uverbs_dev->comp);
 
-	kobject_put(&uverbs_dev->kobj);
+	put_device(&uverbs_dev->dev);
 }
 
 static char *uverbs_devnode(struct device *dev, umode_t *mode)

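rdma_user_mmap_io() is the consolidation point referred to in the merge description: drivers hand BAR mappings to the core, which tracks each VMA and can later zap it from uverbs_user_mmap_disassociate(). A hedged sketch of a driver ->mmap() built on it; example_uar_pfn() and the one-page doorbell layout are hypothetical, only the helper's signature comes from the patch (note that the helper rejects VMAs whose length differs from the size passed in):

/* Sketch only: map a single doorbell page of BAR space to user space. */
static int example_mmap(struct ib_ucontext *ucontext,
			struct vm_area_struct *vma)
{
	/* example_uar_pfn() stands in for the driver's BAR address math. */
	unsigned long pfn = example_uar_pfn(ucontext, vma->vm_pgoff);

	return rdma_user_mmap_io(ucontext, vma, pfn, PAGE_SIZE,
				 pgprot_noncached(vma->vm_page_prot));
}
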
+ 2 - 5
drivers/infiniband/core/uverbs_std_types_flow_action.c

@@ -326,11 +326,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
 	if (IS_ERR(action))
 		return PTR_ERR(action);
 
-	atomic_set(&action->usecnt, 0);
-	action->device = ib_dev;
-	action->type = IB_FLOW_ACTION_ESP;
-	action->uobject = uobj;
-	uobj->object = action;
+	uverbs_flow_action_fill_action(action, uobj, ib_dev,
+				       IB_FLOW_ACTION_ESP);
 
 	return 0;
 }

+ 12 - 0
drivers/infiniband/core/uverbs_uapi.c

@@ -73,6 +73,18 @@ static int uapi_merge_method(struct uverbs_api *uapi,
 		if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN)
 			method_elm->driver_method |= is_driver;
 
+		/*
+		 * Like other uobject based things we only support a single
+		 * uobject being NEW'd or DESTROY'd
+		 */
+		if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
+			u8 access = attr->attr.u2.objs_arr.access;
+
+			if (WARN_ON(access == UVERBS_ACCESS_NEW ||
+				    access == UVERBS_ACCESS_DESTROY))
+				return -EINVAL;
+		}
+
 		attr_slot =
 			uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id),
 				     sizeof(*attr_slot));

+ 10 - 9
drivers/infiniband/core/verbs.c

@@ -264,7 +264,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 	}
 
 	pd->res.type = RDMA_RESTRACK_PD;
-	pd->res.kern_name = caller;
+	rdma_restrack_set_task(&pd->res, caller);
 	rdma_restrack_add(&pd->res);
 
 	if (mr_access_flags) {
@@ -710,7 +710,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
 
 	ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid,
 					   ah_attr->roce.dmac,
-					   sgid_attr->ndev, &hop_limit);
+					   sgid_attr, &hop_limit);
 
 	grh->hop_limit = hop_limit;
 	return ret;
@@ -1509,8 +1509,7 @@ static const struct {
 };
 
 bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
-			enum ib_qp_type type, enum ib_qp_attr_mask mask,
-			enum rdma_link_layer ll)
+			enum ib_qp_type type, enum ib_qp_attr_mask mask)
 {
 	enum ib_qp_attr_mask req_param, opt_param;
 
@@ -1629,14 +1628,16 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
 
 	if (rdma_ib_or_roce(qp->device, port)) {
 		if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) {
-			pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n",
-				__func__, qp->device->name);
+			dev_warn(&qp->device->dev,
+				 "%s rq_psn overflow, masking to 24 bits\n",
+				 __func__);
 			attr->rq_psn &= 0xffffff;
 		}
 
 		if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) {
-			pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n",
-				__func__, qp->device->name);
+			dev_warn(&qp->device->dev,
+				 "%s sq_psn overflow, masking to 24 bits\n",
+				 __func__);
 			attr->sq_psn &= 0xffffff;
 		}
 	}
@@ -1888,7 +1889,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
 		cq->cq_context    = cq_context;
 		atomic_set(&cq->usecnt, 0);
 		cq->res.type = RDMA_RESTRACK_CQ;
-		cq->res.kern_name = caller;
+		rdma_restrack_set_task(&cq->res, caller);
 		rdma_restrack_add(&cq->res);
 	}
 

+ 2 - 1
drivers/infiniband/hw/bnxt_re/bnxt_re.h

@@ -40,7 +40,6 @@
 #ifndef __BNXT_RE_H__
 #define __BNXT_RE_H__
 #define ROCE_DRV_MODULE_NAME		"bnxt_re"
-#define ROCE_DRV_MODULE_VERSION		"1.0.0"
 
 #define BNXT_RE_DESC	"Broadcom NetXtreme-C/E RoCE Driver"
 #define BNXT_RE_PAGE_SHIFT_4K		(12)
@@ -120,6 +119,8 @@ struct bnxt_re_dev {
 #define BNXT_RE_FLAG_HAVE_L2_REF		3
 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN		4
 #define BNXT_RE_FLAG_QOS_WORK_REG		5
+#define BNXT_RE_FLAG_RESOURCES_ALLOCATED	7
+#define BNXT_RE_FLAG_RESOURCES_INITIALIZED	8
 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS          29
 	struct net_device		*netdev;
 	unsigned int			version, major, minor;

+ 10 - 1
drivers/infiniband/hw/bnxt_re/hw_counters.c

@@ -68,6 +68,8 @@ static const char * const bnxt_re_stat_name[] = {
 	[BNXT_RE_TX_PKTS]		=  "tx_pkts",
 	[BNXT_RE_TX_BYTES]		=  "tx_bytes",
 	[BNXT_RE_RECOVERABLE_ERRORS]	=  "recoverable_errors",
+	[BNXT_RE_RX_DROPS]		=  "rx_roce_drops",
+	[BNXT_RE_RX_DISCARDS]		=  "rx_roce_discards",
 	[BNXT_RE_TO_RETRANSMITS]        = "to_retransmits",
 	[BNXT_RE_SEQ_ERR_NAKS_RCVD]     = "seq_err_naks_rcvd",
 	[BNXT_RE_MAX_RETRY_EXCEEDED]    = "max_retry_exceeded",
@@ -106,7 +108,8 @@ static const char * const bnxt_re_stat_name[] = {
 	[BNXT_RE_RES_CQ_LOAD_ERR]       = "res_cq_load_err",
 	[BNXT_RE_RES_SRQ_LOAD_ERR]      = "res_srq_load_err",
 	[BNXT_RE_RES_TX_PCI_ERR]        = "res_tx_pci_err",
-	[BNXT_RE_RES_RX_PCI_ERR]        = "res_rx_pci_err"
+	[BNXT_RE_RES_RX_PCI_ERR]        = "res_rx_pci_err",
+	[BNXT_RE_OUT_OF_SEQ_ERR]        = "oos_drop_count"
 };
 
 int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
@@ -128,6 +131,10 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
 	if (bnxt_re_stats) {
 		stats->value[BNXT_RE_RECOVERABLE_ERRORS] =
 			le64_to_cpu(bnxt_re_stats->tx_bcast_pkts);
+		stats->value[BNXT_RE_RX_DROPS] =
+			le64_to_cpu(bnxt_re_stats->rx_drop_pkts);
+		stats->value[BNXT_RE_RX_DISCARDS] =
+			le64_to_cpu(bnxt_re_stats->rx_discard_pkts);
 		stats->value[BNXT_RE_RX_PKTS] =
 			le64_to_cpu(bnxt_re_stats->rx_ucast_pkts);
 		stats->value[BNXT_RE_RX_BYTES] =
@@ -220,6 +227,8 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
 				rdev->stats.res_tx_pci_err;
 		stats->value[BNXT_RE_RES_RX_PCI_ERR]    =
 				rdev->stats.res_rx_pci_err;
+		stats->value[BNXT_RE_OUT_OF_SEQ_ERR]    =
+				rdev->stats.res_oos_drop_count;
 	}
 
 	return ARRAY_SIZE(bnxt_re_stat_name);

+ 3 - 0
drivers/infiniband/hw/bnxt_re/hw_counters.h

@@ -51,6 +51,8 @@ enum bnxt_re_hw_stats {
 	BNXT_RE_TX_PKTS,
 	BNXT_RE_TX_BYTES,
 	BNXT_RE_RECOVERABLE_ERRORS,
+	BNXT_RE_RX_DROPS,
+	BNXT_RE_RX_DISCARDS,
 	BNXT_RE_TO_RETRANSMITS,
 	BNXT_RE_SEQ_ERR_NAKS_RCVD,
 	BNXT_RE_MAX_RETRY_EXCEEDED,
@@ -90,6 +92,7 @@ enum bnxt_re_hw_stats {
 	BNXT_RE_RES_SRQ_LOAD_ERR,
 	BNXT_RE_RES_TX_PCI_ERR,
 	BNXT_RE_RES_RX_PCI_ERR,
+	BNXT_RE_OUT_OF_SEQ_ERR,
 	BNXT_RE_NUM_COUNTERS
 };
 

+ 2 - 2
drivers/infiniband/hw/bnxt_re/ib_verbs.c

@@ -1598,8 +1598,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 		curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state);
 		new_qp_state = qp_attr->qp_state;
 		if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state,
-					ib_qp->qp_type, qp_attr_mask,
-					IB_LINK_LAYER_ETHERNET)) {
+					ib_qp->qp_type, qp_attr_mask)) {
 			dev_err(rdev_to_dev(rdev),
 				"Invalid attribute mask: %#x specified ",
 				qp_attr_mask);
@@ -2664,6 +2663,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
 	nq->budget++;
 
 	atomic_inc(&rdev->cq_count);
+	spin_lock_init(&cq->cq_lock);
 
 	if (context) {
 		struct bnxt_re_cq_resp resp;

+ 65 - 60
drivers/infiniband/hw/bnxt_re/main.c

@@ -67,7 +67,7 @@
 #include "hw_counters.h"
 
 static char version[] =
-		BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n";
+		BNXT_RE_DESC "\n";
 
 MODULE_AUTHOR("Eddie Wai <eddie.wai@broadcom.com>");
 MODULE_DESCRIPTION(BNXT_RE_DESC " Driver");
@@ -535,6 +535,34 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
 	return en_dev;
 }
 
+static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor);
+}
+static DEVICE_ATTR_RO(hw_rev);
+
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
+{
+	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc);
+}
+static DEVICE_ATTR_RO(hca_type);
+
+static struct attribute *bnxt_re_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	NULL
+};
+
+static const struct attribute_group bnxt_re_dev_attr_group = {
+	.attrs = bnxt_re_attributes,
+};
+
 static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
 {
 	ib_unregister_device(&rdev->ibdev);
@@ -547,7 +575,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 	/* ib device init */
 	ibdev->owner = THIS_MODULE;
 	ibdev->node_type = RDMA_NODE_IB_CA;
-	strlcpy(ibdev->name, "bnxt_re%d", IB_DEVICE_NAME_MAX);
 	strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA",
 		strlen(BNXT_RE_DESC) + 5);
 	ibdev->phys_port_cnt = 1;
@@ -639,34 +666,11 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 	ibdev->get_hw_stats             = bnxt_re_ib_get_hw_stats;
 	ibdev->alloc_hw_stats           = bnxt_re_ib_alloc_hw_stats;
 
+	rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group);
 	ibdev->driver_id = RDMA_DRIVER_BNXT_RE;
-	return ib_register_device(ibdev, NULL);
-}
-
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
-{
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
-
-	return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor);
-}
-
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
-{
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc);
+	return ib_register_device(ibdev, "bnxt_re%d", NULL);
 }
 
-static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL);
-static DEVICE_ATTR(hca_type, 0444, show_hca, NULL);
-
-static struct device_attribute *bnxt_re_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type
-};
-
 static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
 {
 	dev_put(rdev->netdev);
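
The two hunks above (and the matching cxgb3/cxgb4 hunks later in this diff) replace per-attribute device_create_file()/device_remove_file() calls with a static attribute group that the RDMA core creates and removes itself. A minimal sketch of the pattern follows; the "foo" driver and its fields are hypothetical stand-ins, while DEVICE_ATTR_RO(), struct attribute_group, rdma_set_device_sysfs_group() and the three-argument ib_register_device() are the APIs actually used in the hunks.

/*
 * Kernel-style sketch (not a standalone program): hypothetical "foo" driver
 * wiring its sysfs attributes through an attribute group.
 */
static ssize_t hw_rev_show(struct device *device,
			   struct device_attribute *attr, char *buf)
{
	struct foo_dev *fdev = container_of(device, struct foo_dev, ibdev.dev);

	return scnprintf(buf, PAGE_SIZE, "0x%x\n", fdev->hw_rev);
}
static DEVICE_ATTR_RO(hw_rev);			/* provides dev_attr_hw_rev */

static struct attribute *foo_attrs[] = {
	&dev_attr_hw_rev.attr,
	NULL,					/* attribute lists are NULL-terminated */
};

static const struct attribute_group foo_attr_group = {
	.attrs = foo_attrs,
};

static int foo_register(struct foo_dev *fdev)
{
	/*
	 * The group is attached before registration; the core then creates
	 * and removes the files, so the driver no longer needs the
	 * device_create_file() loop or its error unwinding.
	 */
	rdma_set_device_sysfs_group(&fdev->ibdev, &foo_attr_group);
	return ib_register_device(&fdev->ibdev, "foo_%d", NULL);
}
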
@@ -864,10 +868,8 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev)
 {
 	int i;
 
-	if (rdev->nq[0].hwq.max_elements) {
-		for (i = 1; i < rdev->num_msix; i++)
-			bnxt_qplib_disable_nq(&rdev->nq[i - 1]);
-	}
+	for (i = 1; i < rdev->num_msix; i++)
+		bnxt_qplib_disable_nq(&rdev->nq[i - 1]);
 
 	if (rdev->qplib_res.rcfw)
 		bnxt_qplib_cleanup_res(&rdev->qplib_res);
@@ -876,6 +878,7 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev)
 static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
 {
 	int rc = 0, i;
+	int num_vec_enabled = 0;
 
 	bnxt_qplib_init_res(&rdev->qplib_res);
 
@@ -891,9 +894,13 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
 				"Failed to enable NQ with rc = 0x%x", rc);
 			goto fail;
 		}
+		num_vec_enabled++;
 	}
 	return 0;
 fail:
+	for (i = num_vec_enabled; i >= 0; i--)
+		bnxt_qplib_disable_nq(&rdev->nq[i]);
+
 	return rc;
 }
 
@@ -925,6 +932,7 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev)
 static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 {
 	int rc = 0, i;
+	int num_vec_created = 0;
 
 	/* Configure and allocate resources for qplib */
 	rdev->qplib_res.rcfw = &rdev->rcfw;
@@ -951,7 +959,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 		if (rc) {
 			dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x",
 				i, rc);
-			goto dealloc_dpi;
+			goto free_nq;
 		}
 		rc = bnxt_re_net_ring_alloc
 			(rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr,
@@ -964,14 +972,17 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 			dev_err(rdev_to_dev(rdev),
 				"Failed to allocate NQ fw id with rc = 0x%x",
 				rc);
+			bnxt_qplib_free_nq(&rdev->nq[i]);
 			goto free_nq;
 		}
+		num_vec_created++;
 	}
 	return 0;
 free_nq:
-	for (i = 0; i < rdev->num_msix - 1; i++)
+	for (i = num_vec_created; i >= 0; i--) {
+		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id);
 		bnxt_qplib_free_nq(&rdev->nq[i]);
-dealloc_dpi:
+	}
 	bnxt_qplib_dealloc_dpi(&rdev->qplib_res,
 			       &rdev->qplib_res.dpi_tbl,
 			       &rdev->dpi_privileged);
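
The error path above now records how many NQs were fully set up and unwinds only those, in reverse order, instead of looping over every possible vector. A small userspace sketch of the same pattern, with init_unit()/fini_unit() as hypothetical stand-ins for the qplib alloc/free calls:

/*
 * Userspace sketch (not the driver code): count the units that initialized
 * successfully and tear down only those, in reverse order, on failure.
 */
#include <stdio.h>

#define NUM_UNITS 4

static int init_unit(int i)
{
	if (i == 2)		/* simulate a failure on the third unit */
		return -1;
	printf("init %d\n", i);
	return 0;
}

static void fini_unit(int i)
{
	printf("fini %d\n", i);
}

static int init_all(void)
{
	int created = 0;
	int i;

	for (i = 0; i < NUM_UNITS; i++) {
		if (init_unit(i))
			goto unwind;
		created++;
	}
	return 0;

unwind:
	for (i = created - 1; i >= 0; i--)	/* only what succeeded, reverse order */
		fini_unit(i);
	return -1;
}

int main(void)
{
	return init_all() ? 1 : 0;
}
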
@@ -989,12 +1000,17 @@ static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp,
 	struct ib_event ib_event;
 
 	ib_event.device = ibdev;
-	if (qp)
+	if (qp) {
 		ib_event.element.qp = qp;
-	else
+		ib_event.event = event;
+		if (qp->event_handler)
+			qp->event_handler(&ib_event, qp->qp_context);
+
+	} else {
 		ib_event.element.port_num = port_num;
-	ib_event.event = event;
-	ib_dispatch_event(&ib_event);
+		ib_event.event = event;
+		ib_dispatch_event(&ib_event);
+	}
 }
 
 #define HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN      0x02
@@ -1189,20 +1205,20 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev)
 
 static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 {
-	int i, rc;
+	int rc;
 
 	if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) {
-		for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++)
-			device_remove_file(&rdev->ibdev.dev,
-					   bnxt_re_attributes[i]);
 		/* Cleanup ib dev */
 		bnxt_re_unregister_ib(rdev);
 	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags))
-		cancel_delayed_work(&rdev->worker);
+		cancel_delayed_work_sync(&rdev->worker);
 
-	bnxt_re_cleanup_res(rdev);
-	bnxt_re_free_res(rdev);
+	if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED,
+			       &rdev->flags))
+		bnxt_re_cleanup_res(rdev);
+	if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags))
+		bnxt_re_free_res(rdev);
 
 	if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) {
 		rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw);
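
The unregister path above now checks a per-stage flag with test_and_clear_bit() before undoing each stage, so teardown only touches what was actually set up and each stage is undone at most once even if the path is reached again. A userspace sketch of the idea, with plain bit operations standing in for the kernel's atomic test_and_clear_bit():

/*
 * Userspace sketch (not the driver code): flag-gated, idempotent teardown.
 */
#include <stdbool.h>
#include <stdio.h>

enum { FLAG_RES_ALLOCATED, FLAG_RES_INITIALIZED, FLAG_REGISTERED };

static unsigned long flags;

static bool test_and_clear_flag(int bit)
{
	bool was_set = flags & (1UL << bit);

	flags &= ~(1UL << bit);
	return was_set;
}

static void teardown(void)
{
	if (test_and_clear_flag(FLAG_REGISTERED))
		printf("unregister device\n");
	if (test_and_clear_flag(FLAG_RES_INITIALIZED))
		printf("cleanup resources\n");
	if (test_and_clear_flag(FLAG_RES_ALLOCATED))
		printf("free resources\n");
}

int main(void)
{
	flags |= 1UL << FLAG_RES_ALLOCATED;	/* suppose only alloc succeeded */
	teardown();				/* frees, skips the other stages */
	teardown();				/* second call is a no-op */
	return 0;
}
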
@@ -1241,7 +1257,7 @@ static void bnxt_re_worker(struct work_struct *work)
 
 static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 {
-	int i, j, rc;
+	int rc;
 
 	bool locked;
 
@@ -1331,12 +1347,15 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 		pr_err("Failed to allocate resources: %#x\n", rc);
 		goto fail;
 	}
+	set_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags);
 	rc = bnxt_re_init_res(rdev);
 	if (rc) {
 		pr_err("Failed to initialize resources: %#x\n", rc);
 		goto fail;
 	}
 
+	set_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, &rdev->flags);
+
 	if (!rdev->is_virtfn) {
 		rc = bnxt_re_setup_qos(rdev);
 		if (rc)
@@ -1358,20 +1377,6 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	}
 	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
 	dev_info(rdev_to_dev(rdev), "Device registered successfully");
-	for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) {
-		rc = device_create_file(&rdev->ibdev.dev,
-					bnxt_re_attributes[i]);
-		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to create IB sysfs: %#x", rc);
-			/* Must clean up all created device files */
-			for (j = 0; j < i; j++)
-				device_remove_file(&rdev->ibdev.dev,
-						   bnxt_re_attributes[j]);
-			bnxt_re_unregister_ib(rdev);
-			goto fail;
-		}
-	}
 	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
 			 &rdev->active_width);
 	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);

+ 54 - 80
drivers/infiniband/hw/bnxt_re/qplib_fp.c

@@ -36,6 +36,8 @@
  * Description: Fast Path Operators
  */
 
+#define dev_fmt(fmt) "QPLIB: " fmt
+
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
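
The dev_fmt() define added above moves the repeated "QPLIB: " prefix out of every message: the dev_* printing helpers expand their format string through dev_fmt(), and the core only supplies a pass-through definition when the driver has not, which is why the define has to appear before the #includes. A minimal sketch (kernel-style fragment, not a standalone program):

/*
 * Sketch of the dev_fmt() prefix mechanism used above.
 */
#define dev_fmt(fmt) "QPLIB: " fmt

#include <linux/device.h>

static void example(struct device *dev)
{
	/* Printed as: "<driver/device prefix>: QPLIB: something failed" */
	dev_err(dev, "something failed\n");
}
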
@@ -71,8 +73,7 @@ static void __bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp)
 
 	if (!qp->sq.flushed) {
 		dev_dbg(&scq->hwq.pdev->dev,
-			"QPLIB: FP: Adding to SQ Flush list = %p",
-			qp);
+			"FP: Adding to SQ Flush list = %p\n", qp);
 		bnxt_qplib_cancel_phantom_processing(qp);
 		list_add_tail(&qp->sq_flush, &scq->sqf_head);
 		qp->sq.flushed = true;
@@ -80,8 +81,7 @@ static void __bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp)
 	if (!qp->srq) {
 		if (!qp->rq.flushed) {
 			dev_dbg(&rcq->hwq.pdev->dev,
-				"QPLIB: FP: Adding to RQ Flush list = %p",
-				qp);
+				"FP: Adding to RQ Flush list = %p\n", qp);
 			list_add_tail(&qp->rq_flush, &rcq->rqf_head);
 			qp->rq.flushed = true;
 		}
@@ -207,7 +207,7 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res,
 		if (!qp->sq_hdr_buf) {
 			rc = -ENOMEM;
 			dev_err(&res->pdev->dev,
-				"QPLIB: Failed to create sq_hdr_buf");
+				"Failed to create sq_hdr_buf\n");
 			goto fail;
 		}
 	}
@@ -221,7 +221,7 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res,
 		if (!qp->rq_hdr_buf) {
 			rc = -ENOMEM;
 			dev_err(&res->pdev->dev,
-				"QPLIB: Failed to create rq_hdr_buf");
+				"Failed to create rq_hdr_buf\n");
 			goto fail;
 		}
 	}
@@ -277,8 +277,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 				num_cqne_processed++;
 			else
 				dev_warn(&nq->pdev->dev,
-					 "QPLIB: cqn - type 0x%x not handled",
-					 type);
+					 "cqn - type 0x%x not handled\n", type);
 			spin_unlock_bh(&cq->compl_lock);
 			break;
 		}
@@ -298,7 +297,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 				num_srqne_processed++;
 			else
 				dev_warn(&nq->pdev->dev,
-					 "QPLIB: SRQ event 0x%x not handled",
+					 "SRQ event 0x%x not handled\n",
 					 nqsrqe->event);
 			break;
 		}
@@ -306,8 +305,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 			break;
 		default:
 			dev_warn(&nq->pdev->dev,
-				 "QPLIB: nqe with type = 0x%x not handled",
-				 type);
+				 "nqe with type = 0x%x not handled\n", type);
 			break;
 		}
 		raw_cons++;
@@ -360,7 +358,8 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
 	}
 
 	/* Make sure the HW is stopped! */
-	bnxt_qplib_nq_stop_irq(nq, true);
+	if (nq->requested)
+		bnxt_qplib_nq_stop_irq(nq, true);
 
 	if (nq->bar_reg_iomem)
 		iounmap(nq->bar_reg_iomem);
@@ -396,7 +395,7 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
 	rc = irq_set_affinity_hint(nq->vector, &nq->mask);
 	if (rc) {
 		dev_warn(&nq->pdev->dev,
-			 "QPLIB: set affinity failed; vector: %d nq_idx: %d\n",
+			 "set affinity failed; vector: %d nq_idx: %d\n",
 			 nq->vector, nq_indx);
 	}
 	nq->requested = true;
@@ -443,7 +442,7 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 	rc = bnxt_qplib_nq_start_irq(nq, nq_idx, msix_vector, true);
 	if (rc) {
 		dev_err(&nq->pdev->dev,
-			"QPLIB: Failed to request irq for nq-idx %d", nq_idx);
+			"Failed to request irq for nq-idx %d\n", nq_idx);
 		goto fail;
 	}
 
@@ -662,8 +661,8 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
 
 	spin_lock(&srq_hwq->lock);
 	if (srq->start_idx == srq->last_idx) {
-		dev_err(&srq_hwq->pdev->dev, "QPLIB: FP: SRQ (0x%x) is full!",
-			srq->id);
+		dev_err(&srq_hwq->pdev->dev,
+			"FP: SRQ (0x%x) is full!\n", srq->id);
 		rc = -EINVAL;
 		spin_unlock(&srq_hwq->lock);
 		goto done;
@@ -1324,7 +1323,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 		}
 	}
 	if (i == res->sgid_tbl.max)
-		dev_warn(&res->pdev->dev, "QPLIB: SGID not found??");
+		dev_warn(&res->pdev->dev, "SGID not found??\n");
 
 	qp->ah.hop_limit = sb->hop_limit;
 	qp->ah.traffic_class = sb->traffic_class;
@@ -1536,7 +1535,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
 
 	if (bnxt_qplib_queue_full(sq)) {
 		dev_err(&sq->hwq.pdev->dev,
-			"QPLIB: prod = %#x cons = %#x qdepth = %#x delta = %#x",
+			"prod = %#x cons = %#x qdepth = %#x delta = %#x\n",
 			sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements,
 			sq->q_full_delta);
 		rc = -ENOMEM;
@@ -1561,7 +1560,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
 		/* Copy the inline data */
 		if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) {
 			dev_warn(&sq->hwq.pdev->dev,
-				 "QPLIB: Inline data length > 96 detected");
+				 "Inline data length > 96 detected\n");
 			data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH;
 		} else {
 			data_len = wqe->inline_len;
@@ -1776,7 +1775,7 @@ done:
 			queue_work(qp->scq->nq->cqn_wq, &nq_work->work);
 		} else {
 			dev_err(&sq->hwq.pdev->dev,
-				"QPLIB: FP: Failed to allocate SQ nq_work!");
+				"FP: Failed to allocate SQ nq_work!\n");
 			rc = -ENOMEM;
 		}
 	}
@@ -1815,13 +1814,12 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
 	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
 		sch_handler = true;
 		dev_dbg(&rq->hwq.pdev->dev,
-			"%s Error QP. Scheduling for poll_cq\n",
-			__func__);
+			"%s: Error QP. Scheduling for poll_cq\n", __func__);
 		goto queue_err;
 	}
 	if (bnxt_qplib_queue_full(rq)) {
 		dev_err(&rq->hwq.pdev->dev,
-			"QPLIB: FP: QP (0x%x) RQ is full!", qp->id);
+			"FP: QP (0x%x) RQ is full!\n", qp->id);
 		rc = -EINVAL;
 		goto done;
 	}
@@ -1870,7 +1868,7 @@ queue_err:
 			queue_work(qp->rcq->nq->cqn_wq, &nq_work->work);
 		} else {
 			dev_err(&rq->hwq.pdev->dev,
-				"QPLIB: FP: Failed to allocate RQ nq_work!");
+				"FP: Failed to allocate RQ nq_work!\n");
 			rc = -ENOMEM;
 		}
 	}
@@ -1932,7 +1930,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 
 	if (!cq->dpi) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: FP: CREATE_CQ failed due to NULL DPI");
+			"FP: CREATE_CQ failed due to NULL DPI\n");
 		return -EINVAL;
 	}
 	req.dpi = cpu_to_le32(cq->dpi->dpi);
@@ -1969,6 +1967,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 	INIT_LIST_HEAD(&cq->sqf_head);
 	INIT_LIST_HEAD(&cq->rqf_head);
 	spin_lock_init(&cq->compl_lock);
+	spin_lock_init(&cq->flush_lock);
 
 	bnxt_qplib_arm_cq_enable(cq);
 	return 0;
@@ -2172,7 +2171,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
 						 *  comes back
 						 */
 						dev_dbg(&cq->hwq.pdev->dev,
-							"FP:Got Phantom CQE");
+							"FP: Got Phantom CQE\n");
 						sq->condition = false;
 						sq->single = true;
 						rc = 0;
@@ -2189,7 +2188,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
 			peek_raw_cq_cons++;
 		}
 		dev_err(&cq->hwq.pdev->dev,
-			"Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x",
+			"Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x\n",
 			cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
 		rc = -EINVAL;
 	}
@@ -2213,7 +2212,7 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: Process Req qp is NULL");
+			"FP: Process Req qp is NULL\n");
 		return -EINVAL;
 	}
 	sq = &qp->sq;
@@ -2221,16 +2220,14 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
 	cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq);
 	if (cqe_sq_cons > sq->hwq.max_elements) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Process req reported ");
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x",
+			"FP: CQ Process req reported sq_cons_idx 0x%x which exceeded max 0x%x\n",
 			cqe_sq_cons, sq->hwq.max_elements);
 		return -EINVAL;
 	}
 
 	if (qp->sq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto done;
 	}
 	/* Require to walk the sq's swq to fabricate CQEs for all previously
@@ -2262,9 +2259,7 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
 		    hwcqe->status != CQ_REQ_STATUS_OK) {
 			cqe->status = hwcqe->status;
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Processed Req ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id[%d] = 0x%llx with status 0x%x",
+				"FP: CQ Processed Req wr_id[%d] = 0x%llx with status 0x%x\n",
 				sw_sq_cons, cqe->wr_id, cqe->status);
 			cqe++;
 			(*budget)--;
@@ -2330,12 +2325,12 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
-		dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL");
+		dev_err(&cq->hwq.pdev->dev, "process_cq RC qp is NULL\n");
 		return -EINVAL;
 	}
 	if (qp->rq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto done;
 	}
 
@@ -2356,9 +2351,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 			return -EINVAL;
 		if (wr_id_idx >= srq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process RC ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				"FP: CQ Process RC wr_id idx 0x%x exceeded SRQ max 0x%x\n",
 				wr_id_idx, srq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2371,9 +2364,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 		rq = &qp->rq;
 		if (wr_id_idx >= rq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process RC ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x",
+				"FP: CQ Process RC wr_id idx 0x%x exceeded RQ max 0x%x\n",
 				wr_id_idx, rq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2409,12 +2400,12 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
-		dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL");
+		dev_err(&cq->hwq.pdev->dev, "process_cq UD qp is NULL\n");
 		return -EINVAL;
 	}
 	if (qp->rq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto done;
 	}
 	cqe = *pcqe;
@@ -2439,9 +2430,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 
 		if (wr_id_idx >= srq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process UD ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				"FP: CQ Process UD wr_id idx 0x%x exceeded SRQ max 0x%x\n",
 				wr_id_idx, srq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2454,9 +2443,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 		rq = &qp->rq;
 		if (wr_id_idx >= rq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process UD ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x",
+				"FP: CQ Process UD wr_id idx 0x%x exceeded RQ max 0x%x\n",
 				wr_id_idx, rq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2508,13 +2495,12 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq,
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: process_cq Raw/QP1 qp is NULL");
+		dev_err(&cq->hwq.pdev->dev, "process_cq Raw/QP1 qp is NULL\n");
 		return -EINVAL;
 	}
 	if (qp->rq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto done;
 	}
 	cqe = *pcqe;
@@ -2543,14 +2529,12 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq,
 		srq = qp->srq;
 		if (!srq) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: SRQ used but not defined??");
+				"FP: SRQ used but not defined??\n");
 			return -EINVAL;
 		}
 		if (wr_id_idx >= srq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process Raw/QP1 ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				"FP: CQ Process Raw/QP1 wr_id idx 0x%x exceeded SRQ max 0x%x\n",
 				wr_id_idx, srq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2563,9 +2547,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq,
 		rq = &qp->rq;
 		if (wr_id_idx >= rq->hwq.max_elements) {
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: FP: CQ Process Raw/QP1 RQ wr_id ");
-			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: ix 0x%x exceeded RQ max 0x%x",
+				"FP: CQ Process Raw/QP1 RQ wr_id idx 0x%x exceeded RQ max 0x%x\n",
 				wr_id_idx, rq->hwq.max_elements);
 			return -EINVAL;
 		}
@@ -2600,14 +2582,14 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq,
 	/* Check the Status */
 	if (hwcqe->status != CQ_TERMINAL_STATUS_OK)
 		dev_warn(&cq->hwq.pdev->dev,
-			 "QPLIB: FP: CQ Process Terminal Error status = 0x%x",
+			 "FP: CQ Process Terminal Error status = 0x%x\n",
 			 hwcqe->status);
 
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
 				      le64_to_cpu(hwcqe->qp_handle));
 	if (!qp) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Process terminal qp is NULL");
+			"FP: CQ Process terminal qp is NULL\n");
 		return -EINVAL;
 	}
 
@@ -2623,16 +2605,14 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq,
 
 	if (cqe_cons > sq->hwq.max_elements) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Process terminal reported ");
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x",
+			"FP: CQ Process terminal reported sq_cons_idx 0x%x which exceeded max 0x%x\n",
 			cqe_cons, sq->hwq.max_elements);
 		goto do_rq;
 	}
 
 	if (qp->sq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		goto sq_done;
 	}
 
@@ -2673,16 +2653,14 @@ do_rq:
 		goto done;
 	} else if (cqe_cons > rq->hwq.max_elements) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Processed terminal ");
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: reported rq_cons_idx 0x%x exceeds max 0x%x",
+			"FP: CQ Processed terminal reported rq_cons_idx 0x%x exceeds max 0x%x\n",
 			cqe_cons, rq->hwq.max_elements);
 		goto done;
 	}
 
 	if (qp->rq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
-			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+			"%s: QP in Flush QP = %p\n", __func__, qp);
 		rc = 0;
 		goto done;
 	}
@@ -2704,7 +2682,7 @@ static int bnxt_qplib_cq_process_cutoff(struct bnxt_qplib_cq *cq,
 	/* Check the Status */
 	if (hwcqe->status != CQ_CUTOFF_STATUS_OK) {
 		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: FP: CQ Process Cutoff Error status = 0x%x",
+			"FP: CQ Process Cutoff Error status = 0x%x\n",
 			hwcqe->status);
 		return -EINVAL;
 	}
@@ -2724,16 +2702,12 @@ int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq,
 
 	spin_lock_irqsave(&cq->flush_lock, flags);
 	list_for_each_entry(qp, &cq->sqf_head, sq_flush) {
-		dev_dbg(&cq->hwq.pdev->dev,
-			"QPLIB: FP: Flushing SQ QP= %p",
-			qp);
+		dev_dbg(&cq->hwq.pdev->dev, "FP: Flushing SQ QP= %p\n", qp);
 		__flush_sq(&qp->sq, qp, &cqe, &budget);
 	}
 
 	list_for_each_entry(qp, &cq->rqf_head, rq_flush) {
-		dev_dbg(&cq->hwq.pdev->dev,
-			"QPLIB: FP: Flushing RQ QP= %p",
-			qp);
+		dev_dbg(&cq->hwq.pdev->dev, "FP: Flushing RQ QP= %p\n", qp);
 		__flush_rq(&qp->rq, qp, &cqe, &budget);
 	}
 	spin_unlock_irqrestore(&cq->flush_lock, flags);
@@ -2801,7 +2775,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 			goto exit;
 		default:
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: process_cq unknown type 0x%lx",
+				"process_cq unknown type 0x%lx\n",
 				hw_cqe->cqe_type_toggle &
 				CQ_BASE_CQE_TYPE_MASK);
 			rc = -EINVAL;
@@ -2814,7 +2788,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 			 * next one
 			 */
 			dev_err(&cq->hwq.pdev->dev,
-				"QPLIB: process_cqe error rc = 0x%x", rc);
+				"process_cqe error rc = 0x%x\n", rc);
 		}
 		raw_cons++;
 	}

+ 49 - 39
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c

@@ -35,6 +35,9 @@
  *
  * Description: RDMA Controller HW interface
  */
+
+#define dev_fmt(fmt) "QPLIB: " fmt
+
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
@@ -96,14 +99,13 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
 	     opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: RCFW not initialized, reject opcode 0x%x",
-			opcode);
+			"RCFW not initialized, reject opcode 0x%x\n", opcode);
 		return -EINVAL;
 	}
 
 	if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
 	    opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
-		dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!");
+		dev_err(&rcfw->pdev->dev, "RCFW already initialized!\n");
 		return -EINVAL;
 	}
 
@@ -115,7 +117,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 	 */
 	spin_lock_irqsave(&cmdq->lock, flags);
 	if (req->cmd_size >= HWQ_FREE_SLOTS(cmdq)) {
-		dev_err(&rcfw->pdev->dev, "QPLIB: RCFW: CMDQ is full!");
+		dev_err(&rcfw->pdev->dev, "RCFW: CMDQ is full!\n");
 		spin_unlock_irqrestore(&cmdq->lock, flags);
 		return -EAGAIN;
 	}
@@ -154,7 +156,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 		cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod)][get_cmdq_idx(sw_prod)];
 		if (!cmdqe) {
 			dev_err(&rcfw->pdev->dev,
-				"QPLIB: RCFW request failed with no cmdqe!");
+				"RCFW request failed with no cmdqe!\n");
 			goto done;
 		}
 		/* Copy a segment of the req cmd to the cmdq */
@@ -210,7 +212,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 
 		if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY)) {
 			/* send failed */
-			dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x send failed",
+			dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x send failed\n",
 				cookie, opcode);
 			return rc;
 		}
@@ -224,7 +226,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 		rc = __wait_for_resp(rcfw, cookie);
 	if (rc) {
 		/* timed out */
-		dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x timedout (%d)msec",
+		dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n",
 			cookie, opcode, RCFW_CMD_WAIT_TIME_MS);
 		set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags);
 		return rc;
@@ -232,7 +234,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 
 	if (evnt->status) {
 		/* failed with status */
-		dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x status %#x",
+		dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n",
 			cookie, opcode, evnt->status);
 		rc = -EFAULT;
 	}
@@ -298,9 +300,9 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 		qp_id = le32_to_cpu(err_event->xid);
 		qp = rcfw->qp_tbl[qp_id].qp_handle;
 		dev_dbg(&rcfw->pdev->dev,
-			"QPLIB: Received QP error notification");
+			"Received QP error notification\n");
 		dev_dbg(&rcfw->pdev->dev,
-			"QPLIB: qpid 0x%x, req_err=0x%x, resp_err=0x%x\n",
+			"qpid 0x%x, req_err=0x%x, resp_err=0x%x\n",
 			qp_id, err_event->req_err_state_reason,
 			err_event->res_err_state_reason);
 		if (!qp)
@@ -309,8 +311,17 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 		rcfw->aeq_handler(rcfw, qp_event, qp);
 		break;
 	default:
-		/* Command Response */
-		spin_lock_irqsave(&cmdq->lock, flags);
+		/*
+		 * Command Response
+		 * cmdq->lock needs to be acquired to synchronize
+		 * the command send and completion reaping. This function
+		 * is always called with creq->lock held. Using
+		 * the nested variant of spin_lock.
+		 *
+		 */
+
+		spin_lock_irqsave_nested(&cmdq->lock, flags,
+					 SINGLE_DEPTH_NESTING);
 		cookie = le16_to_cpu(qp_event->cookie);
 		mcookie = qp_event->cookie;
 		blocked = cookie & RCFW_CMD_IS_BLOCKING;
@@ -322,14 +333,16 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 			memcpy(crsqe->resp, qp_event, sizeof(*qp_event));
 			crsqe->resp = NULL;
 		} else {
-			dev_err(&rcfw->pdev->dev,
-				"QPLIB: CMD %s resp->cookie = %#x, evnt->cookie = %#x",
-				crsqe->resp ? "mismatch" : "collision",
-				crsqe->resp ? crsqe->resp->cookie : 0, mcookie);
+			if (crsqe->resp && crsqe->resp->cookie)
+				dev_err(&rcfw->pdev->dev,
+					"CMD %s cookie sent=%#x, recd=%#x\n",
+					crsqe->resp ? "mismatch" : "collision",
+					crsqe->resp ? crsqe->resp->cookie : 0,
+					mcookie);
 		}
 		if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap))
 			dev_warn(&rcfw->pdev->dev,
-				 "QPLIB: CMD bit %d was not requested", cbit);
+				 "CMD bit %d was not requested\n", cbit);
 		cmdq->cons += crsqe->req_size;
 		crsqe->req_size = 0;
 
@@ -376,14 +389,14 @@ static void bnxt_qplib_service_creq(unsigned long data)
 			    (rcfw, (struct creq_func_event *)creqe))
 				rcfw->creq_func_event_processed++;
 			else
-				dev_warn
-				(&rcfw->pdev->dev, "QPLIB:aeqe:%#x Not handled",
-				 type);
+				dev_warn(&rcfw->pdev->dev,
+					 "aeqe:%#x Not handled\n", type);
 			break;
 		default:
-			dev_warn(&rcfw->pdev->dev, "QPLIB: creqe with ");
-			dev_warn(&rcfw->pdev->dev,
-				 "QPLIB: op_event = 0x%x not handled", type);
+			if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT)
+				dev_warn(&rcfw->pdev->dev,
+					 "creqe with event 0x%x not handled\n",
+					 type);
 			break;
 		}
 		raw_cons++;
@@ -551,7 +564,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
 				      BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE,
 				      HWQ_TYPE_L2_CMPL)) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: HW channel CREQ allocation failed");
+			"HW channel CREQ allocation failed\n");
 		goto fail;
 	}
 	rcfw->cmdq.max_elements = BNXT_QPLIB_CMDQE_MAX_CNT;
@@ -560,7 +573,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
 				      BNXT_QPLIB_CMDQE_UNITS, 0, PAGE_SIZE,
 				      HWQ_TYPE_CTX)) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: HW channel CMDQ allocation failed");
+			"HW channel CMDQ allocation failed\n");
 		goto fail;
 	}
 
@@ -605,21 +618,18 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 
 	bnxt_qplib_rcfw_stop_irq(rcfw, true);
 
-	if (rcfw->cmdq_bar_reg_iomem)
-		iounmap(rcfw->cmdq_bar_reg_iomem);
-	rcfw->cmdq_bar_reg_iomem = NULL;
-
-	if (rcfw->creq_bar_reg_iomem)
-		iounmap(rcfw->creq_bar_reg_iomem);
-	rcfw->creq_bar_reg_iomem = NULL;
+	iounmap(rcfw->cmdq_bar_reg_iomem);
+	iounmap(rcfw->creq_bar_reg_iomem);
 
 	indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size);
 	if (indx != rcfw->bmap_size)
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: disabling RCFW with pending cmd-bit %lx", indx);
+			"disabling RCFW with pending cmd-bit %lx\n", indx);
 	kfree(rcfw->cmdq_bitmap);
 	rcfw->bmap_size = 0;
 
+	rcfw->cmdq_bar_reg_iomem = NULL;
+	rcfw->creq_bar_reg_iomem = NULL;
 	rcfw->aeq_handler = NULL;
 	rcfw->vector = 0;
 }
@@ -681,8 +691,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
 					      RCFW_COMM_BASE_OFFSET,
 					      RCFW_COMM_SIZE);
 	if (!rcfw->cmdq_bar_reg_iomem) {
-		dev_err(&rcfw->pdev->dev,
-			"QPLIB: CMDQ BAR region %d mapping failed",
+		dev_err(&rcfw->pdev->dev, "CMDQ BAR region %d mapping failed\n",
 			rcfw->cmdq_bar_reg);
 		return -ENOMEM;
 	}
@@ -697,14 +706,15 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
 	res_base = pci_resource_start(pdev, rcfw->creq_bar_reg);
 	if (!res_base)
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: CREQ BAR region %d resc start is 0!",
+			"CREQ BAR region %d resc start is 0!\n",
 			rcfw->creq_bar_reg);
 	rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off,
 						   4);
 	if (!rcfw->creq_bar_reg_iomem) {
-		dev_err(&rcfw->pdev->dev,
-			"QPLIB: CREQ BAR region %d mapping failed",
+		dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n",
 			rcfw->creq_bar_reg);
+		iounmap(rcfw->cmdq_bar_reg_iomem);
+		rcfw->cmdq_bar_reg_iomem = NULL;
 		return -ENOMEM;
 	}
 	rcfw->creq_qp_event_processed = 0;
@@ -717,7 +727,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
 	rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_vector, true);
 	if (rc) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: Failed to request IRQ for CREQ rc = 0x%x", rc);
+			"Failed to request IRQ for CREQ rc = 0x%x\n", rc);
 		bnxt_qplib_disable_rcfw_channel(rcfw);
 		return rc;
 	}

+ 4 - 0
drivers/infiniband/hw/bnxt_re/qplib_rcfw.h

@@ -154,6 +154,8 @@ struct bnxt_qplib_qp_node {
 	void *qp_handle;        /* ptr to qplib_qp */
 };
 
+#define BNXT_QPLIB_OOS_COUNT_MASK 0xFFFFFFFF
+
 /* RCFW Communication Channels */
 struct bnxt_qplib_rcfw {
 	struct pci_dev		*pdev;
@@ -190,6 +192,8 @@ struct bnxt_qplib_rcfw {
 	struct bnxt_qplib_crsq	*crsqe_tbl;
 	int qp_tbl_size;
 	struct bnxt_qplib_qp_node *qp_tbl;
+	u64 oos_prev;
+	u32 init_oos_stats;
 };
 
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);

+ 14 - 15
drivers/infiniband/hw/bnxt_re/qplib_res.c

@@ -36,6 +36,8 @@
  * Description: QPLib resource manager
  */
 
+#define dev_fmt(fmt) "QPLIB: " fmt
+
 #include <linux/spinlock.h>
 #include <linux/pci.h>
 #include <linux/interrupt.h>
@@ -68,8 +70,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
 						  pbl->pg_map_arr[i]);
 			else
 				dev_warn(&pdev->dev,
-					 "QPLIB: PBL free pg_arr[%d] empty?!",
-					 i);
+					 "PBL free pg_arr[%d] empty?!\n", i);
 			pbl->pg_arr[i] = NULL;
 		}
 	}
@@ -537,7 +538,7 @@ static void bnxt_qplib_free_pkey_tbl(struct bnxt_qplib_res *res,
 				     struct bnxt_qplib_pkey_tbl *pkey_tbl)
 {
 	if (!pkey_tbl->tbl)
-		dev_dbg(&res->pdev->dev, "QPLIB: PKEY tbl not present");
+		dev_dbg(&res->pdev->dev, "PKEY tbl not present\n");
 	else
 		kfree(pkey_tbl->tbl);
 
@@ -578,7 +579,7 @@ int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res,
 			  struct bnxt_qplib_pd *pd)
 {
 	if (test_and_set_bit(pd->id, pdt->tbl)) {
-		dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d",
+		dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d\n",
 			 pd->id);
 		return -EINVAL;
 	}
@@ -639,11 +640,11 @@ int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res,
 			   struct bnxt_qplib_dpi     *dpi)
 {
 	if (dpi->dpi >= dpit->max) {
-		dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d", dpi->dpi);
+		dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d\n", dpi->dpi);
 		return -EINVAL;
 	}
 	if (test_and_set_bit(dpi->dpi, dpit->tbl)) {
-		dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d",
+		dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d\n",
 			 dpi->dpi);
 		return -EINVAL;
 	}
@@ -673,22 +674,21 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res     *res,
 	u32 dbr_len, bytes;
 
 	if (dpit->dbr_bar_reg_iomem) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: DBR BAR region %d already mapped", dbr_bar_reg);
+		dev_err(&res->pdev->dev, "DBR BAR region %d already mapped\n",
+			dbr_bar_reg);
 		return -EALREADY;
 	}
 
 	bar_reg_base = pci_resource_start(res->pdev, dbr_bar_reg);
 	if (!bar_reg_base) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: BAR region %d resc start failed", dbr_bar_reg);
+		dev_err(&res->pdev->dev, "BAR region %d resc start failed\n",
+			dbr_bar_reg);
 		return -ENOMEM;
 	}
 
 	dbr_len = pci_resource_len(res->pdev, dbr_bar_reg) - dbr_offset;
 	if (!dbr_len || ((dbr_len & (PAGE_SIZE - 1)) != 0)) {
-		dev_err(&res->pdev->dev, "QPLIB: Invalid DBR length %d",
-			dbr_len);
+		dev_err(&res->pdev->dev, "Invalid DBR length %d\n", dbr_len);
 		return -ENOMEM;
 	}
 
@@ -696,8 +696,7 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res     *res,
 						  dbr_len);
 	if (!dpit->dbr_bar_reg_iomem) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: FP: DBR BAR region %d mapping failed",
-			dbr_bar_reg);
+			"FP: DBR BAR region %d mapping failed\n", dbr_bar_reg);
 		return -ENOMEM;
 	}
 
@@ -767,7 +766,7 @@ static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev,
 	stats->dma = dma_alloc_coherent(&pdev->dev, stats->size,
 					&stats->dma_map, GFP_KERNEL);
 	if (!stats->dma) {
-		dev_err(&pdev->dev, "QPLIB: Stats DMA allocation failed");
+		dev_err(&pdev->dev, "Stats DMA allocation failed\n");
 		return -ENOMEM;
 	}
 	return 0;

+ 45 - 32
drivers/infiniband/hw/bnxt_re/qplib_sp.c

@@ -36,6 +36,8 @@
  * Description: Slow Path Operators
  */
 
+#define dev_fmt(fmt) "QPLIB: " fmt
+
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
@@ -89,7 +91,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
 	sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
 	if (!sbuf) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: SP: QUERY_FUNC alloc side buffer failed");
+			"SP: QUERY_FUNC alloc side buffer failed\n");
 		return -ENOMEM;
 	}
 
@@ -135,8 +137,16 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
 	attr->max_srq = le16_to_cpu(sb->max_srq);
 	attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
 	attr->max_srq_sges = sb->max_srq_sge;
-	/* Bono only reports 1 PKEY for now, but it can support > 1 */
 	attr->max_pkey = le32_to_cpu(sb->max_pkeys);
+	/*
+	 * Some versions of FW report more than 0xFFFF.
+	 * Restrict it for now to 0xFFFF to avoid
+	 * reporting a truncated value.
+	 */
+	if (attr->max_pkey > 0xFFFF) {
+		/* ib_port_attr::pkey_tbl_len is u16 */
+		attr->max_pkey = 0xFFFF;
+	}
 
 	attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
 	attr->l2_db_size = (sb->l2_db_space_size + 1) *
@@ -186,8 +196,7 @@ int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res,
 					  (void *)&resp,
 					  NULL, 0);
 	if (rc) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: Failed to set function resources");
+		dev_err(&res->pdev->dev, "Failed to set function resources\n");
 	}
 	return rc;
 }
@@ -199,7 +208,7 @@ int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
 {
 	if (index >= sgid_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: Index %d exceeded SGID table max (%d)",
+			"Index %d exceeded SGID table max (%d)\n",
 			index, sgid_tbl->max);
 		return -EINVAL;
 	}
@@ -217,13 +226,12 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 	int index;
 
 	if (!sgid_tbl) {
-		dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated");
+		dev_err(&res->pdev->dev, "SGID table not allocated\n");
 		return -EINVAL;
 	}
 	/* Do we need a sgid_lock here? */
 	if (!sgid_tbl->active) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: SGID table has no active entries");
+		dev_err(&res->pdev->dev, "SGID table has no active entries\n");
 		return -ENOMEM;
 	}
 	for (index = 0; index < sgid_tbl->max; index++) {
@@ -231,7 +239,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 			break;
 	}
 	if (index == sgid_tbl->max) {
-		dev_warn(&res->pdev->dev, "GID not found in the SGID table");
+		dev_warn(&res->pdev->dev, "GID not found in the SGID table\n");
 		return 0;
 	}
 	/* Remove GID from the SGID table */
@@ -244,7 +252,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 		RCFW_CMD_PREP(req, DELETE_GID, cmd_flags);
 		if (sgid_tbl->hw_id[index] == 0xFFFF) {
 			dev_err(&res->pdev->dev,
-				"QPLIB: GID entry contains an invalid HW id");
+				"GID entry contains an invalid HW id\n");
 			return -EINVAL;
 		}
 		req.gid_index = cpu_to_le16(sgid_tbl->hw_id[index]);
@@ -258,7 +266,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 	sgid_tbl->vlan[index] = 0;
 	sgid_tbl->active--;
 	dev_dbg(&res->pdev->dev,
-		"QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x",
+		"SGID deleted hw_id[0x%x] = 0x%x active = 0x%x\n",
 		 index, sgid_tbl->hw_id[index], sgid_tbl->active);
 	sgid_tbl->hw_id[index] = (u16)-1;
 
@@ -277,20 +285,19 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 	int i, free_idx;
 
 	if (!sgid_tbl) {
-		dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated");
+		dev_err(&res->pdev->dev, "SGID table not allocated\n");
 		return -EINVAL;
 	}
 	/* Do we need a sgid_lock here? */
 	if (sgid_tbl->active == sgid_tbl->max) {
-		dev_err(&res->pdev->dev, "QPLIB: SGID table is full");
+		dev_err(&res->pdev->dev, "SGID table is full\n");
 		return -ENOMEM;
 	}
 	free_idx = sgid_tbl->max;
 	for (i = 0; i < sgid_tbl->max; i++) {
 		if (!memcmp(&sgid_tbl->tbl[i], gid, sizeof(*gid))) {
 			dev_dbg(&res->pdev->dev,
-				"QPLIB: SGID entry already exist in entry %d!",
-				i);
+				"SGID entry already exist in entry %d!\n", i);
 			*index = i;
 			return -EALREADY;
 		} else if (!memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero,
@@ -301,7 +308,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 	}
 	if (free_idx == sgid_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: SGID table is FULL but count is not MAX??");
+			"SGID table is FULL but count is not MAX??\n");
 		return -ENOMEM;
 	}
 	if (update) {
@@ -348,7 +355,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 		sgid_tbl->vlan[free_idx] = 1;
 
 	dev_dbg(&res->pdev->dev,
-		"QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x",
+		"SGID added hw_id[0x%x] = 0x%x active = 0x%x\n",
 		 free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active);
 
 	*index = free_idx;
@@ -404,7 +411,7 @@ int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res,
 	}
 	if (index >= pkey_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: Index %d exceeded PKEY table max (%d)",
+			"Index %d exceeded PKEY table max (%d)\n",
 			index, pkey_tbl->max);
 		return -EINVAL;
 	}
@@ -419,14 +426,13 @@ int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res,
 	int i, rc = 0;
 
 	if (!pkey_tbl) {
-		dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated");
+		dev_err(&res->pdev->dev, "PKEY table not allocated\n");
 		return -EINVAL;
 	}
 
 	/* Do we need a pkey_lock here? */
 	if (!pkey_tbl->active) {
-		dev_err(&res->pdev->dev,
-			"QPLIB: PKEY table has no active entries");
+		dev_err(&res->pdev->dev, "PKEY table has no active entries\n");
 		return -ENOMEM;
 	}
 	for (i = 0; i < pkey_tbl->max; i++) {
@@ -435,8 +441,7 @@ int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res,
 	}
 	if (i == pkey_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: PKEY 0x%04x not found in the pkey table",
-			*pkey);
+			"PKEY 0x%04x not found in the pkey table\n", *pkey);
 		return -ENOMEM;
 	}
 	memset(&pkey_tbl->tbl[i], 0, sizeof(*pkey));
@@ -453,13 +458,13 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res,
 	int i, free_idx, rc = 0;
 
 	if (!pkey_tbl) {
-		dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated");
+		dev_err(&res->pdev->dev, "PKEY table not allocated\n");
 		return -EINVAL;
 	}
 
 	/* Do we need a pkey_lock here? */
 	if (pkey_tbl->active == pkey_tbl->max) {
-		dev_err(&res->pdev->dev, "QPLIB: PKEY table is full");
+		dev_err(&res->pdev->dev, "PKEY table is full\n");
 		return -ENOMEM;
 	}
 	free_idx = pkey_tbl->max;
@@ -471,7 +476,7 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res,
 	}
 	if (free_idx == pkey_tbl->max) {
 		dev_err(&res->pdev->dev,
-			"QPLIB: PKEY table is FULL but count is not MAX??");
+			"PKEY table is FULL but count is not MAX??\n");
 		return -ENOMEM;
 	}
 	/* Add PKEY to the pkey_tbl */
@@ -555,8 +560,7 @@ int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw)
 	int rc;
 
 	if (mrw->lkey == 0xFFFFFFFF) {
-		dev_info(&res->pdev->dev,
-			 "QPLIB: SP: Free a reserved lkey MRW");
+		dev_info(&res->pdev->dev, "SP: Free a reserved lkey MRW\n");
 		return 0;
 	}
 
@@ -666,9 +670,8 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 			pages++;
 
 		if (pages > MAX_PBL_LVL_1_PGS) {
-			dev_err(&res->pdev->dev, "QPLIB: SP: Reg MR pages ");
 			dev_err(&res->pdev->dev,
-				"requested (0x%x) exceeded max (0x%x)",
+				"SP: Reg MR pages requested (0x%x) exceeded max (0x%x)\n",
 				pages, MAX_PBL_LVL_1_PGS);
 			return -ENOMEM;
 		}
@@ -684,7 +687,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 					       HWQ_TYPE_CTX);
 		if (rc) {
 			dev_err(&res->pdev->dev,
-				"SP: Reg MR memory allocation failed");
+				"SP: Reg MR memory allocation failed\n");
 			return -ENOMEM;
 		}
 		/* Write to the hwq */
@@ -795,7 +798,7 @@ int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw,
 	sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
 	if (!sbuf) {
 		dev_err(&rcfw->pdev->dev,
-			"QPLIB: SP: QUERY_ROCE_STATS alloc side buffer failed");
+			"SP: QUERY_ROCE_STATS alloc side buffer failed\n");
 		return -ENOMEM;
 	}
 
@@ -845,6 +848,16 @@ int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw,
 	stats->res_srq_load_err = le64_to_cpu(sb->res_srq_load_err);
 	stats->res_tx_pci_err = le64_to_cpu(sb->res_tx_pci_err);
 	stats->res_rx_pci_err = le64_to_cpu(sb->res_rx_pci_err);
+	if (!rcfw->init_oos_stats) {
+		rcfw->oos_prev = le64_to_cpu(sb->res_oos_drop_count);
+		rcfw->init_oos_stats = 1;
+	} else {
+		stats->res_oos_drop_count +=
+				(le64_to_cpu(sb->res_oos_drop_count) -
+				 rcfw->oos_prev) & BNXT_QPLIB_OOS_COUNT_MASK;
+		rcfw->oos_prev = le64_to_cpu(sb->res_oos_drop_count);
+	}
+
 bail:
 	bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf);
 	return rc;
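
The hunk above treats the firmware's out-of-sequence drop count as a free-running counter: the first read only records a baseline in oos_prev, and later reads add the masked delta to the 64-bit software total so a 32-bit wrap in the raw value does not inflate the result. A userspace sketch of the same accumulation:

/*
 * Userspace sketch (not the driver code): wrap-safe delta accumulation of a
 * hardware counter that may roll over at 32 bits.
 */
#include <stdint.h>
#include <stdio.h>

#define OOS_COUNT_MASK 0xFFFFFFFFull

struct oos_stats {
	uint64_t total;		/* accumulated software counter */
	uint64_t prev;		/* last raw sample from "firmware" */
	int initialized;
};

static void oos_update(struct oos_stats *s, uint64_t raw)
{
	if (!s->initialized) {
		/* The first sample only establishes the baseline. */
		s->prev = raw;
		s->initialized = 1;
		return;
	}
	s->total += (raw - s->prev) & OOS_COUNT_MASK;
	s->prev = raw;
}

int main(void)
{
	struct oos_stats s = { 0 };

	oos_update(&s, 0xFFFFFFF0ull);	/* baseline */
	oos_update(&s, 0x00000010ull);	/* wrapped: delta is 0x20, not huge */
	printf("total = %llu\n", (unsigned long long)s.total);
	return 0;
}
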

+ 10 - 0
drivers/infiniband/hw/bnxt_re/qplib_sp.h

@@ -205,6 +205,16 @@ struct bnxt_qplib_roce_stats {
 	/* res_tx_pci_err is 64 b */
 	u64 res_rx_pci_err;
 	/* res_rx_pci_err is 64 b */
+	u64 res_oos_drop_count;
+	/* res_oos_drop_count */
+	u64     active_qp_count_p0;
+	/* port 0 active qps */
+	u64     active_qp_count_p1;
+	/* port 1 active qps */
+	u64     active_qp_count_p2;
+	/* port 2 active qps */
+	u64     active_qp_count_p3;
+	/* port 3 active qps */
 };
 
 int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,

+ 5 - 0
drivers/infiniband/hw/bnxt_re/roce_hsi.h

@@ -2929,6 +2929,11 @@ struct creq_query_roce_stats_resp_sb {
 	__le64	res_srq_load_err;
 	__le64	res_tx_pci_err;
 	__le64	res_rx_pci_err;
+	__le64  res_oos_drop_count;
+	__le64  active_qp_count_p0;
+	__le64  active_qp_count_p1;
+	__le64  active_qp_count_p2;
+	__le64  active_qp_count_p3;
 };
 
 /* QP error notification event (16 bytes) */

+ 20 - 35
drivers/infiniband/hw/cxgb3/iwch_provider.c

@@ -1127,17 +1127,18 @@ static int iwch_query_port(struct ib_device *ibdev,
 	return 0;
 }
 
-static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
 {
 	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
 						 ibdev.dev);
 	pr_debug("%s dev 0x%p\n", __func__, dev);
 	return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
 	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
 						 ibdev.dev);
@@ -1148,9 +1149,10 @@ static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
 	lldev->ethtool_ops->get_drvinfo(lldev, &info);
 	return sprintf(buf, "%s\n", info.driver);
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_board(struct device *dev, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
 	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
 						 ibdev.dev);
@@ -1158,6 +1160,7 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
 	return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor,
 		       iwch_dev->rdev.rnic_info.pdev->device);
 }
+static DEVICE_ATTR_RO(board_id);
 
 enum counters {
 	IPINRECEIVES,
@@ -1274,14 +1277,15 @@ static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats,
 	return stats->num_counters;
 }
 
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+static struct attribute *iwch_class_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *iwch_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id,
+static const struct attribute_group iwch_attr_group = {
+	.attrs = iwch_class_attributes,
 };
 
 static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num,
@@ -1316,10 +1320,8 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str)
 int iwch_register_device(struct iwch_dev *dev)
 {
 	int ret;
-	int i;
 
 	pr_debug("%s iwch_dev %p\n", __func__, dev);
-	strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
 	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
 	memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
 	dev->ibdev.owner = THIS_MODULE;
@@ -1402,33 +1404,16 @@ int iwch_register_device(struct iwch_dev *dev)
 	       sizeof(dev->ibdev.iwcm->ifname));
 
 	dev->ibdev.driver_id = RDMA_DRIVER_CXGB3;
-	ret = ib_register_device(&dev->ibdev, NULL);
+	rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
+	ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL);
 	if (ret)
-		goto bail1;
-
-	for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) {
-		ret = device_create_file(&dev->ibdev.dev,
-					 iwch_class_attributes[i]);
-		if (ret) {
-			goto bail2;
-		}
-	}
-	return 0;
-bail2:
-	ib_unregister_device(&dev->ibdev);
-bail1:
-	kfree(dev->ibdev.iwcm);
+		kfree(dev->ibdev.iwcm);
 	return ret;
 }
 
 void iwch_unregister_device(struct iwch_dev *dev)
 {
-	int i;
-
 	pr_debug("%s iwch_dev %p\n", __func__, dev);
-	for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i)
-		device_remove_file(&dev->ibdev.dev,
-				   iwch_class_attributes[i]);
 	ib_unregister_device(&dev->ibdev);
 	kfree(dev->ibdev.iwcm);
 	return;

+ 1 - 2
drivers/infiniband/hw/cxgb4/cm.c

@@ -403,8 +403,7 @@ void _c4iw_free_ep(struct kref *kref)
 				 ep->com.local_addr.ss_family);
 		dst_release(ep->dst);
 		cxgb4_l2t_release(ep->l2t);
-		if (ep->mpa_skb)
-			kfree_skb(ep->mpa_skb);
+		kfree_skb(ep->mpa_skb);
 	}
 	if (!skb_queue_empty(&ep->com.ep_skb_list))
 		skb_queue_purge(&ep->com.ep_skb_list);

+ 1 - 1
drivers/infiniband/hw/cxgb4/cq.c

@@ -161,7 +161,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
 	cq->gts = rdev->lldi.gts_reg;
 	cq->rdev = rdev;
 
-	cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, T4_BAR2_QTYPE_INGRESS,
+	cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, CXGB4_BAR2_QTYPE_INGRESS,
 				      &cq->bar2_qid,
 				      user ? &cq->bar2_pa : NULL);
 	if (user && !cq->bar2_pa) {

+ 20 - 30
drivers/infiniband/hw/cxgb4/provider.c

@@ -373,8 +373,8 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port,
 	return 0;
 }
 
-static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
 {
 	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
 						 ibdev.dev);
@@ -382,9 +382,10 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
 	return sprintf(buf, "%d\n",
 		       CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
 	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
 						 ibdev.dev);
@@ -395,9 +396,10 @@ static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
 	lldev->ethtool_ops->get_drvinfo(lldev, &info);
 	return sprintf(buf, "%s\n", info.driver);
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_board(struct device *dev, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
 {
 	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
 						 ibdev.dev);
@@ -405,6 +407,7 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
 	return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
 		       c4iw_dev->rdev.lldi.pdev->device);
 }
+static DEVICE_ATTR_RO(board_id);
 
 enum counters {
 	IP4INSEGS,
@@ -461,14 +464,15 @@ static int c4iw_get_mib(struct ib_device *ibdev,
 	return stats->num_counters;
 }
 
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+static struct attribute *c4iw_class_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *c4iw_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id,
+static const struct attribute_group c4iw_attr_group = {
+	.attrs = c4iw_class_attributes,
 };
 
 static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
@@ -530,12 +534,10 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
 void c4iw_register_device(struct work_struct *work)
 {
 	int ret;
-	int i;
 	struct uld_ctx *ctx = container_of(work, struct uld_ctx, reg_work);
 	struct c4iw_dev *dev = ctx->dev;
 
 	pr_debug("c4iw_dev %p\n", dev);
-	strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX);
 	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
 	memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
 	dev->ibdev.owner = THIS_MODULE;
@@ -626,20 +628,13 @@ void c4iw_register_device(struct work_struct *work)
 	memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name,
 	       sizeof(dev->ibdev.iwcm->ifname));
 
+	rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
 	dev->ibdev.driver_id = RDMA_DRIVER_CXGB4;
-	ret = ib_register_device(&dev->ibdev, NULL);
+	ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL);
 	if (ret)
 		goto err_kfree_iwcm;
-
-	for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) {
-		ret = device_create_file(&dev->ibdev.dev,
-					 c4iw_class_attributes[i]);
-		if (ret)
-			goto err_unregister_device;
-	}
 	return;
-err_unregister_device:
-	ib_unregister_device(&dev->ibdev);
+
 err_kfree_iwcm:
 	kfree(dev->ibdev.iwcm);
 err_dealloc_ctx:
@@ -651,12 +646,7 @@ err_dealloc_ctx:
 
 void c4iw_unregister_device(struct c4iw_dev *dev)
 {
-	int i;
-
 	pr_debug("c4iw_dev %p\n", dev);
-	for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i)
-		device_remove_file(&dev->ibdev.dev,
-				   c4iw_class_attributes[i]);
 	ib_unregister_device(&dev->ibdev);
 	kfree(dev->ibdev.iwcm);
 	return;

+ 5 - 5
drivers/infiniband/hw/cxgb4/qp.c

@@ -279,12 +279,13 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
 
 	wq->db = rdev->lldi.db_reg;
 
-	wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid, T4_BAR2_QTYPE_EGRESS,
+	wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid,
+					 CXGB4_BAR2_QTYPE_EGRESS,
 					 &wq->sq.bar2_qid,
 					 user ? &wq->sq.bar2_pa : NULL);
 	if (need_rq)
 		wq->rq.bar2_va = c4iw_bar2_addrs(rdev, wq->rq.qid,
-						 T4_BAR2_QTYPE_EGRESS,
+						 CXGB4_BAR2_QTYPE_EGRESS,
 						 &wq->rq.bar2_qid,
 						 user ? &wq->rq.bar2_pa : NULL);
 
@@ -2572,7 +2573,7 @@ static int alloc_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx,
 	memset(wq->queue, 0, wq->memsize);
 	dma_unmap_addr_set(wq, mapping, wq->dma_addr);
 
-	wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, T4_BAR2_QTYPE_EGRESS,
+	wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, CXGB4_BAR2_QTYPE_EGRESS,
 				      &wq->bar2_qid,
 			user ? &wq->bar2_pa : NULL);
 
@@ -2813,8 +2814,7 @@ err_free_queue:
 	free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
 		       srq->wr_waitp);
 err_free_skb:
-	if (srq->destroy_skb)
-		kfree_skb(srq->destroy_skb);
+	kfree_skb(srq->destroy_skb);
 err_free_srq_idx:
 	c4iw_free_srq_idx(&rhp->rdev, srq->idx);
 err_free_wr_wait:

+ 36 - 6
drivers/infiniband/hw/hfi1/Makefile

@@ -8,12 +8,42 @@
 #
 obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 
-hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
-	eprom.o exp_rcv.o file_ops.o firmware.o \
-	init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
-	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
-	uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
-	verbs_txreq.o vnic_main.o vnic_sdma.o
+hfi1-y := \
+	affinity.o \
+	chip.o \
+	device.o \
+	driver.o \
+	efivar.o \
+	eprom.o \
+	exp_rcv.o \
+	file_ops.o \
+	firmware.o \
+	init.o \
+	intr.o \
+	iowait.o \
+	mad.o \
+	mmu_rb.o \
+	msix.o \
+	pcie.o \
+	pio.o \
+	pio_copy.o \
+	platform.o \
+	qp.o \
+	qsfp.o \
+	rc.o \
+	ruc.o \
+	sdma.o \
+	sysfs.o \
+	trace.o \
+	uc.o \
+	ud.o \
+	user_exp_rcv.o \
+	user_pages.o \
+	user_sdma.o \
+	verbs.o \
+	verbs_txreq.o \
+	vnic_main.o \
+	vnic_sdma.o
 
 ifdef CONFIG_DEBUG_FS
 hfi1-y += debugfs.o

+ 2 - 2
drivers/infiniband/hw/hfi1/affinity.c

@@ -817,10 +817,10 @@ static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
 	set = &entry->def_intr;
 	cpumask_set_cpu(cpu, &set->mask);
 	cpumask_set_cpu(cpu, &set->used);
-	for (i = 0; i < dd->num_msix_entries; i++) {
+	for (i = 0; i < dd->msix_info.max_requested; i++) {
 		struct hfi1_msix_entry *other_msix;
 
-		other_msix = &dd->msix_entries[i];
+		other_msix = &dd->msix_info.msix_entries[i];
 		if (other_msix->type != IRQ_SDMA || other_msix == msix)
 			continue;
 

+ 134 - 352
drivers/infiniband/hw/hfi1/chip.c

@@ -67,8 +67,6 @@
 #include "debugfs.h"
 #include "fault.h"
 
-#define NUM_IB_PORTS 1
-
 uint kdeth_qp;
 module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
 MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
@@ -1100,9 +1098,9 @@ struct err_reg_info {
 	const char *desc;
 };
 
-#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
-#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
-#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END + 1 - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END + 1 - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END + 1 - IS_VARIOUS_START)
 
 /*
  * Helpers for building HFI and DC error interrupt table entries.  Different
@@ -8181,7 +8179,7 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
 /**
  * is_rcv_urgent_int() - User receive context urgent IRQ handler
  * @dd: valid dd
- * @source: logical IRQ source (ofse from IS_RCVURGENT_START)
+ * @source: logical IRQ source (offset from IS_RCVURGENT_START)
  *
  * RX block receive urgent interrupt.  Source is < 160.
  *
@@ -8231,7 +8229,7 @@ static const struct is_table is_table[] = {
 				is_sdma_eng_err_name,	is_sdma_eng_err_int },
 { IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
 				is_sendctxt_err_name,	is_sendctxt_err_int },
-{ IS_SDMA_START,	     IS_SDMA_END,
+{ IS_SDMA_START,	     IS_SDMA_IDLE_END,
 				is_sdma_eng_name,	is_sdma_eng_int },
 { IS_VARIOUS_START,	     IS_VARIOUS_END,
 				is_various_name,	is_various_int },
@@ -8257,7 +8255,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
 
 	/* avoids a double compare by walking the table in-order */
 	for (entry = &is_table[0]; entry->is_name; entry++) {
-		if (source < entry->end) {
+		if (source <= entry->end) {
 			trace_hfi1_interrupt(dd, entry, source);
 			entry->is_int(dd, source - entry->start);
 			return;
@@ -8276,7 +8274,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
  * context DATA IRQs are threaded and are not supported by this handler.
  *
  */
-static irqreturn_t general_interrupt(int irq, void *data)
+irqreturn_t general_interrupt(int irq, void *data)
 {
 	struct hfi1_devdata *dd = data;
 	u64 regs[CCE_NUM_INT_CSRS];
@@ -8309,7 +8307,7 @@ static irqreturn_t general_interrupt(int irq, void *data)
 	return handled;
 }
 
-static irqreturn_t sdma_interrupt(int irq, void *data)
+irqreturn_t sdma_interrupt(int irq, void *data)
 {
 	struct sdma_engine *sde = data;
 	struct hfi1_devdata *dd = sde->dd;
@@ -8401,7 +8399,7 @@ static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
  * invoked) is finished.  The intent is to avoid extra interrupts while we
  * are processing packets anyway.
  */
-static irqreturn_t receive_context_interrupt(int irq, void *data)
+irqreturn_t receive_context_interrupt(int irq, void *data)
 {
 	struct hfi1_ctxtdata *rcd = data;
 	struct hfi1_devdata *dd = rcd->dd;
@@ -8441,7 +8439,7 @@ static irqreturn_t receive_context_interrupt(int irq, void *data)
  * Receive packet thread handler.  This expects to be invoked with the
  * receive interrupt still blocked.
  */
-static irqreturn_t receive_context_thread(int irq, void *data)
+irqreturn_t receive_context_thread(int irq, void *data)
 {
 	struct hfi1_ctxtdata *rcd = data;
 	int present;
@@ -9651,30 +9649,10 @@ void qsfp_event(struct work_struct *work)
 	}
 }
 
-static void init_qsfp_int(struct hfi1_devdata *dd)
+void init_qsfp_int(struct hfi1_devdata *dd)
 {
 	struct hfi1_pportdata *ppd = dd->pport;
-	u64 qsfp_mask, cce_int_mask;
-	const int qsfp1_int_smask = QSFP1_INT % 64;
-	const int qsfp2_int_smask = QSFP2_INT % 64;
-
-	/*
-	 * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
-	 * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
-	 * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
-	 * the index of the appropriate CSR in the CCEIntMask CSR array
-	 */
-	cce_int_mask = read_csr(dd, CCE_INT_MASK +
-				(8 * (QSFP1_INT / 64)));
-	if (dd->hfi1_id) {
-		cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
-		write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
-			  cce_int_mask);
-	} else {
-		cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
-		write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
-			  cce_int_mask);
-	}
+	u64 qsfp_mask;
 
 	qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
 	/* Clear current status to avoid spurious interrupts */
@@ -9691,6 +9669,12 @@ static void init_qsfp_int(struct hfi1_devdata *dd)
 	write_csr(dd,
 		  dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
 		  qsfp_mask);
+
+	/* Enable the appropriate QSFP IRQ source */
+	if (!dd->hfi1_id)
+		set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true);
+	else
+		set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true);
 }
 
 /*
@@ -10577,12 +10561,29 @@ void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
 	}
 }
 
-/*
- * Verify if BCT for data VLs is non-zero.
+/**
+ * data_vls_operational() - Verify if data VL BCT credits and MTU
+ *			    are both set.
+ * @ppd: pointer to hfi1_pportdata structure
+ *
+ * Return: true - Ok, false - otherwise.
  */
 static inline bool data_vls_operational(struct hfi1_pportdata *ppd)
 {
-	return !!ppd->actual_vls_operational;
+	int i;
+	u64 reg;
+
+	if (!ppd->actual_vls_operational)
+		return false;
+
+	for (i = 0; i < ppd->vls_supported; i++) {
+		reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i));
+		if ((reg && !ppd->dd->vld[i].mtu) ||
+		    (!reg && ppd->dd->vld[i].mtu))
+			return false;
+	}
+
+	return true;
 }
 
 /*
@@ -10695,7 +10696,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 
 		if (!data_vls_operational(ppd)) {
 			dd_dev_err(dd,
-				   "%s: data VLs not operational\n", __func__);
+				   "%s: Invalid data VL credits or mtu\n",
+				   __func__);
 			ret = -EINVAL;
 			break;
 		}
@@ -11932,10 +11934,16 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op,
 
 		rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
 	}
-	if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+	if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) {
+		set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
+			      IS_RCVAVAIL_START + rcd->ctxt, true);
 		rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
-	if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+	}
+	if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) {
+		set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
+			      IS_RCVAVAIL_START + rcd->ctxt, false);
 		rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+	}
 	if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr)
 		rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
 	if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
@@ -11965,6 +11973,13 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op,
 		rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
 	if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
 		rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+	if (op & HFI1_RCVCTRL_URGENT_ENB)
+		set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+			      IS_RCVURGENT_START + rcd->ctxt, true);
+	if (op & HFI1_RCVCTRL_URGENT_DIS)
+		set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+			      IS_RCVURGENT_START + rcd->ctxt, false);
+
 	hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
 	write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl);
 
@@ -12963,63 +12978,71 @@ int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
 	return ret;
 }
 
+/* ========================================================================= */
+
 /**
- * get_int_mask - get 64 bit int mask
- * @dd - the devdata
- * @i - the csr (relative to CCE_INT_MASK)
+ * read_mod_write() - Calculate the IRQ register index and set/clear the bits
+ * @dd: valid devdata
+ * @src: IRQ source to determine register index from
+ * @bits: the bits to set or clear
+ * @set: true == set the bits, false == clear the bits
  *
- * Returns the mask with the urgent interrupt mask
- * bit clear for kernel receive contexts.
  */
-static u64 get_int_mask(struct hfi1_devdata *dd, u32 i)
+static void read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits,
+			   bool set)
 {
-	u64 mask = U64_MAX; /* default to no change */
-
-	if (i >= (IS_RCVURGENT_START / 64) && i < (IS_RCVURGENT_END / 64)) {
-		int j = (i - (IS_RCVURGENT_START / 64)) * 64;
-		int k = !j ? IS_RCVURGENT_START % 64 : 0;
+	u64 reg;
+	u16 idx = src / BITS_PER_REGISTER;
 
-		if (j)
-			j -= IS_RCVURGENT_START % 64;
-		/* j = 0..dd->first_dyn_alloc_ctxt - 1,k = 0..63 */
-		for (; j < dd->first_dyn_alloc_ctxt && k < 64; j++, k++)
-			/* convert to bit in mask and clear */
-			mask &= ~BIT_ULL(k);
-	}
-	return mask;
+	spin_lock(&dd->irq_src_lock);
+	reg = read_csr(dd, CCE_INT_MASK + (8 * idx));
+	if (set)
+		reg |= bits;
+	else
+		reg &= ~bits;
+	write_csr(dd, CCE_INT_MASK + (8 * idx), reg);
+	spin_unlock(&dd->irq_src_lock);
 }
 
-/* ========================================================================= */
-
-/*
- * Enable/disable chip from delivering interrupts.
+/**
+ * set_intr_bits() - Enable/disable a range (one or more) IRQ sources
+ * @dd: valid devdata
+ * @first: first IRQ source to set/clear
+ * @last: last IRQ source (inclusive) to set/clear
+ * @set: true == set the bits, false == clear the bits
+ *
+ * If first == last, set the exact source.
  */
-void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set)
 {
-	int i;
+	u64 bits = 0;
+	u64 bit;
+	u16 src;
 
-	/*
-	 * In HFI, the mask needs to be 1 to allow interrupts.
-	 */
-	if (enable) {
-		/* enable all interrupts but urgent on kernel contexts */
-		for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
-			u64 mask = get_int_mask(dd, i);
+	if (first > NUM_INTERRUPT_SOURCES || last > NUM_INTERRUPT_SOURCES)
+		return -EINVAL;
 
-			write_csr(dd, CCE_INT_MASK + (8 * i), mask);
-		}
+	if (last < first)
+		return -ERANGE;
 
-		init_qsfp_int(dd);
-	} else {
-		for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-			write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+	for (src = first; src <= last; src++) {
+		bit = src % BITS_PER_REGISTER;
+		/* wrapped to next register? */
+		if (!bit && bits) {
+			read_mod_write(dd, src - 1, bits, set);
+			bits = 0;
+		}
+		bits |= BIT_ULL(bit);
 	}
+	read_mod_write(dd, last, bits, set);
+
+	return 0;
 }
 
 /*
  * Clear all interrupt sources on the chip.
  */
-static void clear_all_interrupts(struct hfi1_devdata *dd)
+void clear_all_interrupts(struct hfi1_devdata *dd)
 {
 	int i;
 
@@ -13043,38 +13066,11 @@ static void clear_all_interrupts(struct hfi1_devdata *dd)
 	write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
 }
 
-/**
- * hfi1_clean_up_interrupts() - Free all IRQ resources
- * @dd: valid device data data structure
- *
- * Free the MSIx and assoicated PCI resources, if they have been allocated.
- */
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd)
-{
-	int i;
-	struct hfi1_msix_entry *me = dd->msix_entries;
-
-	/* remove irqs - must happen before disabling/turning off */
-	for (i = 0; i < dd->num_msix_entries; i++, me++) {
-		if (!me->arg) /* => no irq, no affinity */
-			continue;
-		hfi1_put_irq_affinity(dd, me);
-		pci_free_irq(dd->pcidev, i, me->arg);
-	}
-
-	/* clean structures */
-	kfree(dd->msix_entries);
-	dd->msix_entries = NULL;
-	dd->num_msix_entries = 0;
-
-	pci_free_irq_vectors(dd->pcidev);
-}
-
 /*
  * Remap the interrupt source from the general handler to the given MSI-X
  * interrupt.
  */
-static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
 {
 	u64 reg;
 	int m, n;
@@ -13098,8 +13094,7 @@ static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
 	write_csr(dd, CCE_INT_MAP + (8 * m), reg);
 }
 
-static void remap_sdma_interrupts(struct hfi1_devdata *dd,
-				  int engine, int msix_intr)
+void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr)
 {
 	/*
 	 * SDMA engine interrupt sources grouped by type, rather than
@@ -13108,204 +13103,16 @@ static void remap_sdma_interrupts(struct hfi1_devdata *dd,
 	 *	SDMAProgress
 	 *	SDMAIdle
 	 */
-	remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-	remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-	remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-}
-
-static int request_msix_irqs(struct hfi1_devdata *dd)
-{
-	int first_general, last_general;
-	int first_sdma, last_sdma;
-	int first_rx, last_rx;
-	int i, ret = 0;
-
-	/* calculate the ranges we are going to use */
-	first_general = 0;
-	last_general = first_general + 1;
-	first_sdma = last_general;
-	last_sdma = first_sdma + dd->num_sdma;
-	first_rx = last_sdma;
-	last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts;
-
-	/* VNIC MSIx interrupts get mapped when VNIC contexts are created */
-	dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues;
-
-	/*
-	 * Sanity check - the code expects all SDMA chip source
-	 * interrupts to be in the same CSR, starting at bit 0.  Verify
-	 * that this is true by checking the bit location of the start.
-	 */
-	BUILD_BUG_ON(IS_SDMA_START % 64);
-
-	for (i = 0; i < dd->num_msix_entries; i++) {
-		struct hfi1_msix_entry *me = &dd->msix_entries[i];
-		const char *err_info;
-		irq_handler_t handler;
-		irq_handler_t thread = NULL;
-		void *arg = NULL;
-		int idx;
-		struct hfi1_ctxtdata *rcd = NULL;
-		struct sdma_engine *sde = NULL;
-		char name[MAX_NAME_SIZE];
-
-		/* obtain the arguments to pci_request_irq */
-		if (first_general <= i && i < last_general) {
-			idx = i - first_general;
-			handler = general_interrupt;
-			arg = dd;
-			snprintf(name, sizeof(name),
-				 DRIVER_NAME "_%d", dd->unit);
-			err_info = "general";
-			me->type = IRQ_GENERAL;
-		} else if (first_sdma <= i && i < last_sdma) {
-			idx = i - first_sdma;
-			sde = &dd->per_sdma[idx];
-			handler = sdma_interrupt;
-			arg = sde;
-			snprintf(name, sizeof(name),
-				 DRIVER_NAME "_%d sdma%d", dd->unit, idx);
-			err_info = "sdma";
-			remap_sdma_interrupts(dd, idx, i);
-			me->type = IRQ_SDMA;
-		} else if (first_rx <= i && i < last_rx) {
-			idx = i - first_rx;
-			rcd = hfi1_rcd_get_by_index_safe(dd, idx);
-			if (rcd) {
-				/*
-				 * Set the interrupt register and mask for this
-				 * context's interrupt.
-				 */
-				rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-				rcd->imask = ((u64)1) <<
-					  ((IS_RCVAVAIL_START + idx) % 64);
-				handler = receive_context_interrupt;
-				thread = receive_context_thread;
-				arg = rcd;
-				snprintf(name, sizeof(name),
-					 DRIVER_NAME "_%d kctxt%d",
-					 dd->unit, idx);
-				err_info = "receive context";
-				remap_intr(dd, IS_RCVAVAIL_START + idx, i);
-				me->type = IRQ_RCVCTXT;
-				rcd->msix_intr = i;
-				hfi1_rcd_put(rcd);
-			}
-		} else {
-			/* not in our expected range - complain, then
-			 * ignore it
-			 */
-			dd_dev_err(dd,
-				   "Unexpected extra MSI-X interrupt %d\n", i);
-			continue;
-		}
-		/* no argument, no interrupt */
-		if (!arg)
-			continue;
-		/* make sure the name is terminated */
-		name[sizeof(name) - 1] = 0;
-		me->irq = pci_irq_vector(dd->pcidev, i);
-		ret = pci_request_irq(dd->pcidev, i, handler, thread, arg,
-				      name);
-		if (ret) {
-			dd_dev_err(dd,
-				   "unable to allocate %s interrupt, irq %d, index %d, err %d\n",
-				   err_info, me->irq, idx, ret);
-			return ret;
-		}
-		/*
-		 * assign arg after pci_request_irq call, so it will be
-		 * cleaned up
-		 */
-		me->arg = arg;
-
-		ret = hfi1_get_irq_affinity(dd, me);
-		if (ret)
-			dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
-	}
-
-	return ret;
-}
-
-void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd)
-{
-	int i;
-
-	for (i = 0; i < dd->vnic.num_ctxt; i++) {
-		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
-		struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
-
-		synchronize_irq(me->irq);
-	}
-}
-
-void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd)
-{
-	struct hfi1_devdata *dd = rcd->dd;
-	struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
-
-	if (!me->arg) /* => no irq, no affinity */
-		return;
-
-	hfi1_put_irq_affinity(dd, me);
-	pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
-
-	me->arg = NULL;
-}
-
-void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd)
-{
-	struct hfi1_devdata *dd = rcd->dd;
-	struct hfi1_msix_entry *me;
-	int idx = rcd->ctxt;
-	void *arg = rcd;
-	int ret;
-
-	rcd->msix_intr = dd->vnic.msix_idx++;
-	me = &dd->msix_entries[rcd->msix_intr];
-
-	/*
-	 * Set the interrupt register and mask for this
-	 * context's interrupt.
-	 */
-	rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-	rcd->imask = ((u64)1) <<
-		  ((IS_RCVAVAIL_START + idx) % 64);
-	me->type = IRQ_RCVCTXT;
-	me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr);
-	remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr);
-
-	ret = pci_request_irq(dd->pcidev, rcd->msix_intr,
-			      receive_context_interrupt,
-			      receive_context_thread, arg,
-			      DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
-	if (ret) {
-		dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n",
-			   me->irq, idx, ret);
-		return;
-	}
-	/*
-	 * assign arg after pci_request_irq call, so it will be
-	 * cleaned up
-	 */
-	me->arg = arg;
-
-	ret = hfi1_get_irq_affinity(dd, me);
-	if (ret) {
-		dd_dev_err(dd,
-			   "unable to pin IRQ %d\n", ret);
-		pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
-	}
+	remap_intr(dd, IS_SDMA_START + engine, msix_intr);
+	remap_intr(dd, IS_SDMA_PROGRESS_START + engine, msix_intr);
+	remap_intr(dd, IS_SDMA_IDLE_START + engine, msix_intr);
 }
 
 /*
  * Set the general handler to accept all interrupts, remap all
  * chip interrupts back to MSI-X 0.
  */
-static void reset_interrupts(struct hfi1_devdata *dd)
+void reset_interrupts(struct hfi1_devdata *dd)
 {
 	int i;
 
@@ -13318,54 +13125,33 @@ static void reset_interrupts(struct hfi1_devdata *dd)
 		write_csr(dd, CCE_INT_MAP + (8 * i), 0);
 }
 
+/**
+ * set_up_interrupts() - Initialize the IRQ resources and state
+ * @dd: valid devdata
+ *
+ */
 static int set_up_interrupts(struct hfi1_devdata *dd)
 {
-	u32 total;
-	int ret, request;
-
-	/*
-	 * Interrupt count:
-	 *	1 general, "slow path" interrupt (includes the SDMA engines
-	 *		slow source, SDMACleanupDone)
-	 *	N interrupts - one per used SDMA engine
-	 *	M interrupt - one per kernel receive context
-	 *	V interrupt - one for each VNIC context
-	 */
-	total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
-
-	/* ask for MSI-X interrupts */
-	request = request_msix(dd, total);
-	if (request < 0) {
-		ret = request;
-		goto fail;
-	} else {
-		dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries),
-					   GFP_KERNEL);
-		if (!dd->msix_entries) {
-			ret = -ENOMEM;
-			goto fail;
-		}
-		/* using MSI-X */
-		dd->num_msix_entries = total;
-		dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
-	}
+	int ret;
 
 	/* mask all interrupts */
-	set_intr_state(dd, 0);
+	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
+
 	/* clear all pending interrupts */
 	clear_all_interrupts(dd);
 
 	/* reset general handler mask, chip MSI-X mappings */
 	reset_interrupts(dd);
 
-	ret = request_msix_irqs(dd);
+	/* ask for MSI-X interrupts */
+	ret = msix_initialize(dd);
 	if (ret)
-		goto fail;
+		return ret;
 
-	return 0;
+	ret = msix_request_irqs(dd);
+	if (ret)
+		msix_clean_up_interrupts(dd);
 
-fail:
-	hfi1_clean_up_interrupts(dd);
 	return ret;
 }
 
@@ -14918,20 +14704,16 @@ err_exit:
 }
 
 /**
- * Allocate and initialize the device structure for the hfi.
+ * hfi1_init_dd() - Initialize most of the dd structure.
  * @dev: the pci_dev for hfi1_ib device
  * @ent: pci_device_id struct for this dev
  *
- * Also allocates, initializes, and returns the devdata struct for this
- * device instance
- *
  * This is global, and is called directly at init to set up the
  * chip-specific function pointers for later use.
  */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
-				  const struct pci_device_id *ent)
+int hfi1_init_dd(struct hfi1_devdata *dd)
 {
-	struct hfi1_devdata *dd;
+	struct pci_dev *pdev = dd->pcidev;
 	struct hfi1_pportdata *ppd;
 	u64 reg;
 	int i, ret;
@@ -14942,13 +14724,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 		"Functional simulator"
 	};
 	struct pci_dev *parent = pdev->bus->self;
-	u32 sdma_engines;
+	u32 sdma_engines = chip_sdma_engines(dd);
 
-	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
-				sizeof(struct hfi1_pportdata));
-	if (IS_ERR(dd))
-		goto bail;
-	sdma_engines = chip_sdma_engines(dd);
 	ppd = dd->pport;
 	for (i = 0; i < dd->num_pports; i++, ppd++) {
 		int vl;
@@ -15127,6 +14904,12 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 	if (ret)
 		goto bail_cleanup;
 
+	/*
+	 * This should probably occur in hfi1_pcie_init(), but historically
+	 * occurs after the do_pcie_gen3_transition() code.
+	 */
+	tune_pcie_caps(dd);
+
 	/* start setting dd values and adjusting CSRs */
 	init_early_variables(dd);
 
@@ -15239,14 +15022,13 @@ bail_free_cntrs:
 	free_cntrs(dd);
 bail_clear_intr:
 	hfi1_comp_vectors_clean_up(dd);
-	hfi1_clean_up_interrupts(dd);
+	msix_clean_up_interrupts(dd);
 bail_cleanup:
 	hfi1_pcie_ddcleanup(dd);
 bail_free:
 	hfi1_free_devdata(dd);
-	dd = ERR_PTR(ret);
 bail:
-	return dd;
+	return ret;
 }
 
 static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,

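The chip.c rework above replaces the all-or-nothing set_intr_state() with set_intr_bits(), which enables or disables an inclusive range of interrupt sources and lets read_mod_write() batch every bit that lands in the same 64-bit CCE_INT_MASK register into a single read-modify-write. A minimal userspace sketch of that batching logic, assuming the mask CSRs can be modeled as a plain array (register count, names, and the omitted locking are illustrative only):

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_REG	64
#define NUM_SOURCES	768
#define NUM_REGS	(NUM_SOURCES / BITS_PER_REG)

static uint64_t mask_csr[NUM_REGS];	/* stand-in for CCE_INT_MASK */

/* Read-modify-write the register that holds interrupt source @src. */
static void read_mod_write(uint16_t src, uint64_t bits, int set)
{
	uint16_t idx = src / BITS_PER_REG;

	if (set)
		mask_csr[idx] |= bits;
	else
		mask_csr[idx] &= ~bits;
}

/* Enable/disable the inclusive source range [first, last]. */
static int set_intr_bits(uint16_t first, uint16_t last, int set)
{
	uint64_t bits = 0;
	uint16_t src;

	if (first >= NUM_SOURCES || last >= NUM_SOURCES || last < first)
		return -1;

	for (src = first; src <= last; src++) {
		uint64_t bit = src % BITS_PER_REG;

		/* wrapped into the next register: flush what was batched */
		if (!bit && bits) {
			read_mod_write(src - 1, bits, set);
			bits = 0;
		}
		bits |= 1ULL << bit;
	}
	read_mod_write(last, bits, set);
	return 0;
}

int main(void)
{
	set_intr_bits(60, 70, 1);	/* range spans two mask registers */
	printf("reg0=%#llx reg1=%#llx\n",
	       (unsigned long long)mask_csr[0],
	       (unsigned long long)mask_csr[1]);
	return 0;
}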
+ 48 - 23
drivers/infiniband/hw/hfi1/chip.h

@@ -52,9 +52,7 @@
  */
 
 /* sizes */
-#define CCE_NUM_MSIX_VECTORS 256
-#define CCE_NUM_INT_CSRS 12
-#define CCE_NUM_INT_MAP_CSRS 96
+#define BITS_PER_REGISTER (BITS_PER_BYTE * sizeof(u64))
 #define NUM_INTERRUPT_SOURCES 768
 #define RXE_NUM_CONTEXTS 160
 #define RXE_PER_CONTEXT_SIZE 0x1000	/* 4k */
@@ -161,34 +159,49 @@
 	(CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
 	CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
 
-/* interrupt source numbers */
-#define IS_GENERAL_ERR_START	  0
-#define IS_SDMAENG_ERR_START	 16
-#define IS_SENDCTXT_ERR_START	 32
-#define IS_SDMA_START		192 /* includes SDmaProgress,SDmaIdle */
+/* Specific IRQ sources */
+#define CCE_ERR_INT		  0
+#define RXE_ERR_INT		  1
+#define MISC_ERR_INT		  2
+#define PIO_ERR_INT		  4
+#define SDMA_ERR_INT		  5
+#define EGRESS_ERR_INT		  6
+#define TXE_ERR_INT		  7
+#define PBC_INT			240
+#define GPIO_ASSERT_INT		241
+#define QSFP1_INT		242
+#define QSFP2_INT		243
+#define TCRIT_INT		244
+
+/* interrupt source ranges */
+#define IS_FIRST_SOURCE		CCE_ERR_INT
+#define IS_GENERAL_ERR_START		  0
+#define IS_SDMAENG_ERR_START		 16
+#define IS_SENDCTXT_ERR_START		 32
+#define IS_SDMA_START			192
+#define IS_SDMA_PROGRESS_START		208
+#define IS_SDMA_IDLE_START		224
 #define IS_VARIOUS_START		240
 #define IS_DC_START			248
 #define IS_RCVAVAIL_START		256
 #define IS_RCVURGENT_START		416
 #define IS_SENDCREDIT_START		576
 #define IS_RESERVED_START		736
-#define IS_MAX_SOURCES		768
+#define IS_LAST_SOURCE			767
 
 /* derived interrupt source values */
-#define IS_GENERAL_ERR_END		IS_SDMAENG_ERR_START
-#define IS_SDMAENG_ERR_END		IS_SENDCTXT_ERR_START
-#define IS_SENDCTXT_ERR_END		IS_SDMA_START
-#define IS_SDMA_END			IS_VARIOUS_START
-#define IS_VARIOUS_END		IS_DC_START
-#define IS_DC_END			IS_RCVAVAIL_START
-#define IS_RCVAVAIL_END		IS_RCVURGENT_START
-#define IS_RCVURGENT_END		IS_SENDCREDIT_START
-#define IS_SENDCREDIT_END		IS_RESERVED_START
-#define IS_RESERVED_END		IS_MAX_SOURCES
-
-/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
-#define QSFP1_INT		242
-#define QSFP2_INT		243
+#define IS_GENERAL_ERR_END		7
+#define IS_SDMAENG_ERR_END		31
+#define IS_SENDCTXT_ERR_END		191
+#define IS_SDMA_END                     207
+#define IS_SDMA_PROGRESS_END            223
+#define IS_SDMA_IDLE_END		239
+#define IS_VARIOUS_END			244
+#define IS_DC_END			255
+#define IS_RCVAVAIL_END			415
+#define IS_RCVURGENT_END		575
+#define IS_SENDCREDIT_END		735
+#define IS_RESERVED_END			IS_LAST_SOURCE
 
 /* DCC_CFG_PORT_CONFIG logical link states */
 #define LSTATE_DOWN    0x1
@@ -1416,6 +1429,18 @@ void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
 void hfi1_init_vnic_rsm(struct hfi1_devdata *dd);
 void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd);
 
+irqreturn_t general_interrupt(int irq, void *data);
+irqreturn_t sdma_interrupt(int irq, void *data);
+irqreturn_t receive_context_interrupt(int irq, void *data);
+irqreturn_t receive_context_thread(int irq, void *data);
+
+int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set);
+void init_qsfp_int(struct hfi1_devdata *dd);
+void clear_all_interrupts(struct hfi1_devdata *dd);
+void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr);
+void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr);
+void reset_interrupts(struct hfi1_devdata *dd);
+
 /*
  * Interrupt source table.
  *

+ 4 - 0
drivers/infiniband/hw/hfi1/chip_registers.h

@@ -878,6 +878,10 @@
 #define SEND_CTRL (TXE + 0x000000000000)
 #define SEND_CTRL_CM_RESET_SMASK 0x4ull
 #define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xFFull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+		<< SEND_CTRL_UNSUPPORTED_VL_SHIFT)
 #define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
 #define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
 #define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull

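The added SEND_CTRL_UNSUPPORTED_VL definitions follow the usual HFI1 register-field pattern: a right-justified MASK, a SHIFT, and an SMASK pre-shifted into register position. A hedged sketch of how such a triplet is typically used to insert and extract a field, with the register modeled as a local variable rather than a real CSR access:

#include <stdint.h>
#include <stdio.h>

#define UNSUPPORTED_VL_SHIFT	3
#define UNSUPPORTED_VL_MASK	0xFFull
#define UNSUPPORTED_VL_SMASK	(UNSUPPORTED_VL_MASK << UNSUPPORTED_VL_SHIFT)

int main(void)
{
	uint64_t reg = 0;

	/* insert a field value: clear the field, then OR in the new bits */
	reg &= ~UNSUPPORTED_VL_SMASK;
	reg |= (0xF0ull & UNSUPPORTED_VL_MASK) << UNSUPPORTED_VL_SHIFT;

	/* extract it again */
	printf("field=%#llx\n",
	       (unsigned long long)((reg & UNSUPPORTED_VL_SMASK) >>
				    UNSUPPORTED_VL_SHIFT));
	return 0;
}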
+ 3 - 1
drivers/infiniband/hw/hfi1/file_ops.c

@@ -681,7 +681,8 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 		     HFI1_RCVCTRL_TAILUPD_DIS |
 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
-		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
+		     HFI1_RCVCTRL_NO_EGR_DROP_DIS |
+		     HFI1_RCVCTRL_URGENT_DIS, uctxt);
 	/* Clear the context's J_KEY */
 	hfi1_clear_ctxt_jkey(dd, uctxt);
 	/*
@@ -1096,6 +1097,7 @@ static void user_init(struct hfi1_ctxtdata *uctxt)
 	hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey);
 
 	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+	rcvctrl_ops |= HFI1_RCVCTRL_URGENT_ENB;
 	if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
 		rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
 	/*

+ 20 - 28
drivers/infiniband/hw/hfi1/hfi.h

@@ -80,6 +80,7 @@
 #include "qsfp.h"
 #include "platform.h"
 #include "affinity.h"
+#include "msix.h"
 
 /* bumped 1 from s/w major version of TrueScale */
 #define HFI1_CHIP_VERS_MAJ 3U
@@ -620,6 +621,8 @@ struct rvt_sge_state;
 #define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
 #define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
 #define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+#define HFI1_RCVCTRL_URGENT_ENB 0x40000
+#define HFI1_RCVCTRL_URGENT_DIS 0x80000
 
 /* partition enforcement flags */
 #define HFI1_PART_ENFORCE_IN	0x1
@@ -667,6 +670,14 @@ struct hfi1_msix_entry {
 	struct irq_affinity_notify notify;
 };
 
+struct hfi1_msix_info {
+	/* lock to synchronize in_use_msix access */
+	spinlock_t msix_lock;
+	DECLARE_BITMAP(in_use_msix, CCE_NUM_MSIX_VECTORS);
+	struct hfi1_msix_entry *msix_entries;
+	u16 max_requested;
+};
+
 /* per-SL CCA information */
 struct cca_timer {
 	struct hrtimer hrtimer;
@@ -992,7 +1003,6 @@ struct hfi1_vnic_data {
 	struct idr vesw_idr;
 	u8 rmt_start;
 	u8 num_ctxt;
-	u32 msix_idx;
 };
 
 struct hfi1_vnic_vport_info;
@@ -1205,11 +1215,6 @@ struct hfi1_devdata {
 
 	struct diag_client *diag_client;
 
-	/* MSI-X information */
-	struct hfi1_msix_entry *msix_entries;
-	u32 num_msix_entries;
-	u32 first_dyn_msix_idx;
-
 	/* general interrupt: mask of handled interrupts */
 	u64 gi_mask[CCE_NUM_INT_CSRS];
 
@@ -1223,6 +1228,9 @@ struct hfi1_devdata {
 	 */
 	struct timer_list synth_stats_timer;
 
+	/* MSI-X information */
+	struct hfi1_msix_info msix_info;
+
 	/*
 	 * device counters
 	 */
@@ -1349,6 +1357,8 @@ struct hfi1_devdata {
 
 	/* vnic data */
 	struct hfi1_vnic_data vnic;
+	/* Lock to protect IRQ SRC register access */
+	spinlock_t irq_src_lock;
 };
 
 static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare)
@@ -1431,9 +1441,6 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread);
 void set_all_slowpath(struct hfi1_devdata *dd);
-void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd);
-void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd);
-void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd);
 
 extern const struct pci_device_id hfi1_pci_tbl[];
 void hfi1_make_ud_req_9B(struct rvt_qp *qp,
@@ -1887,10 +1894,8 @@ struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd)
 #define HFI1_CTXT_WAITING_URG 4
 
 /* free up any allocated data at closes */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
-				  const struct pci_device_id *ent);
+int hfi1_init_dd(struct hfi1_devdata *dd);
 void hfi1_free_devdata(struct hfi1_devdata *dd);
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
 
 /* LED beaconing functions */
 void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
@@ -1963,6 +1968,7 @@ static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
  */
 
 extern const char ib_hfi1_version[];
+extern const struct attribute_group ib_hfi1_attr_group;
 
 int hfi1_device_create(struct hfi1_devdata *dd);
 void hfi1_device_remove(struct hfi1_devdata *dd);
@@ -1974,16 +1980,15 @@ void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd);
 /* Hook for sysfs read of QSFP */
 int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
 
-int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent);
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd);
+int hfi1_pcie_init(struct hfi1_devdata *dd);
 void hfi1_pcie_cleanup(struct pci_dev *pdev);
 int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev);
 void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
 int pcie_speeds(struct hfi1_devdata *dd);
-int request_msix(struct hfi1_devdata *dd, u32 msireq);
 int restore_pci_variables(struct hfi1_devdata *dd);
 int save_pci_variables(struct hfi1_devdata *dd);
 int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+void tune_pcie_caps(struct hfi1_devdata *dd);
 int parse_platform_config(struct hfi1_devdata *dd);
 int get_platform_config_field(struct hfi1_devdata *dd,
 			      enum platform_config_table_type_encoding
@@ -2124,19 +2129,6 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
 	return base_sdma_integrity;
 }
 
-/*
- * hfi1_early_err is used (only!) to print early errors before devdata is
- * allocated, or when dd->pcidev may not be valid, and at the tail end of
- * cleanup when devdata may have been freed, etc.  hfi1_dev_porterr is
- * the same as dd_dev_err, but is used when the message really needs
- * the IB port# to be definitive as to what's happening..
- */
-#define hfi1_early_err(dev, fmt, ...) \
-	dev_err(dev, fmt, ##__VA_ARGS__)
-
-#define hfi1_early_info(dev, fmt, ...) \
-	dev_info(dev, fmt, ##__VA_ARGS__)
-
 #define dd_dev_emerg(dd, fmt, ...) \
 	dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
 		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__)

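The new struct hfi1_msix_info replaces the flat msix_entries/num_msix_entries bookkeeping: MSI-X vectors are now claimed from the in_use_msix bitmap under msix_lock, which is the scheme the new msix.c allocator builds on. A minimal userspace sketch of that allocate/free pattern, assuming a pthread mutex in place of the spinlock and a 64-vector table chosen purely for illustration:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_VECTORS	64	/* illustrative, not CCE_NUM_MSIX_VECTORS */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t in_use;		/* one bit per vector */

/* Find and claim the first free vector, or return -1 if exhausted. */
static int vector_alloc(void)
{
	int nr = -1;
	int i;

	pthread_mutex_lock(&lock);
	for (i = 0; i < MAX_VECTORS; i++) {
		if (!(in_use & (1ULL << i))) {
			in_use |= 1ULL << i;
			nr = i;
			break;
		}
	}
	pthread_mutex_unlock(&lock);
	return nr;
}

static void vector_free(int nr)
{
	if (nr < 0 || nr >= MAX_VECTORS)
		return;
	pthread_mutex_lock(&lock);
	in_use &= ~(1ULL << nr);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	int a = vector_alloc();	/* 0, the "general" slot */
	int b = vector_alloc();	/* 1 */

	vector_free(b);
	printf("a=%d next=%d\n", a, vector_alloc());	/* reuses slot 1 */
	return 0;
}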
+ 67 - 46
drivers/infiniband/hw/hfi1/init.c

@@ -83,6 +83,8 @@
 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
 #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
 
+#define NUM_IB_PORTS 1
+
 /*
  * Number of user receive contexts we are configured to use (to allow for more
  * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
@@ -654,9 +656,8 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
 	ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
 
 	if (loopback) {
-		hfi1_early_err(&pdev->dev,
-			       "Faking data partition 0x8001 in idx %u\n",
-			       !default_pkey_idx);
+		dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n",
+			   !default_pkey_idx);
 		ppd->pkeys[!default_pkey_idx] = 0x8001;
 	}
 
@@ -702,9 +703,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
 	return;
 
 bail:
-
-	hfi1_early_err(&pdev->dev,
-		       "Congestion Control Agent disabled for port %d\n", port);
+	dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
 }
 
 /*
@@ -832,6 +831,23 @@ wq_error:
 	return -ENOMEM;
 }
 
+/**
+ * enable_general_intr() - Enable the IRQs that will be handled by the
+ * general interrupt handler.
+ * @dd: valid devdata
+ *
+ */
+static void enable_general_intr(struct hfi1_devdata *dd)
+{
+	set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
+	set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
+	set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
+	set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
+	set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
+	set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
+	set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
+}
+
 /**
  * hfi1_init - do the actual initialization sequence on the chip
  * @dd: the hfi1_ib device
@@ -916,6 +932,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit)
 				   "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
 			ret = lastfail;
 		}
+		/* enable IRQ */
 		hfi1_rcd_put(rcd);
 	}
 
@@ -954,7 +971,8 @@ done:
 			HFI1_STATUS_INITTED;
 	if (!ret) {
 		/* enable all interrupts from the chip */
-		set_intr_state(dd, 1);
+		enable_general_intr(dd);
+		init_qsfp_int(dd);
 
 		/* chip is OK for user apps; mark it as initialized */
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -1051,9 +1069,9 @@ static void shutdown_device(struct hfi1_devdata *dd)
 	}
 	dd->flags &= ~HFI1_INITTED;
 
-	/* mask and clean up interrupts, but not errors */
-	set_intr_state(dd, 0);
-	hfi1_clean_up_interrupts(dd);
+	/* mask and clean up interrupts */
+	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
+	msix_clean_up_interrupts(dd);
 
 	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 		ppd = dd->pport + pidx;
@@ -1246,15 +1264,19 @@ void hfi1_free_devdata(struct hfi1_devdata *dd)
 	kobject_put(&dd->kobj);
 }
 
-/*
- * Allocate our primary per-unit data structure.  Must be done via verbs
- * allocator, because the verbs cleanup process both does cleanup and
- * free of the data structure.
+/**
+ * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
+ * @pdev: Valid PCI device
+ * @extra: How many bytes to alloc past the default
+ *
+ * Must be done via verbs allocator, because the verbs cleanup process
+ * both does cleanup and free of the data structure.
  * "extra" is for chip-specific data.
  *
  * Use the idr mechanism to get a unit number for this unit.
  */
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
+					       size_t extra)
 {
 	unsigned long flags;
 	struct hfi1_devdata *dd;
@@ -1287,8 +1309,8 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
 	idr_preload_end();
 
 	if (ret < 0) {
-		hfi1_early_err(&pdev->dev,
-			       "Could not allocate unit ID: error %d\n", -ret);
+		dev_err(&pdev->dev,
+			"Could not allocate unit ID: error %d\n", -ret);
 		goto bail;
 	}
 	rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
@@ -1309,6 +1331,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
 	spin_lock_init(&dd->pio_map_lock);
 	mutex_init(&dd->dc8051_lock);
 	init_waitqueue_head(&dd->event_queue);
+	spin_lock_init(&dd->irq_src_lock);
 
 	dd->int_counter = alloc_percpu(u64);
 	if (!dd->int_counter) {
@@ -1481,9 +1504,6 @@ static int __init hfi1_mod_init(void)
 	idr_init(&hfi1_unit_table);
 
 	hfi1_dbg_init();
-	ret = hfi1_wss_init();
-	if (ret < 0)
-		goto bail_wss;
 	ret = pci_register_driver(&hfi1_pci_driver);
 	if (ret < 0) {
 		pr_err("Unable to register driver: error %d\n", -ret);
@@ -1492,8 +1512,6 @@ static int __init hfi1_mod_init(void)
 	goto bail; /* all OK */
 
 bail_dev:
-	hfi1_wss_exit();
-bail_wss:
 	hfi1_dbg_exit();
 	idr_destroy(&hfi1_unit_table);
 	dev_cleanup();
@@ -1510,7 +1528,6 @@ static void __exit hfi1_mod_cleanup(void)
 {
 	pci_unregister_driver(&hfi1_pci_driver);
 	node_affinity_destroy_all();
-	hfi1_wss_exit();
 	hfi1_dbg_exit();
 
 	idr_destroy(&hfi1_unit_table);
@@ -1604,23 +1621,23 @@ static void postinit_cleanup(struct hfi1_devdata *dd)
 	hfi1_free_devdata(dd);
 }
 
-static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt)
+static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt)
 {
 	if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
-		hfi1_early_err(dev, "Receive header queue count too small\n");
+		dd_dev_err(dd, "Receive header queue count too small\n");
 		return -EINVAL;
 	}
 
 	if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
-		hfi1_early_err(dev,
-			       "Receive header queue count cannot be greater than %u\n",
-			       HFI1_MAX_HDRQ_EGRBUF_CNT);
+		dd_dev_err(dd,
+			   "Receive header queue count cannot be greater than %u\n",
+			   HFI1_MAX_HDRQ_EGRBUF_CNT);
 		return -EINVAL;
 	}
 
 	if (thecnt % HDRQ_INCREMENT) {
-		hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n",
-			       thecnt, HDRQ_INCREMENT);
+		dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n",
+			   thecnt, HDRQ_INCREMENT);
 		return -EINVAL;
 	}
 
@@ -1639,22 +1656,29 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Validate dev ids */
 	if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
 	      ent->device == PCI_DEVICE_ID_INTEL1)) {
-		hfi1_early_err(&pdev->dev,
-			       "Failing on unknown Intel deviceid 0x%x\n",
-			       ent->device);
+		dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
+			ent->device);
 		ret = -ENODEV;
 		goto bail;
 	}
 
+	/* Allocate the dd so we can get to work */
+	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
+				sizeof(struct hfi1_pportdata));
+	if (IS_ERR(dd)) {
+		ret = PTR_ERR(dd);
+		goto bail;
+	}
+
 	/* Validate some global module parameters */
-	ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt);
+	ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt);
 	if (ret)
 		goto bail;
 
 	/* use the encoding function as a sanitization check */
 	if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
-		hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
-			       hfi1_hdrq_entsize);
+		dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
+			   hfi1_hdrq_entsize);
 		ret = -EINVAL;
 		goto bail;
 	}
@@ -1676,10 +1700,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			clamp_val(eager_buffer_size,
 				  MIN_EAGER_BUFFER * 8,
 				  MAX_EAGER_BUFFER_TOTAL);
-		hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
-				eager_buffer_size);
+		dd_dev_info(dd, "Eager buffer size %u\n",
+			    eager_buffer_size);
 	} else {
-		hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+		dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
 		ret = -EINVAL;
 		goto bail;
 	}
@@ -1687,7 +1711,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* restrict value of hfi1_rcvarr_split */
 	hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
 
-	ret = hfi1_pcie_init(pdev, ent);
+	ret = hfi1_pcie_init(dd);
 	if (ret)
 		goto bail;
 
@@ -1695,12 +1719,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	 * Do device-specific initialization, function table setup, dd
 	 * allocation, etc.
 	 */
-	dd = hfi1_init_dd(pdev, ent);
-
-	if (IS_ERR(dd)) {
-		ret = PTR_ERR(dd);
+	ret = hfi1_init_dd(dd);
+	if (ret)
 		goto clean_bail; /* error already printed */
-	}
 
 	ret = create_workqueues(dd);
 	if (ret)
@@ -1731,7 +1752,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
 
 	if (initfail || ret) {
-		hfi1_clean_up_interrupts(dd);
+		msix_clean_up_interrupts(dd);
 		stop_timers(dd);
 		flush_workqueue(ib_wq);
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {

+ 94 - 0
drivers/infiniband/hw/hfi1/iowait.c

@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#include "iowait.h"
+#include "trace_iowait.h"
+
+void iowait_set_flag(struct iowait *wait, u32 flag)
+{
+	trace_hfi1_iowait_set(wait, flag);
+	set_bit(flag, &wait->flags);
+}
+
+bool iowait_flag_set(struct iowait *wait, u32 flag)
+{
+	return test_bit(flag, &wait->flags);
+}
+
+inline void iowait_clear_flag(struct iowait *wait, u32 flag)
+{
+	trace_hfi1_iowait_clear(wait, flag);
+	clear_bit(flag, &wait->flags);
+}
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @resume: wakeup function for no space
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+void iowait_init(struct iowait *wait, u32 tx_limit,
+		 void (*func)(struct work_struct *work),
+		 void (*tidfunc)(struct work_struct *work),
+		 int (*sleep)(struct sdma_engine *sde,
+			      struct iowait_work *wait,
+			      struct sdma_txreq *tx,
+			      uint seq,
+			      bool pkts_sent),
+		 void (*wakeup)(struct iowait *wait, int reason),
+		 void (*sdma_drained)(struct iowait *wait))
+{
+	int i;
+
+	wait->count = 0;
+	INIT_LIST_HEAD(&wait->list);
+	init_waitqueue_head(&wait->wait_dma);
+	init_waitqueue_head(&wait->wait_pio);
+	atomic_set(&wait->sdma_busy, 0);
+	atomic_set(&wait->pio_busy, 0);
+	wait->tx_limit = tx_limit;
+	wait->sleep = sleep;
+	wait->wakeup = wakeup;
+	wait->sdma_drained = sdma_drained;
+	wait->flags = 0;
+	for (i = 0; i < IOWAIT_SES; i++) {
+		wait->wait[i].iow = wait;
+		INIT_LIST_HEAD(&wait->wait[i].tx_head);
+		if (i == IOWAIT_IB_SE)
+			INIT_WORK(&wait->wait[i].iowork, func);
+		else
+			INIT_WORK(&wait->wait[i].iowork, tidfunc);
+	}
+}
+
+/**
+ * iowait_cancel_work - cancel all work in iowait
+ * @w: the iowait struct
+ */
+void iowait_cancel_work(struct iowait *w)
+{
+	cancel_work_sync(&iowait_get_ib_work(w)->iowork);
+	cancel_work_sync(&iowait_get_tid_work(w)->iowork);
+}
+
+/**
+ * iowait_set_work_flag - set work flag based on leg
+ * @w: the iowait_work struct
+ */
+int iowait_set_work_flag(struct iowait_work *w)
+{
+	if (w == &w->iow->wait[IOWAIT_IB_SE]) {
+		iowait_set_flag(w->iow, IOWAIT_PENDING_IB);
+		return IOWAIT_IB_SE;
+	}
+	iowait_set_flag(w->iow, IOWAIT_PENDING_TID);
+	return IOWAIT_TID_SE;
+}

+ 134 - 58
drivers/infiniband/hw/hfi1/iowait.h

@@ -1,7 +1,7 @@
 #ifndef _HFI1_IOWAIT_H
 #define _HFI1_IOWAIT_H
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -49,6 +49,7 @@
 
 #include <linux/list.h>
 #include <linux/workqueue.h>
+#include <linux/wait.h>
 #include <linux/sched.h>
 
 #include "sdma_txreq.h"
@@ -59,16 +60,47 @@
  */
 typedef void (*restart_t)(struct work_struct *work);
 
+#define IOWAIT_PENDING_IB  0x0
+#define IOWAIT_PENDING_TID 0x1
+
+/*
+ * A QP can have multiple Send Engines (SEs).
+ *
+ * The current use case is for supporting a TID RDMA
+ * packet build/xmit mechanism independent from verbs.
+ */
+#define IOWAIT_SES 2
+#define IOWAIT_IB_SE 0
+#define IOWAIT_TID_SE 1
+
 struct sdma_txreq;
 struct sdma_engine;
 /**
- * struct iowait - linkage for delayed progress/waiting
+ * @iowork: the work struct
+ * @tx_head: list of prebuilt packets
+ * @iow: the parent iowait structure
+ *
+ * This structure is the work item (process) specific
+ * details associated with the each of the two SEs of the
+ * QP.
+ *
+ * The workstruct and the queued TXs are unique to each
+ * SE.
+ */
+struct iowait;
+struct iowait_work {
+	struct work_struct iowork;
+	struct list_head tx_head;
+	struct iowait *iow;
+};
+
+/**
  * @list: used to add/insert into QP/PQ wait lists
- * @lock: uses to record the list head lock
  * @tx_head: overflow list of sdma_txreq's
  * @sleep: no space callback
  * @wakeup: space callback wakeup
  * @sdma_drained: sdma count drained
+ * @lock: lock protected head of wait queue
  * @iowork: workqueue overhead
  * @wait_dma: wait for sdma_busy == 0
  * @wait_pio: wait for pio_busy == 0
@@ -76,6 +108,8 @@ struct sdma_engine;
  * @count: total number of descriptors in tx_head'ed list
  * @tx_limit: limit for overflow queuing
  * @tx_count: number of tx entry's in tx_head'ed list
+ * @flags: wait flags (one per QP)
+ * @wait: SE array
  *
  * This is to be embedded in user's state structure
  * (QP or PQ).
@@ -98,13 +132,11 @@ struct sdma_engine;
  * Waiters explicity know that, but the destroy
  * code that unwaits QPs does not.
  */
-
 struct iowait {
 	struct list_head list;
-	struct list_head tx_head;
 	int (*sleep)(
 		struct sdma_engine *sde,
-		struct iowait *wait,
+		struct iowait_work *wait,
 		struct sdma_txreq *tx,
 		uint seq,
 		bool pkts_sent
@@ -112,7 +144,6 @@ struct iowait {
 	void (*wakeup)(struct iowait *wait, int reason);
 	void (*sdma_drained)(struct iowait *wait);
 	seqlock_t *lock;
-	struct work_struct iowork;
 	wait_queue_head_t wait_dma;
 	wait_queue_head_t wait_pio;
 	atomic_t sdma_busy;
@@ -121,63 +152,37 @@ struct iowait {
 	u32 tx_limit;
 	u32 tx_count;
 	u8 starved_cnt;
+	unsigned long flags;
+	struct iowait_work wait[IOWAIT_SES];
 };
 
 #define SDMA_AVAIL_REASON 0
 
-/**
- * iowait_init() - initialize wait structure
- * @wait: wait struct to initialize
- * @tx_limit: limit for overflow queuing
- * @func: restart function for workqueue
- * @sleep: sleep function for no space
- * @resume: wakeup function for no space
- *
- * This function initializes the iowait
- * structure embedded in the QP or PQ.
- *
- */
+void iowait_set_flag(struct iowait *wait, u32 flag);
+bool iowait_flag_set(struct iowait *wait, u32 flag);
+void iowait_clear_flag(struct iowait *wait, u32 flag);
 
-static inline void iowait_init(
-	struct iowait *wait,
-	u32 tx_limit,
-	void (*func)(struct work_struct *work),
-	int (*sleep)(
-		struct sdma_engine *sde,
-		struct iowait *wait,
-		struct sdma_txreq *tx,
-		uint seq,
-		bool pkts_sent),
-	void (*wakeup)(struct iowait *wait, int reason),
-	void (*sdma_drained)(struct iowait *wait))
-{
-	wait->count = 0;
-	wait->lock = NULL;
-	INIT_LIST_HEAD(&wait->list);
-	INIT_LIST_HEAD(&wait->tx_head);
-	INIT_WORK(&wait->iowork, func);
-	init_waitqueue_head(&wait->wait_dma);
-	init_waitqueue_head(&wait->wait_pio);
-	atomic_set(&wait->sdma_busy, 0);
-	atomic_set(&wait->pio_busy, 0);
-	wait->tx_limit = tx_limit;
-	wait->sleep = sleep;
-	wait->wakeup = wakeup;
-	wait->sdma_drained = sdma_drained;
-}
+void iowait_init(struct iowait *wait, u32 tx_limit,
+		 void (*func)(struct work_struct *work),
+		 void (*tidfunc)(struct work_struct *work),
+		 int (*sleep)(struct sdma_engine *sde,
+			      struct iowait_work *wait,
+			      struct sdma_txreq *tx,
+			      uint seq,
+			      bool pkts_sent),
+		 void (*wakeup)(struct iowait *wait, int reason),
+		 void (*sdma_drained)(struct iowait *wait));
 
 /**
- * iowait_schedule() - initialize wait structure
+ * iowait_schedule() - schedule the default send engine work
  * @wait: wait struct to schedule
  * @wq: workqueue for schedule
  * @cpu: cpu
  */
-static inline void iowait_schedule(
-	struct iowait *wait,
-	struct workqueue_struct *wq,
-	int cpu)
+static inline bool iowait_schedule(struct iowait *wait,
+				   struct workqueue_struct *wq, int cpu)
 {
-	queue_work_on(cpu, wq, &wait->iowork);
+	return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork);
 }
 
 /**
@@ -228,6 +233,8 @@ static inline void iowait_sdma_add(struct iowait *wait, int count)
  */
 static inline int iowait_sdma_dec(struct iowait *wait)
 {
+	if (!wait)
+		return 0;
 	return atomic_dec_and_test(&wait->sdma_busy);
 }
 
@@ -267,11 +274,13 @@ static inline void iowait_pio_inc(struct iowait *wait)
 }
 
 /**
- * iowait_sdma_dec - note pio complete
+ * iowait_pio_dec - note pio complete
  * @wait: iowait structure
  */
 static inline int iowait_pio_dec(struct iowait *wait)
 {
+	if (!wait)
+		return 0;
 	return atomic_dec_and_test(&wait->pio_busy);
 }
 
@@ -293,9 +302,9 @@ static inline void iowait_drain_wakeup(struct iowait *wait)
 /**
  * iowait_get_txhead() - get packet off of iowait list
  *
- * @wait wait struture
+ * @wait: iowait_work structure
  */
-static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
+static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait)
 {
 	struct sdma_txreq *tx = NULL;
 
@@ -309,6 +318,28 @@ static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
 	return tx;
 }
 
+static inline u16 iowait_get_desc(struct iowait_work *w)
+{
+	u16 num_desc = 0;
+	struct sdma_txreq *tx = NULL;
+
+	if (!list_empty(&w->tx_head)) {
+		tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+				      list);
+		num_desc = tx->num_desc;
+	}
+	return num_desc;
+}
+
+static inline u32 iowait_get_all_desc(struct iowait *w)
+{
+	u32 num_desc = 0;
+
+	num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]);
+	num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]);
+	return num_desc;
+}
+
 /**
  * iowait_queue - Put the iowait on a wait queue
  * @pkts_sent: have some packets been sent before queuing?
@@ -372,12 +403,57 @@ static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
 }
 
 /**
- * iowait_packet_queued() - determine if a packet is already built
- * @wait: the wait structure
+ * iowait_packet_queued() - determine if a packet is queued
+ * @wait: the iowait_work structure
  */
-static inline bool iowait_packet_queued(struct iowait *wait)
+static inline bool iowait_packet_queued(struct iowait_work *wait)
 {
 	return !list_empty(&wait->tx_head);
 }
 
+/**
+ * iowait_inc_wait_count - increment wait counts
+ * @w: the iowait_work struct
+ * @n: the count
+ */
+static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n)
+{
+	if (!w)
+		return;
+	w->iow->tx_count++;
+	w->iow->count += n;
+}
+
+/**
+ * iowait_get_tid_work - return iowait_work for tid SE
+ * @w: the iowait struct
+ */
+static inline struct iowait_work *iowait_get_tid_work(struct iowait *w)
+{
+	return &w->wait[IOWAIT_TID_SE];
+}
+
+/**
+ * iowait_get_ib_work - return iowait_work for ib SE
+ * @w: the iowait struct
+ */
+static inline struct iowait_work *iowait_get_ib_work(struct iowait *w)
+{
+	return &w->wait[IOWAIT_IB_SE];
+}
+
+/**
+ * iowait_ioww_to_iow - return iowait given iowait_work
+ * @w: the iowait_work struct
+ */
+static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w)
+{
+	if (likely(w))
+		return w->iow;
+	return NULL;
+}
+
+void iowait_cancel_work(struct iowait *w);
+int iowait_set_work_flag(struct iowait_work *w);
+
 #endif

+ 2 - 2
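The iowait rework above gives each QP two send engines: struct iowait now embeds an array of iowait_work items (IB and TID), each carrying its own work item and tx list, and iowait_set_work_flag() decides which engine a work item belongs to by comparing its address against the parent's array slots. A hedged userspace sketch of that layout and pointer comparison, with the kernel work_struct and atomic flag helpers reduced to plain fields:

#include <stdio.h>

#define IOWAIT_SES	2
#define IOWAIT_IB_SE	0
#define IOWAIT_TID_SE	1

#define PENDING_IB	(1u << 0)
#define PENDING_TID	(1u << 1)

struct iowait;

struct iowait_work {
	struct iowait *iow;	/* back-pointer to the parent */
	/* in the driver this also carries a work_struct and a tx list */
};

struct iowait {
	unsigned int flags;
	struct iowait_work wait[IOWAIT_SES];
};

/* Decide which send engine @w is and mark it pending on the parent. */
static int set_work_flag(struct iowait_work *w)
{
	if (w == &w->iow->wait[IOWAIT_IB_SE]) {
		w->iow->flags |= PENDING_IB;
		return IOWAIT_IB_SE;
	}
	w->iow->flags |= PENDING_TID;
	return IOWAIT_TID_SE;
}

int main(void)
{
	struct iowait iow = { 0 };
	int i, se;

	for (i = 0; i < IOWAIT_SES; i++)
		iow.wait[i].iow = &iow;

	se = set_work_flag(&iow.wait[IOWAIT_TID_SE]);
	printf("se=%d flags=%#x\n", se, iow.flags);	/* se=1 flags=0x2 */
	return 0;
}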
drivers/infiniband/hw/hfi1/mad.c

@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015-2017 Intel Corporation.
+ * Copyright(c) 2015-2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -4836,7 +4836,7 @@ static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
 	int ret;
 	int pkey_idx;
 	int local_mad = 0;
-	u32 resp_len = 0;
+	u32 resp_len = in_wc->byte_len - sizeof(*in_grh);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 
 	pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);

+ 363 - 0
drivers/infiniband/hw/hfi1/msix.c

@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "affinity.h"
+#include "sdma.h"
+
+/**
+ * msix_initialize() - Calculate, request and configure MSIx IRQs
+ * @dd: valid hfi1 devdata
+ *
+ */
+int msix_initialize(struct hfi1_devdata *dd)
+{
+	u32 total;
+	int ret;
+	struct hfi1_msix_entry *entries;
+
+	/*
+	 * MSIx interrupt count:
+	 *	one for the general, "slow path" interrupt
+	 *	one per used SDMA engine
+	 *	one per kernel receive context
+	 *	one for each VNIC context
+	 *      ...any new IRQs should be added here.
+	 */
+	total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
+
+	if (total >= CCE_NUM_MSIX_VECTORS)
+		return -EINVAL;
+
+	ret = pci_alloc_irq_vectors(dd->pcidev, total, total, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", ret);
+		return ret;
+	}
+
+	entries = kcalloc(total, sizeof(*dd->msix_info.msix_entries),
+			  GFP_KERNEL);
+	if (!entries) {
+		pci_free_irq_vectors(dd->pcidev);
+		return -ENOMEM;
+	}
+
+	dd->msix_info.msix_entries = entries;
+	spin_lock_init(&dd->msix_info.msix_lock);
+	bitmap_zero(dd->msix_info.in_use_msix, total);
+	dd->msix_info.max_requested = total;
+	dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+
+	return 0;
+}
+
+/**
+ * msix_request_irq() - Allocate a free MSIx IRQ
+ * @dd: valid devdata
+ * @arg: context information for the IRQ
+ * @handler: IRQ handler
+ * @thread: IRQ thread handler (could be NULL)
+ * @idx: zero-based index if multiple devices are needed
+ * @type: affinity IRQ type
+ *
+ * Allocates an MSIx vector if available, and then creates the appropriate
+ * metadata needed to keep track of the PCI IRQ request.
+ *
+ * Return:
+ *   < 0   Error
+ *   >= 0  MSIx vector
+ *
+ */
+static int msix_request_irq(struct hfi1_devdata *dd, void *arg,
+			    irq_handler_t handler, irq_handler_t thread,
+			    u32 idx, enum irq_type type)
+{
+	unsigned long nr;
+	int irq;
+	int ret;
+	const char *err_info;
+	char name[MAX_NAME_SIZE];
+	struct hfi1_msix_entry *me;
+
+	/* Allocate an MSIx vector */
+	spin_lock(&dd->msix_info.msix_lock);
+	nr = find_first_zero_bit(dd->msix_info.in_use_msix,
+				 dd->msix_info.max_requested);
+	if (nr < dd->msix_info.max_requested)
+		__set_bit(nr, dd->msix_info.in_use_msix);
+	spin_unlock(&dd->msix_info.msix_lock);
+
+	if (nr == dd->msix_info.max_requested)
+		return -ENOSPC;
+
+	/* Specific verification and determine the name */
+	switch (type) {
+	case IRQ_GENERAL:
+		/* general interrupt must be MSIx vector 0 */
+		if (nr) {
+			spin_lock(&dd->msix_info.msix_lock);
+			__clear_bit(nr, dd->msix_info.in_use_msix);
+			spin_unlock(&dd->msix_info.msix_lock);
+			dd_dev_err(dd, "Invalid index %lu for GENERAL IRQ\n",
+				   nr);
+			return -EINVAL;
+		}
+		snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit);
+		err_info = "general";
+		break;
+	case IRQ_SDMA:
+		snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d",
+			 dd->unit, idx);
+		err_info = "sdma";
+		break;
+	case IRQ_RCVCTXT:
+		snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d",
+			 dd->unit, idx);
+		err_info = "receive context";
+		break;
+	case IRQ_OTHER:
+	default:
+		return -EINVAL;
+	}
+	name[sizeof(name) - 1] = 0;
+
+	irq = pci_irq_vector(dd->pcidev, nr);
+	ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name);
+	if (ret) {
+		dd_dev_err(dd,
+			   "%s: request for IRQ %d failed, MSIx %d, err %d\n",
+			   err_info, irq, idx, ret);
+		spin_lock(&dd->msix_info.msix_lock);
+		__clear_bit(nr, dd->msix_info.in_use_msix);
+		spin_unlock(&dd->msix_info.msix_lock);
+		return ret;
+	}
+
+	/*
+	 * assign arg after pci_request_irq call, so it will be
+	 * cleaned up
+	 */
+	me = &dd->msix_info.msix_entries[nr];
+	me->irq = irq;
+	me->arg = arg;
+	me->type = type;
+
+	/* This is a request, so a failure is not fatal */
+	ret = hfi1_get_irq_affinity(dd, me);
+	if (ret)
+		dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
+
+	return nr;
+}
+
+/**
+ * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs
+ * @rcd: valid rcd context
+ *
+ */
+int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd)
+{
+	int nr;
+
+	nr = msix_request_irq(rcd->dd, rcd, receive_context_interrupt,
+			      receive_context_thread, rcd->ctxt, IRQ_RCVCTXT);
+	if (nr < 0)
+		return nr;
+
+	/*
+	 * Set the interrupt register and mask for this
+	 * context's interrupt.
+	 */
+	rcd->ireg = (IS_RCVAVAIL_START + rcd->ctxt) / 64;
+	rcd->imask = ((u64)1) << ((IS_RCVAVAIL_START + rcd->ctxt) % 64);
+	rcd->msix_intr = nr;
+	remap_intr(rcd->dd, IS_RCVAVAIL_START + rcd->ctxt, nr);
+
+	return 0;
+}
+
+/**
+ * msix_request_sdma_irq() - Helper for getting SDMA IRQ resources
+ * @sde: valid sdma engine
+ *
+ */
+int msix_request_sdma_irq(struct sdma_engine *sde)
+{
+	int nr;
+
+	nr = msix_request_irq(sde->dd, sde, sdma_interrupt, NULL,
+			      sde->this_idx, IRQ_SDMA);
+	if (nr < 0)
+		return nr;
+	sde->msix_intr = nr;
+	remap_sdma_interrupts(sde->dd, sde->this_idx, nr);
+
+	return 0;
+}
+
+/**
+ * enable_sdma_srcs() - Helper to enable SDMA IRQ sources
+ * @dd: valid devdata structure
+ * @i: index of SDMA engine
+ */
+static void enable_sdma_srcs(struct hfi1_devdata *dd, int i)
+{
+	set_intr_bits(dd, IS_SDMA_START + i, IS_SDMA_START + i, true);
+	set_intr_bits(dd, IS_SDMA_PROGRESS_START + i,
+		      IS_SDMA_PROGRESS_START + i, true);
+	set_intr_bits(dd, IS_SDMA_IDLE_START + i, IS_SDMA_IDLE_START + i, true);
+	set_intr_bits(dd, IS_SDMAENG_ERR_START + i, IS_SDMAENG_ERR_START + i,
+		      true);
+}
+
+/**
+ * msix_request_irqs() - Allocate all MSIx IRQs
+ * @dd: valid devdata structure
+ *
+ * Helper function to request all of the MSIx IRQs the driver uses.
+ *
+ */
+int msix_request_irqs(struct hfi1_devdata *dd)
+{
+	int i;
+	int ret;
+
+	ret = msix_request_irq(dd, dd, general_interrupt, NULL, 0, IRQ_GENERAL);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < dd->num_sdma; i++) {
+		struct sdma_engine *sde = &dd->per_sdma[i];
+
+		ret = msix_request_sdma_irq(sde);
+		if (ret)
+			return ret;
+		enable_sdma_srcs(sde->dd, i);
+	}
+
+	for (i = 0; i < dd->n_krcv_queues; i++) {
+		struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index_safe(dd, i);
+
+		if (rcd)
+			ret = msix_request_rcd_irq(rcd);
+		hfi1_rcd_put(rcd);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * msix_free_irq() - Free the specified MSIx resources and IRQ
+ * @dd: valid devdata
+ * @msix_intr: MSIx vector to free.
+ *
+ */
+void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr)
+{
+	struct hfi1_msix_entry *me;
+
+	if (msix_intr >= dd->msix_info.max_requested)
+		return;
+
+	me = &dd->msix_info.msix_entries[msix_intr];
+
+	if (!me->arg) /* => no irq, no affinity */
+		return;
+
+	hfi1_put_irq_affinity(dd, me);
+	pci_free_irq(dd->pcidev, msix_intr, me->arg);
+
+	me->arg = NULL;
+
+	spin_lock(&dd->msix_info.msix_lock);
+	__clear_bit(msix_intr, dd->msix_info.in_use_msix);
+	spin_unlock(&dd->msix_info.msix_lock);
+}
+
+/**
+ * msix_clean_up_interrupts() - Free all MSIx IRQ resources
+ * @dd: valid device data structure
+ *
+ * Free the MSIx and associated PCI resources, if they have been allocated.
+ */
+void msix_clean_up_interrupts(struct hfi1_devdata *dd)
+{
+	int i;
+	struct hfi1_msix_entry *me = dd->msix_info.msix_entries;
+
+	/* remove irqs - must happen before disabling/turning off */
+	for (i = 0; i < dd->msix_info.max_requested; i++, me++)
+		msix_free_irq(dd, i);
+
+	/* clean structures */
+	kfree(dd->msix_info.msix_entries);
+	dd->msix_info.msix_entries = NULL;
+	dd->msix_info.max_requested = 0;
+
+	pci_free_irq_vectors(dd->pcidev);
+}
+
+/**
+ * msix_vnic_synchronize_irq() - Synchronize the VNIC IRQs
+ * @dd: valid devdata
+ */
+void msix_vnic_synchronize_irq(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < dd->vnic.num_ctxt; i++) {
+		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
+		struct hfi1_msix_entry *me;
+
+		me = &dd->msix_info.msix_entries[rcd->msix_intr];
+
+		synchronize_irq(me->irq);
+	}
+}

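The reservation logic above is the heart of the new msix.c: msix_initialize() sizes the table with pci_alloc_irq_vectors(), and msix_request_irq() claims a slot in a spinlock-protected bitmap before handing the vector to pci_request_irq(), releasing the slot again on failure. A minimal sketch of that pattern follows; struct msix_pool is a made-up stand-in for dd->msix_info, the pool is assumed to be already sized and initialized, and this is not the driver's exact code.

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Hypothetical container mirroring dd->msix_info in the patch above. */
struct msix_pool {
	spinlock_t lock;		/* protects @in_use */
	unsigned long *in_use;		/* one bit per MSI-X vector */
	u32 max;			/* vectors obtained from pci_alloc_irq_vectors() */
};

/* Reserve a free vector index, request the IRQ, and drop the reservation
 * again if pci_request_irq() fails.  Returns the vector index or -errno. */
static int msix_pool_request(struct pci_dev *pdev, struct msix_pool *pool,
			     irq_handler_t handler, irq_handler_t thread,
			     void *arg, const char *name)
{
	unsigned long nr;
	int ret;

	spin_lock(&pool->lock);
	nr = find_first_zero_bit(pool->in_use, pool->max);
	if (nr < pool->max)
		__set_bit(nr, pool->in_use);
	spin_unlock(&pool->lock);

	if (nr == pool->max)
		return -ENOSPC;

	ret = pci_request_irq(pdev, nr, handler, thread, arg, "%s", name);
	if (ret) {
		spin_lock(&pool->lock);
		__clear_bit(nr, pool->in_use);
		spin_unlock(&pool->lock);
		return ret;
	}

	return nr;	/* caller records pci_irq_vector(pdev, nr) for later use */
}
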
+ 64 - 0
drivers/infiniband/hw/hfi1/msix.h

@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MSIX_H
+#define _HFI1_MSIX_H
+
+#include "hfi.h"
+
+/* MSIx interface */
+int msix_initialize(struct hfi1_devdata *dd);
+int msix_request_irqs(struct hfi1_devdata *dd);
+void msix_clean_up_interrupts(struct hfi1_devdata *dd);
+int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd);
+int msix_request_sdma_irq(struct sdma_engine *sde);
+void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr);
+
+/* VNIC interface */
+void msix_vnic_synchronize_irq(struct hfi1_devdata *dd);
+
+#endif

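Taken together, the header gives the rest of the driver a three-call lifecycle. The sketch below shows only the expected ordering of those calls; hfi1's real bring-up lives in init.c/chip.c and does considerably more work between these steps, and the helper name here is made up.

/* Illustrative ordering of the MSIx interface declared above. */
static int example_msix_bringup(struct hfi1_devdata *dd)
{
	int ret;

	ret = msix_initialize(dd);	/* size vector table and tracking state */
	if (ret)
		return ret;

	ret = msix_request_irqs(dd);	/* general, SDMA and receive-context IRQs */
	if (ret)
		msix_clean_up_interrupts(dd);	/* frees IRQs and PCI vectors */

	return ret;
}

Dynamically created contexts (e.g. VNIC) would presumably use msix_request_rcd_irq() and msix_free_irq() individually rather than the bulk msix_request_irqs() path.
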
+ 26 - 48
drivers/infiniband/hw/hfi1/pcie.c

@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -60,20 +60,13 @@
  * This file contains PCIe utility routines.
  */
 
-/*
- * Code to adjust PCIe capabilities.
- */
-static void tune_pcie_caps(struct hfi1_devdata *);
-
 /*
  * Do all the common PCIe setup and initialization.
- * devdata is not yet allocated, and is not allocated until after this
- * routine returns success.  Therefore dd_dev_err() can't be used for error
- * printing.
  */
-int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+int hfi1_pcie_init(struct hfi1_devdata *dd)
 {
 	int ret;
+	struct pci_dev *pdev = dd->pcidev;
 
 	ret = pci_enable_device(pdev);
 	if (ret) {
@@ -89,15 +82,13 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
 		 * about that, it appears.  If the original BAR was retained
 		 * in the kernel data structures, this may be OK.
 		 */
-		hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
-			       -ret);
-		goto done;
+		dd_dev_err(dd, "pci enable failed: error %d\n", -ret);
+		return ret;
 	}
 
 	ret = pci_request_regions(pdev, DRIVER_NAME);
 	if (ret) {
-		hfi1_early_err(&pdev->dev,
-			       "pci_request_regions fails: err %d\n", -ret);
+		dd_dev_err(dd, "pci_request_regions fails: err %d\n", -ret);
 		goto bail;
 	}
 
@@ -110,8 +101,7 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
 		 */
 		ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
 		if (ret) {
-			hfi1_early_err(&pdev->dev,
-				       "Unable to set DMA mask: %d\n", ret);
+			dd_dev_err(dd, "Unable to set DMA mask: %d\n", ret);
 			goto bail;
 		}
 		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
@@ -119,18 +109,16 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
 		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
 	}
 	if (ret) {
-		hfi1_early_err(&pdev->dev,
-			       "Unable to set DMA consistent mask: %d\n", ret);
+		dd_dev_err(dd, "Unable to set DMA consistent mask: %d\n", ret);
 		goto bail;
 	}
 
 	pci_set_master(pdev);
 	(void)pci_enable_pcie_error_reporting(pdev);
-	goto done;
+	return 0;
 
 bail:
 	hfi1_pcie_cleanup(pdev);
-done:
 	return ret;
 }
 
@@ -206,7 +194,7 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev)
 		dd_dev_err(dd, "WC mapping of send buffers failed\n");
 		goto nomem;
 	}
-	dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE);
+	dd_dev_info(dd, "WC piobase: %p for %x\n", dd->piobase, TXE_PIO_SIZE);
 
 	dd->physaddr = addr;        /* used for io_remap, etc. */
 
@@ -344,26 +332,6 @@ int pcie_speeds(struct hfi1_devdata *dd)
 	return 0;
 }
 
-/*
- * Returns:
- *	- actual number of interrupts allocated or
- *      - error
- */
-int request_msix(struct hfi1_devdata *dd, u32 msireq)
-{
-	int nvec;
-
-	nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX);
-	if (nvec < 0) {
-		dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec);
-		return nvec;
-	}
-
-	tune_pcie_caps(dd);
-
-	return nvec;
-}
-
 /* restore command and BARs after a reset has wiped them out */
 int restore_pci_variables(struct hfi1_devdata *dd)
 {
@@ -479,14 +447,19 @@ error:
  * Check and optionally adjust them to maximize our throughput.
  */
 static int hfi1_pcie_caps;
-module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444);
 MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
 
 uint aspm_mode = ASPM_MODE_DISABLED;
-module_param_named(aspm, aspm_mode, uint, S_IRUGO);
+module_param_named(aspm, aspm_mode, uint, 0444);
 MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
 
-static void tune_pcie_caps(struct hfi1_devdata *dd)
+/**
+ * tune_pcie_caps() - Code to adjust PCIe capabilities.
+ * @dd: Valid device data structure
+ *
+ */
+void tune_pcie_caps(struct hfi1_devdata *dd)
 {
 	struct pci_dev *parent;
 	u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
@@ -1028,6 +1001,7 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd)
 	const u8 (*ctle_tunings)[4];
 	uint static_ctle_mode;
 	int return_error = 0;
+	u32 target_width;
 
 	/* PCIe Gen3 is for the ASIC only */
 	if (dd->icode != ICODE_RTL_SILICON)
@@ -1067,6 +1041,9 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd)
 		return 0;
 	}
 
+	/* Previous Gen1/Gen2 bus width */
+	target_width = dd->lbus_width;
+
 	/*
 	 * Do the Gen3 transition.  Steps are those of the PCIe Gen3
 	 * recipe.
@@ -1435,11 +1412,12 @@ retry:
 	dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
 		    dd->lbus_info);
 
-	if (dd->lbus_speed != target_speed) { /* not target */
+	if (dd->lbus_speed != target_speed ||
+	    dd->lbus_width < target_width) { /* not target */
 		/* maybe retry */
 		do_retry = retry_count < pcie_retry;
-		dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
-			   pcie_target, do_retry ? ", retrying" : "");
+		dd_dev_err(dd, "PCIe link speed or width did not match target%s\n",
+			   do_retry ? ", retrying" : "");
 		retry_count++;
 		if (do_retry) {
 			msleep(100); /* allow time to settle */


+ 0 - 8
drivers/infiniband/hw/hfi1/pio.c

@@ -71,14 +71,6 @@ void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
 	}
 }
 
-/* defined in header release 48 and higher */
-#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
-#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
-#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
-#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
-		<< SEND_CTRL_UNSUPPORTED_VL_SHIFT)
-#endif
-
 /* global control of PIO send */
 void pio_send_control(struct hfi1_devdata *dd, int op)
 {

+ 73 - 27
drivers/infiniband/hw/hfi1/qp.c

@@ -66,7 +66,7 @@ MODULE_PARM_DESC(qp_table_size, "QP table size");
 static void flush_tx_list(struct rvt_qp *qp);
 static int iowait_sleep(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *stx,
 	unsigned int seq,
 	bool pkts_sent);
@@ -134,15 +134,13 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
 
 };
 
-static void flush_tx_list(struct rvt_qp *qp)
+static void flush_list_head(struct list_head *l)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
-
-	while (!list_empty(&priv->s_iowait.tx_head)) {
+	while (!list_empty(l)) {
 		struct sdma_txreq *tx;
 
 		tx = list_first_entry(
-			&priv->s_iowait.tx_head,
+			l,
 			struct sdma_txreq,
 			list);
 		list_del_init(&tx->list);
@@ -151,6 +149,14 @@ static void flush_tx_list(struct rvt_qp *qp)
 	}
 }
 
+static void flush_tx_list(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	flush_list_head(&iowait_get_ib_work(&priv->s_iowait)->tx_head);
+	flush_list_head(&iowait_get_tid_work(&priv->s_iowait)->tx_head);
+}
+
 static void flush_iowait(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
@@ -282,33 +288,46 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 }
 
 /**
- * hfi1_check_send_wqe - validate wqe
+ * hfi1_setup_wqe - set up the wqe
  * @qp - The qp
  * @wqe - The built wqe
+ * @call_send - Determine if the send should be posted or scheduled.
  *
- * validate wqe.  This is called
- * prior to inserting the wqe into
- * the ring but after the wqe has been
- * setup.
+ * Perform setup of the wqe.  This is called
+ * prior to inserting the wqe into the ring but after
+ * the wqe has been setup by RDMAVT. This function
+ * allows the driver the opportunity to perform
+ * validation and additional setup of the wqe.
  *
  * Returns 0 on success, -EINVAL on failure
  *
  */
-int hfi1_check_send_wqe(struct rvt_qp *qp,
-			struct rvt_swqe *wqe)
+int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
 {
 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
 	struct rvt_ah *ah;
+	struct hfi1_pportdata *ppd;
+	struct hfi1_devdata *dd;
 
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_RC:
 	case IB_QPT_UC:
 		if (wqe->length > 0x80000000U)
 			return -EINVAL;
+		if (wqe->length > qp->pmtu)
+			*call_send = false;
 		break;
 	case IB_QPT_SMI:
-		ah = ibah_to_rvtah(wqe->ud_wr.ah);
-		if (wqe->length > (1 << ah->log_pmtu))
+		/*
+		 * SM packets should exclusively use VL15 and their SL is
+		 * ignored (IBTA v1.3, Section 3.5.8.2). Therefore, when ah
+		 * is created, SL is 0 in most cases and as a result some
+		 * fields (vl and pmtu) in ah may not be set correctly,
+		 * depending on the SL2SC and SC2VL tables at the time.
+		 */
+		ppd = ppd_from_ibp(ibp);
+		dd = dd_from_ppd(ppd);
+		if (wqe->length > dd->vld[15].mtu)
 			return -EINVAL;
 		break;
 	case IB_QPT_GSI:
@@ -321,7 +340,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp,
 	default:
 		break;
 	}
-	return wqe->length <= piothreshold;
+	return 0;
 }
 
 /**
@@ -333,7 +352,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp,
  * It is only used in the post send, which doesn't hold
  * the s_lock.
  */
-void _hfi1_schedule_send(struct rvt_qp *qp)
+bool _hfi1_schedule_send(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_ibport *ibp =
@@ -341,10 +360,10 @@ void _hfi1_schedule_send(struct rvt_qp *qp)
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
 
-	iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
-			priv->s_sde ?
-			priv->s_sde->cpu :
-			cpumask_first(cpumask_of_node(dd->node)));
+	return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
+			       priv->s_sde ?
+			       priv->s_sde->cpu :
+			       cpumask_first(cpumask_of_node(dd->node)));
 }
 
 static void qp_pio_drain(struct rvt_qp *qp)
@@ -372,12 +391,32 @@ static void qp_pio_drain(struct rvt_qp *qp)
  *
  * This schedules qp progress and caller should hold
  * the s_lock.
+ * @return true if the first leg is scheduled;
+ * false if the first leg is not scheduled.
  */
-void hfi1_schedule_send(struct rvt_qp *qp)
+bool hfi1_schedule_send(struct rvt_qp *qp)
 {
 	lockdep_assert_held(&qp->s_lock);
-	if (hfi1_send_ok(qp))
+	if (hfi1_send_ok(qp)) {
 		_hfi1_schedule_send(qp);
+		return true;
+	}
+	if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+		iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+				IOWAIT_PENDING_IB);
+	return false;
+}
+
+static void hfi1_qp_schedule(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+	bool ret;
+
+	if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB)) {
+		ret = hfi1_schedule_send(qp);
+		if (ret)
+			iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
+	}
 }
 
 void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -388,16 +427,22 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
 	if (qp->s_flags & flag) {
 		qp->s_flags &= ~flag;
 		trace_hfi1_qpwakeup(qp, flag);
-		hfi1_schedule_send(qp);
+		hfi1_qp_schedule(qp);
 	}
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 	/* Notify hfi1_destroy_qp() if it is waiting. */
 	rvt_put_qp(qp);
 }
 
+void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
+{
+	if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+		qp->s_flags &= ~RVT_S_BUSY;
+}
+
 static int iowait_sleep(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *stx,
 	uint seq,
 	bool pkts_sent)
@@ -438,7 +483,7 @@ static int iowait_sleep(
 			rvt_get_qp(qp);
 		}
 		write_sequnlock(&dev->iowait_lock);
-		qp->s_flags &= ~RVT_S_BUSY;
+		hfi1_qp_unbusy(qp, wait);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 		ret = -EBUSY;
 	} else {
@@ -637,6 +682,7 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 		&priv->s_iowait,
 		1,
 		_hfi1_do_send,
+		NULL,
 		iowait_sleep,
 		iowait_wakeup,
 		iowait_sdma_drained);
@@ -686,7 +732,7 @@ void stop_send_queue(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 
-	cancel_work_sync(&priv->s_iowait.iowork);
+	iowait_cancel_work(&priv->s_iowait);
 }
 
 void quiesce_qp(struct rvt_qp *qp)

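hfi1_schedule_send() now reports whether the send engine was actually queued, and callers park an IOWAIT_PENDING_IB bit when it was not, so that hfi1_qp_wakeup() can retry later via hfi1_qp_schedule(). The iowait_set_flag()/iowait_flag_set()/iowait_clear_flag() helpers themselves are not part of this hunk; below is one plausible shape for them, assuming an unsigned long flags word in struct iowait (the real helpers live in iowait.h and also fire the hfi1_iowait tracepoints added later in this series). The names are prefixed example_ to make clear they are illustrative.

#include <linux/bitops.h>

/* Assumed accessors over an unsigned long iowait::flags field. */
static inline void example_iowait_set_flag(struct iowait *wait, u32 flag)
{
	set_bit(flag, &wait->flags);
}

static inline bool example_iowait_flag_set(struct iowait *wait, u32 flag)
{
	return test_bit(flag, &wait->flags);
}

static inline void example_iowait_clear_flag(struct iowait *wait, u32 flag)
{
	clear_bit(flag, &wait->flags);
}
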
+ 17 - 14
drivers/infiniband/hw/hfi1/qp.h

@@ -57,18 +57,6 @@ extern unsigned int hfi1_qp_table_size;
 
 extern const struct rvt_operation_params hfi1_post_parms[];
 
-/*
- * Send if not busy or waiting for I/O and either
- * a RC response is pending or we can process send work requests.
- */
-static inline int hfi1_send_ok(struct rvt_qp *qp)
-{
-	return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) &&
-		(verbs_txreq_queued(qp) ||
-		(qp->s_flags & RVT_S_RESP_PENDING) ||
-		 !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
-}
-
 /*
  * Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK
  *
@@ -89,6 +77,20 @@ static inline int hfi1_send_ok(struct rvt_qp *qp)
 #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
 #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
 
+/*
+ * Send if not busy or waiting for I/O and either
+ * a RC response is pending or we can process send work requests.
+ */
+static inline int hfi1_send_ok(struct rvt_qp *qp)
+{
+	struct hfi1_qp_priv *priv = qp->priv;
+
+	return !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)) &&
+		(verbs_txreq_queued(iowait_get_ib_work(&priv->s_iowait)) ||
+		(qp->s_flags & RVT_S_RESP_PENDING) ||
+		 !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
+}
+
 /*
  * free_ahg - clear ahg from QP
  */
@@ -129,8 +131,8 @@ struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5);
 
 void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter);
 
-void _hfi1_schedule_send(struct rvt_qp *qp);
-void hfi1_schedule_send(struct rvt_qp *qp);
+bool _hfi1_schedule_send(struct rvt_qp *qp);
+bool hfi1_schedule_send(struct rvt_qp *qp);
 
 void hfi1_migrate_qp(struct rvt_qp *qp);
 
@@ -150,4 +152,5 @@ void quiesce_qp(struct rvt_qp *qp);
 u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu);
 int mtu_to_path_mtu(u32 mtu);
 void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl);
+void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait);
 #endif /* _QP_H */

+ 13 - 11
drivers/infiniband/hw/hfi1/rc.c

@@ -309,7 +309,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		}
 		clear_ahg(qp);
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+		rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
 			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
 		/* will get called again */
 		goto done_free_tx;
@@ -378,9 +378,9 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 						wqe->wr.ex.invalidate_rkey);
 					local_ops = 1;
 				}
-				hfi1_send_complete(qp, wqe,
-						   err ? IB_WC_LOC_PROT_ERR
-						       : IB_WC_SUCCESS);
+				rvt_send_complete(qp, wqe,
+						  err ? IB_WC_LOC_PROT_ERR
+						      : IB_WC_SUCCESS);
 				if (local_ops)
 					atomic_dec(&qp->local_ops_pending);
 				goto done_free_tx;
@@ -1043,7 +1043,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 			hfi1_migrate_qp(qp);
 			qp->s_retry = qp->s_retry_cnt;
 		} else if (qp->s_last == qp->s_acked) {
-			hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+			rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
 			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 			return;
 		} else { /* need to handle delayed completion */
@@ -1468,7 +1468,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
 			ibp->rvp.n_other_naks++;
 class_b:
 			if (qp->s_last == qp->s_acked) {
-				hfi1_send_complete(qp, wqe, status);
+				rvt_send_complete(qp, wqe, status);
 				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 			}
 			break;
@@ -1644,7 +1644,8 @@ read_middle:
 		qp->s_rdma_read_len -= pmtu;
 		update_last_psn(qp, psn);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
-		hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false);
+		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
+			     data, pmtu, false, false);
 		goto bail;
 
 	case OP(RDMA_READ_RESPONSE_ONLY):
@@ -1684,7 +1685,8 @@ read_last:
 		if (unlikely(tlen != qp->s_rdma_read_len))
 			goto ack_len_err;
 		aeth = be32_to_cpu(ohdr->u.aeth);
-		hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false);
+		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
+			     data, tlen, false, false);
 		WARN_ON(qp->s_rdma_read_sge.num_sge);
 		(void)do_rc_ack(qp, aeth, psn,
 				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
@@ -1704,7 +1706,7 @@ ack_len_err:
 	status = IB_WC_LOC_LEN_ERR;
 ack_err:
 	if (qp->s_last == qp->s_acked) {
-		hfi1_send_complete(qp, wqe, status);
+		rvt_send_complete(qp, wqe, status);
 		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 	}
 ack_done:
@@ -2144,7 +2146,7 @@ send_middle:
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto nack_inv;
-		hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
 		break;
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -2200,7 +2202,7 @@ send_last:
 		wc.byte_len = tlen + qp->r_rcv_len;
 		if (unlikely(wc.byte_len > qp->r_len))
 			goto nack_inv;
-		hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last);
 		rvt_put_ss(&qp->r_sge);
 		qp->r_msn++;
 		if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))

+ 9 - 373
drivers/infiniband/hw/hfi1/ruc.c

@@ -155,333 +155,6 @@ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet)
 	return 0;
 }
 
-/**
- * ruc_loopback - handle UC and RC loopback requests
- * @sqp: the sending QP
- *
- * This is called from hfi1_do_send() to
- * forward a WQE addressed to the same HFI.
- * Note that although we are single threaded due to the send engine, we still
- * have to protect against post_send().  We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void ruc_loopback(struct rvt_qp *sqp)
-{
-	struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
-	struct rvt_qp *qp;
-	struct rvt_swqe *wqe;
-	struct rvt_sge *sge;
-	unsigned long flags;
-	struct ib_wc wc;
-	u64 sdata;
-	atomic64_t *maddr;
-	enum ib_wc_status send_status;
-	bool release;
-	int ret;
-	bool copy_last = false;
-	int local_ops = 0;
-
-	rcu_read_lock();
-
-	/*
-	 * Note that we check the responder QP state after
-	 * checking the requester's state.
-	 */
-	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
-			    sqp->remote_qpn);
-
-	spin_lock_irqsave(&sqp->s_lock, flags);
-
-	/* Return if we are already busy processing a work request. */
-	if ((sqp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT)) ||
-	    !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-		goto unlock;
-
-	sqp->s_flags |= RVT_S_BUSY;
-
-again:
-	if (sqp->s_last == READ_ONCE(sqp->s_head))
-		goto clr_busy;
-	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
-
-	/* Return if it is not OK to start a new work request. */
-	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
-		if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
-			goto clr_busy;
-		/* We are in the error state, flush the work request. */
-		send_status = IB_WC_WR_FLUSH_ERR;
-		goto flush_send;
-	}
-
-	/*
-	 * We can rely on the entry not changing without the s_lock
-	 * being held until we update s_last.
-	 * We increment s_cur to indicate s_last is in progress.
-	 */
-	if (sqp->s_last == sqp->s_cur) {
-		if (++sqp->s_cur >= sqp->s_size)
-			sqp->s_cur = 0;
-	}
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
-
-	if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
-	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
-		ibp->rvp.n_pkt_drops++;
-		/*
-		 * For RC, the requester would timeout and retry so
-		 * shortcut the timeouts and just signal too many retries.
-		 */
-		if (sqp->ibqp.qp_type == IB_QPT_RC)
-			send_status = IB_WC_RETRY_EXC_ERR;
-		else
-			send_status = IB_WC_SUCCESS;
-		goto serr;
-	}
-
-	memset(&wc, 0, sizeof(wc));
-	send_status = IB_WC_SUCCESS;
-
-	release = true;
-	sqp->s_sge.sge = wqe->sg_list[0];
-	sqp->s_sge.sg_list = wqe->sg_list + 1;
-	sqp->s_sge.num_sge = wqe->wr.num_sge;
-	sqp->s_len = wqe->length;
-	switch (wqe->wr.opcode) {
-	case IB_WR_REG_MR:
-		goto send_comp;
-
-	case IB_WR_LOCAL_INV:
-		if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
-			if (rvt_invalidate_rkey(sqp,
-						wqe->wr.ex.invalidate_rkey))
-				send_status = IB_WC_LOC_PROT_ERR;
-			local_ops = 1;
-		}
-		goto send_comp;
-
-	case IB_WR_SEND_WITH_INV:
-		if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) {
-			wc.wc_flags = IB_WC_WITH_INVALIDATE;
-			wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey;
-		}
-		goto send;
-
-	case IB_WR_SEND_WITH_IMM:
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = wqe->wr.ex.imm_data;
-		/* FALLTHROUGH */
-	case IB_WR_SEND:
-send:
-		ret = rvt_get_rwqe(qp, false);
-		if (ret < 0)
-			goto op_err;
-		if (!ret)
-			goto rnr_nak;
-		break;
-
-	case IB_WR_RDMA_WRITE_WITH_IMM:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-			goto inv_err;
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.ex.imm_data = wqe->wr.ex.imm_data;
-		ret = rvt_get_rwqe(qp, true);
-		if (ret < 0)
-			goto op_err;
-		if (!ret)
-			goto rnr_nak;
-		/* skip copy_last set and qp_access_flags recheck */
-		goto do_write;
-	case IB_WR_RDMA_WRITE:
-		copy_last = rvt_is_user_qp(qp);
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-			goto inv_err;
-do_write:
-		if (wqe->length == 0)
-			break;
-		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
-					  wqe->rdma_wr.remote_addr,
-					  wqe->rdma_wr.rkey,
-					  IB_ACCESS_REMOTE_WRITE)))
-			goto acc_err;
-		qp->r_sge.sg_list = NULL;
-		qp->r_sge.num_sge = 1;
-		qp->r_sge.total_len = wqe->length;
-		break;
-
-	case IB_WR_RDMA_READ:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-			goto inv_err;
-		if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
-					  wqe->rdma_wr.remote_addr,
-					  wqe->rdma_wr.rkey,
-					  IB_ACCESS_REMOTE_READ)))
-			goto acc_err;
-		release = false;
-		sqp->s_sge.sg_list = NULL;
-		sqp->s_sge.num_sge = 1;
-		qp->r_sge.sge = wqe->sg_list[0];
-		qp->r_sge.sg_list = wqe->sg_list + 1;
-		qp->r_sge.num_sge = wqe->wr.num_sge;
-		qp->r_sge.total_len = wqe->length;
-		break;
-
-	case IB_WR_ATOMIC_CMP_AND_SWP:
-	case IB_WR_ATOMIC_FETCH_AND_ADD:
-		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-			goto inv_err;
-		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
-					  wqe->atomic_wr.remote_addr,
-					  wqe->atomic_wr.rkey,
-					  IB_ACCESS_REMOTE_ATOMIC)))
-			goto acc_err;
-		/* Perform atomic OP and save result. */
-		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
-		sdata = wqe->atomic_wr.compare_add;
-		*(u64 *)sqp->s_sge.sge.vaddr =
-			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
-			(u64)atomic64_add_return(sdata, maddr) - sdata :
-			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
-				      sdata, wqe->atomic_wr.swap);
-		rvt_put_mr(qp->r_sge.sge.mr);
-		qp->r_sge.num_sge = 0;
-		goto send_comp;
-
-	default:
-		send_status = IB_WC_LOC_QP_OP_ERR;
-		goto serr;
-	}
-
-	sge = &sqp->s_sge.sge;
-	while (sqp->s_len) {
-		u32 len = sqp->s_len;
-
-		if (len > sge->length)
-			len = sge->length;
-		if (len > sge->sge_length)
-			len = sge->sge_length;
-		WARN_ON_ONCE(len == 0);
-		hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
-		sge->vaddr += len;
-		sge->length -= len;
-		sge->sge_length -= len;
-		if (sge->sge_length == 0) {
-			if (!release)
-				rvt_put_mr(sge->mr);
-			if (--sqp->s_sge.num_sge)
-				*sge = *sqp->s_sge.sg_list++;
-		} else if (sge->length == 0 && sge->mr->lkey) {
-			if (++sge->n >= RVT_SEGSZ) {
-				if (++sge->m >= sge->mr->mapsz)
-					break;
-				sge->n = 0;
-			}
-			sge->vaddr =
-				sge->mr->map[sge->m]->segs[sge->n].vaddr;
-			sge->length =
-				sge->mr->map[sge->m]->segs[sge->n].length;
-		}
-		sqp->s_len -= len;
-	}
-	if (release)
-		rvt_put_ss(&qp->r_sge);
-
-	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
-		goto send_comp;
-
-	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-	else
-		wc.opcode = IB_WC_RECV;
-	wc.wr_id = qp->r_wr_id;
-	wc.status = IB_WC_SUCCESS;
-	wc.byte_len = wqe->length;
-	wc.qp = &qp->ibqp;
-	wc.src_qp = qp->remote_qpn;
-	wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
-	wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
-	wc.port_num = 1;
-	/* Signal completion event if the solicited bit is set. */
-	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
-		     wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	ibp->rvp.n_loop_pkts++;
-flush_send:
-	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-	hfi1_send_complete(sqp, wqe, send_status);
-	if (local_ops) {
-		atomic_dec(&sqp->local_ops_pending);
-		local_ops = 0;
-	}
-	goto again;
-
-rnr_nak:
-	/* Handle RNR NAK */
-	if (qp->ibqp.qp_type == IB_QPT_UC)
-		goto send_comp;
-	ibp->rvp.n_rnr_naks++;
-	/*
-	 * Note: we don't need the s_lock held since the BUSY flag
-	 * makes this single threaded.
-	 */
-	if (sqp->s_rnr_retry == 0) {
-		send_status = IB_WC_RNR_RETRY_EXC_ERR;
-		goto serr;
-	}
-	if (sqp->s_rnr_retry_cnt < 7)
-		sqp->s_rnr_retry--;
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
-		goto clr_busy;
-	rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
-				IB_AETH_CREDIT_SHIFT);
-	goto clr_busy;
-
-op_err:
-	send_status = IB_WC_REM_OP_ERR;
-	wc.status = IB_WC_LOC_QP_OP_ERR;
-	goto err;
-
-inv_err:
-	send_status = IB_WC_REM_INV_REQ_ERR;
-	wc.status = IB_WC_LOC_QP_OP_ERR;
-	goto err;
-
-acc_err:
-	send_status = IB_WC_REM_ACCESS_ERR;
-	wc.status = IB_WC_LOC_PROT_ERR;
-err:
-	/* responder goes to error state */
-	rvt_rc_error(qp, wc.status);
-
-serr:
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	hfi1_send_complete(sqp, wqe, send_status);
-	if (sqp->ibqp.qp_type == IB_QPT_RC) {
-		int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
-		sqp->s_flags &= ~RVT_S_BUSY;
-		spin_unlock_irqrestore(&sqp->s_lock, flags);
-		if (lastwqe) {
-			struct ib_event ev;
-
-			ev.device = sqp->ibqp.device;
-			ev.element.qp = &sqp->ibqp;
-			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
-		}
-		goto done;
-	}
-clr_busy:
-	sqp->s_flags &= ~RVT_S_BUSY;
-unlock:
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
-	rcu_read_unlock();
-}
-
 /**
  * hfi1_make_grh - construct a GRH header
  * @ibp: a pointer to the IB port
@@ -825,8 +498,8 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp)
 
 void _hfi1_do_send(struct work_struct *work)
 {
-	struct iowait *wait = container_of(work, struct iowait, iowork);
-	struct rvt_qp *qp = iowait_to_qp(wait);
+	struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+	struct rvt_qp *qp = iowait_to_qp(w->iow);
 
 	hfi1_do_send(qp, true);
 }
@@ -850,6 +523,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 	ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ps.ppd = ppd_from_ibp(ps.ibp);
 	ps.in_thread = in_thread;
+	ps.wait = iowait_get_ib_work(&priv->s_iowait);
 
 	trace_hfi1_rc_do_send(qp, in_thread);
 
@@ -858,7 +532,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 		if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) &
 				   ~((1 << ps.ppd->lmc) - 1)) ==
 				  ps.ppd->lid)) {
-			ruc_loopback(qp);
+			rvt_ruc_loopback(qp);
 			return;
 		}
 		make_req = hfi1_make_rc_req;
@@ -868,7 +542,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 		if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) &
 				   ~((1 << ps.ppd->lmc) - 1)) ==
 				  ps.ppd->lid)) {
-			ruc_loopback(qp);
+			rvt_ruc_loopback(qp);
 			return;
 		}
 		make_req = hfi1_make_uc_req;
@@ -883,6 +557,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 
 	/* Return if we are already busy processing a work request. */
 	if (!hfi1_send_ok(qp)) {
+		if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+			iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
 		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 		return;
 	}
@@ -896,7 +572,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 	ps.pkts_sent = false;
 
 	/* insure a pre-built packet is handled  */
-	ps.s_txreq = get_waiting_verbs_txreq(qp);
+	ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
 	do {
 		/* Check for a constructed packet to be sent. */
 		if (ps.s_txreq) {
@@ -907,6 +583,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 			 */
 			if (hfi1_verbs_send(qp, &ps))
 				return;
+
 			/* allow other tasks to run */
 			if (schedule_send_yield(qp, &ps))
 				return;
@@ -917,44 +594,3 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 	iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
 	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 }
-
-/*
- * This should be called with s_lock held.
- */
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-			enum ib_wc_status status)
-{
-	u32 old_last, last;
-
-	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
-		return;
-
-	last = qp->s_last;
-	old_last = last;
-	trace_hfi1_qp_send_completion(qp, wqe, last);
-	if (++last >= qp->s_size)
-		last = 0;
-	trace_hfi1_qp_send_completion(qp, wqe, last);
-	qp->s_last = last;
-	/* See post_send() */
-	barrier();
-	rvt_put_swqe(wqe);
-	if (qp->ibqp.qp_type == IB_QPT_UD ||
-	    qp->ibqp.qp_type == IB_QPT_SMI ||
-	    qp->ibqp.qp_type == IB_QPT_GSI)
-		atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
-
-	rvt_qp_swqe_complete(qp,
-			     wqe,
-			     ib_hfi1_wc_opcode[wqe->wr.opcode],
-			     status);
-
-	if (qp->s_acked == old_last)
-		qp->s_acked = last;
-	if (qp->s_cur == old_last)
-		qp->s_cur = last;
-	if (qp->s_tail == old_last)
-		qp->s_tail = last;
-	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
-		qp->s_draining = 0;
-}

+ 22 - 34
drivers/infiniband/hw/hfi1/sdma.c

@@ -378,7 +378,7 @@ static inline void complete_tx(struct sdma_engine *sde,
 	__sdma_txclean(sde->dd, tx);
 	if (complete)
 		(*complete)(tx, res);
-	if (wait && iowait_sdma_dec(wait))
+	if (iowait_sdma_dec(wait))
 		iowait_drain_wakeup(wait);
 }
 
@@ -1758,7 +1758,6 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
 	struct iowait *wait, *nw;
 	struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
 	uint i, n = 0, seq, max_idx = 0;
-	struct sdma_txreq *stx;
 	struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
 	u8 max_starved_cnt = 0;
 
@@ -1779,19 +1778,13 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
 					nw,
 					&sde->dmawait,
 					list) {
-				u16 num_desc = 0;
+				u32 num_desc;
 
 				if (!wait->wakeup)
 					continue;
 				if (n == ARRAY_SIZE(waits))
 					break;
-				if (!list_empty(&wait->tx_head)) {
-					stx = list_first_entry(
-						&wait->tx_head,
-						struct sdma_txreq,
-						list);
-					num_desc = stx->num_desc;
-				}
+				num_desc = iowait_get_all_desc(wait);
 				if (num_desc > avail)
 					break;
 				avail -= num_desc;
@@ -2346,7 +2339,7 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
  */
 static int sdma_check_progress(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *tx,
 	bool pkts_sent)
 {
@@ -2356,12 +2349,12 @@ static int sdma_check_progress(
 	if (tx->num_desc <= sde->desc_avail)
 		return -EAGAIN;
 	/* pulse the head_lock */
-	if (wait && wait->sleep) {
+	if (wait && iowait_ioww_to_iow(wait)->sleep) {
 		unsigned seq;
 
 		seq = raw_seqcount_begin(
 			(const seqcount_t *)&sde->head_lock.seqcount);
-		ret = wait->sleep(sde, wait, tx, seq, pkts_sent);
+		ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent);
 		if (ret == -EAGAIN)
 			sde->desc_avail = sdma_descq_freecnt(sde);
 	} else {
@@ -2373,7 +2366,7 @@ static int sdma_check_progress(
 /**
  * sdma_send_txreq() - submit a tx req to ring
  * @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
+ * @wait: SE wait structure to use when full (may be NULL)
  * @tx: sdma_txreq to submit
  * @pkts_sent: has any packet been sent yet?
  *
@@ -2386,7 +2379,7 @@ static int sdma_check_progress(
  * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
  */
 int sdma_send_txreq(struct sdma_engine *sde,
-		    struct iowait *wait,
+		    struct iowait_work *wait,
 		    struct sdma_txreq *tx,
 		    bool pkts_sent)
 {
@@ -2397,7 +2390,7 @@ int sdma_send_txreq(struct sdma_engine *sde,
 	/* user should have supplied entire packet */
 	if (unlikely(tx->tlen))
 		return -EINVAL;
-	tx->wait = wait;
+	tx->wait = iowait_ioww_to_iow(wait);
 	spin_lock_irqsave(&sde->tail_lock, flags);
 retry:
 	if (unlikely(!__sdma_running(sde)))
@@ -2406,14 +2399,14 @@ retry:
 		goto nodesc;
 	tail = submit_tx(sde, tx);
 	if (wait)
-		iowait_sdma_inc(wait);
+		iowait_sdma_inc(iowait_ioww_to_iow(wait));
 	sdma_update_tail(sde, tail);
 unlock:
 	spin_unlock_irqrestore(&sde->tail_lock, flags);
 	return ret;
 unlock_noconn:
 	if (wait)
-		iowait_sdma_inc(wait);
+		iowait_sdma_inc(iowait_ioww_to_iow(wait));
 	tx->next_descq_idx = 0;
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
 	tx->sn = sde->tail_sn++;
@@ -2422,10 +2415,7 @@ unlock_noconn:
 	spin_lock(&sde->flushlist_lock);
 	list_add_tail(&tx->list, &sde->flushlist);
 	spin_unlock(&sde->flushlist_lock);
-	if (wait) {
-		wait->tx_count++;
-		wait->count += tx->num_desc;
-	}
+	iowait_inc_wait_count(wait, tx->num_desc);
 	schedule_work(&sde->flush_worker);
 	ret = -ECOMM;
 	goto unlock;
@@ -2442,9 +2432,9 @@ nodesc:
 /**
  * sdma_send_txlist() - submit a list of tx req to ring
  * @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
+ * @wait: SE wait structure to use when full (may be NULL)
  * @tx_list: list of sdma_txreqs to submit
- * @count: pointer to a u32 which, after return will contain the total number of
+ * @count_out: pointer to a u16 which, after return, will contain the total number of
  *         sdma_txreqs removed from the tx_list. This will include sdma_txreqs
  *         whose SDMA descriptors are submitted to the ring and the sdma_txreqs
  *         which are added to SDMA engine flush list if the SDMA engine state is
@@ -2467,8 +2457,8 @@ nodesc:
  * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
  * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
  */
-int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
-		     struct list_head *tx_list, u32 *count_out)
+int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait,
+		     struct list_head *tx_list, u16 *count_out)
 {
 	struct sdma_txreq *tx, *tx_next;
 	int ret = 0;
@@ -2479,7 +2469,7 @@ int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
 	spin_lock_irqsave(&sde->tail_lock, flags);
 retry:
 	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
-		tx->wait = wait;
+		tx->wait = iowait_ioww_to_iow(wait);
 		if (unlikely(!__sdma_running(sde)))
 			goto unlock_noconn;
 		if (unlikely(tx->num_desc > sde->desc_avail))
@@ -2500,8 +2490,9 @@ retry:
 update_tail:
 	total_count = submit_count + flush_count;
 	if (wait) {
-		iowait_sdma_add(wait, total_count);
-		iowait_starve_clear(submit_count > 0, wait);
+		iowait_sdma_add(iowait_ioww_to_iow(wait), total_count);
+		iowait_starve_clear(submit_count > 0,
+				    iowait_ioww_to_iow(wait));
 	}
 	if (tail != INVALID_TAIL)
 		sdma_update_tail(sde, tail);
@@ -2511,7 +2502,7 @@ update_tail:
 unlock_noconn:
 	spin_lock(&sde->flushlist_lock);
 	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
-		tx->wait = wait;
+		tx->wait = iowait_ioww_to_iow(wait);
 		list_del_init(&tx->list);
 		tx->next_descq_idx = 0;
 #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
@@ -2520,10 +2511,7 @@ unlock_noconn:
 #endif
 		list_add_tail(&tx->list, &sde->flushlist);
 		flush_count++;
-		if (wait) {
-			wait->tx_count++;
-			wait->count += tx->num_desc;
-		}
+		iowait_inc_wait_count(wait, tx->num_desc);
 	}
 	spin_unlock(&sde->flushlist_lock);
 	schedule_work(&sde->flush_worker);

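With the iowait split, submission paths hand sdma_send_txlist() an iowait_work and a u16 counter instead of the bare iowait and u32. A hedged caller sketch follows; the helper name is made up, and each sdma_txreq on the list is assumed to have been built elsewhere with sdma_txinit()/sdma_txadd_*().

#include <linux/list.h>

static int example_submit_list(struct sdma_engine *sde,
			       struct iowait_work *wait,
			       struct list_head *txs)
{
	u16 consumed = 0;
	int ret;

	ret = sdma_send_txlist(sde, wait, txs, &consumed);
	/*
	 * Per the kernel-doc above, @consumed counts txreqs removed from
	 * @txs: those submitted to the descriptor ring plus any parked on
	 * the engine's flush list when the engine is not running.
	 */
	return ret < 0 ? ret : consumed;
}
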
+ 6 - 15
drivers/infiniband/hw/hfi1/sdma.h

@@ -1,7 +1,7 @@
 #ifndef _HFI1_SDMA_H
 #define _HFI1_SDMA_H
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -62,16 +62,6 @@
 /* Hardware limit for SDMA packet size */
 #define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
 
-#define SDMA_TXREQ_S_OK        0
-#define SDMA_TXREQ_S_SENDERROR 1
-#define SDMA_TXREQ_S_ABORTED   2
-#define SDMA_TXREQ_S_SHUTDOWN  3
-
-/* flags bits */
-#define SDMA_TXREQ_F_URGENT       0x0001
-#define SDMA_TXREQ_F_AHG_COPY     0x0002
-#define SDMA_TXREQ_F_USE_AHG      0x0004
-
 #define SDMA_MAP_NONE          0
 #define SDMA_MAP_SINGLE        1
 #define SDMA_MAP_PAGE          2
@@ -415,6 +405,7 @@ struct sdma_engine {
 	struct list_head flushlist;
 	struct cpumask cpu_mask;
 	struct kobject kobj;
+	u32 msix_intr;
 };
 
 int sdma_init(struct hfi1_devdata *dd, u8 port);
@@ -849,16 +840,16 @@ static inline int sdma_txadd_kvaddr(
 			dd, SDMA_MAP_SINGLE, tx, addr, len);
 }
 
-struct iowait;
+struct iowait_work;
 
 int sdma_send_txreq(struct sdma_engine *sde,
-		    struct iowait *wait,
+		    struct iowait_work *wait,
 		    struct sdma_txreq *tx,
 		    bool pkts_sent);
 int sdma_send_txlist(struct sdma_engine *sde,
-		     struct iowait *wait,
+		     struct iowait_work *wait,
 		     struct list_head *tx_list,
-		     u32 *count);
+		     u16 *count_out);
 
 int sdma_ahg_alloc(struct sdma_engine *sde);
 void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);

+ 32 - 37
drivers/infiniband/hw/hfi1/sysfs.c

@@ -494,17 +494,18 @@ static struct kobj_type hfi1_vl2mtu_ktype = {
  * Start of per-unit (or driver, in some cases, but replicated
  * per unit) functions (these get a device *)
  */
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
+			   char *buf)
 {
 	struct hfi1_ibdev *dev =
 		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
 
 	return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t board_id_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
 		container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
@@ -517,8 +518,9 @@ static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
 		ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
 	return ret;
 }
+static DEVICE_ATTR_RO(board_id);
 
-static ssize_t show_boardversion(struct device *device,
+static ssize_t boardversion_show(struct device *device,
 				 struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
@@ -528,8 +530,9 @@ static ssize_t show_boardversion(struct device *device,
 	/* The string printed here is already newline-terminated. */
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
 }
+static DEVICE_ATTR_RO(boardversion);
 
-static ssize_t show_nctxts(struct device *device,
+static ssize_t nctxts_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
@@ -546,8 +549,9 @@ static ssize_t show_nctxts(struct device *device,
 			 min(dd->num_user_contexts,
 			     (u32)dd->sc_sizes[SC_USER].count));
 }
+static DEVICE_ATTR_RO(nctxts);
 
-static ssize_t show_nfreectxts(struct device *device,
+static ssize_t nfreectxts_show(struct device *device,
 			       struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
@@ -557,8 +561,9 @@ static ssize_t show_nfreectxts(struct device *device,
 	/* Return the number of free user ports (contexts) available. */
 	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
 }
+static DEVICE_ATTR_RO(nfreectxts);
 
-static ssize_t show_serial(struct device *device,
+static ssize_t serial_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
@@ -567,8 +572,9 @@ static ssize_t show_serial(struct device *device,
 
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
 }
+static DEVICE_ATTR_RO(serial);
 
-static ssize_t store_chip_reset(struct device *device,
+static ssize_t chip_reset_store(struct device *device,
 				struct device_attribute *attr, const char *buf,
 				size_t count)
 {
@@ -586,6 +592,7 @@ static ssize_t store_chip_reset(struct device *device,
 bail:
 	return ret < 0 ? ret : count;
 }
+static DEVICE_ATTR_WO(chip_reset);
 
 /*
  * Convert the reported temperature from an integer (reported in
@@ -598,7 +605,7 @@ bail:
 /*
  * Dump tempsense values, in decimal, to ease shell-scripts.
  */
-static ssize_t show_tempsense(struct device *device,
+static ssize_t tempsense_show(struct device *device,
 			      struct device_attribute *attr, char *buf)
 {
 	struct hfi1_ibdev *dev =
@@ -622,6 +629,7 @@ static ssize_t show_tempsense(struct device *device,
 	}
 	return ret;
 }
+static DEVICE_ATTR_RO(tempsense);
 
 /*
  * end of per-unit (or driver, in some cases, but replicated
@@ -629,24 +637,20 @@ static ssize_t show_tempsense(struct device *device,
  */
 
 /* start of per-unit file structures and support code */
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
-static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
-static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
-static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
-
-static struct device_attribute *hfi1_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_board_id,
-	&dev_attr_nctxts,
-	&dev_attr_nfreectxts,
-	&dev_attr_serial,
-	&dev_attr_boardversion,
-	&dev_attr_tempsense,
-	&dev_attr_chip_reset,
+static struct attribute *hfi1_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_board_id.attr,
+	&dev_attr_nctxts.attr,
+	&dev_attr_nfreectxts.attr,
+	&dev_attr_serial.attr,
+	&dev_attr_boardversion.attr,
+	&dev_attr_tempsense.attr,
+	&dev_attr_chip_reset.attr,
+	NULL,
+};
+
+const struct attribute_group ib_hfi1_attr_group = {
+	.attrs = hfi1_attributes,
 };
 
 int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
@@ -832,12 +836,6 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
 	struct device *class_dev = &dev->dev;
 	int i, j, ret;
 
-	for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
-		ret = device_create_file(&dev->dev, hfi1_attributes[i]);
-		if (ret)
-			goto bail;
-	}
-
 	for (i = 0; i < dd->num_sdma; i++) {
 		ret = kobject_init_and_add(&dd->per_sdma[i].kobj,
 					   &sde_ktype, &class_dev->kobj,
@@ -855,9 +853,6 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
 
 	return 0;
 bail:
-	for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
-		device_remove_file(&dev->dev, hfi1_attributes[i]);
-
 	for (i = 0; i < dd->num_sdma; i++)
 		kobject_del(&dd->per_sdma[i].kobj);
 

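The sysfs conversion swaps eight device_create_file() calls for a single attribute_group built from DEVICE_ATTR_RO()/DEVICE_ATTR_WO() declarations; hfi1's group is handed to the RDMA core elsewhere in this series rather than registered by the driver itself. Below is the generic shape of the pattern with made-up attribute names; a driver that owns its own struct device would register the group directly, as shown at the end.

#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t example_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	return sprintf(buf, "%d\n", 42);
}
static DEVICE_ATTR_RO(example);	/* generates dev_attr_example with .show = example_show */

static struct attribute *example_attrs[] = {
	&dev_attr_example.attr,
	NULL,
};

static const struct attribute_group example_attr_group = {
	.attrs = example_attrs,
};

static int example_register(struct device *dev)
{
	return sysfs_create_group(&dev->kobj, &example_attr_group);
}
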
+ 2 - 1
drivers/infiniband/hw/hfi1/trace.h

@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -62,3 +62,4 @@ __print_symbolic(etype,                         \
 #include "trace_rx.h"
 #include "trace_tx.h"
 #include "trace_mmu.h"
+#include "trace_iowait.h"

+ 54 - 0
drivers/infiniband/hw/hfi1/trace_iowait.h

@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#if !defined(__HFI1_TRACE_IOWAIT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_IOWAIT_H
+
+#include <linux/tracepoint.h>
+#include "iowait.h"
+#include "verbs.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_iowait
+
+DECLARE_EVENT_CLASS(hfi1_iowait_template,
+		    TP_PROTO(struct iowait *wait, u32 flag),
+		    TP_ARGS(wait, flag),
+		    TP_STRUCT__entry(/* entry */
+			    __field(unsigned long, addr)
+			    __field(unsigned long, flags)
+			    __field(u32, flag)
+			    __field(u32, qpn)
+			    ),
+		    TP_fast_assign(/* assign */
+			    __entry->addr = (unsigned long)wait;
+			    __entry->flags = wait->flags;
+			    __entry->flag = (1 << flag);
+			    __entry->qpn = iowait_to_qp(wait)->ibqp.qp_num;
+			    ),
+		    TP_printk(/* print */
+			    "iowait 0x%lx qp %u flags 0x%lx flag 0x%x",
+			    __entry->addr,
+			    __entry->qpn,
+			    __entry->flags,
+			    __entry->flag
+			    )
+	);
+
+DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_set,
+	     TP_PROTO(struct iowait *wait, u32 flag),
+	     TP_ARGS(wait, flag));
+
+DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_clear,
+	     TP_PROTO(struct iowait *wait, u32 flag),
+	     TP_ARGS(wait, flag));
+
+#endif /* __HFI1_TRACE_IOWAIT_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_iowait
+#include <trace/define_trace.h>

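The event class records the iowait pointer, its current flag word, the single bit being changed, and the owning QP number; the two DEFINE_EVENT()s generate trace_hfi1_iowait_set() and trace_hfi1_iowait_clear() for callers to fire. A minimal, hypothetical call site using the IOWAIT_PENDING_IB bit introduced in qp.c above:

/* Emit the "set" tracepoint when parking a pending-send flag. */
static void example_note_pending(struct hfi1_qp_priv *priv)
{
	trace_hfi1_iowait_set(&priv->s_iowait, IOWAIT_PENDING_IB);
}
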
+ 7 - 7
drivers/infiniband/hw/hfi1/uc.c

@@ -88,7 +88,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		}
 		clear_ahg(qp);
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
 		goto done_free_tx;
 	}
 
@@ -140,7 +140,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 					qp, wqe->wr.ex.invalidate_rkey);
 				local_ops = 1;
 			}
-			hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
+			rvt_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
 							: IB_WC_SUCCESS);
 			if (local_ops)
 				atomic_dec(&qp->local_ops_pending);
@@ -426,7 +426,7 @@ send_first:
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto rewind;
-		hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false);
 		break;
 
 	case OP(SEND_LAST_WITH_IMMEDIATE):
@@ -449,7 +449,7 @@ send_last:
 		if (unlikely(wc.byte_len > qp->r_len))
 			goto rewind;
 		wc.opcode = IB_WC_RECV;
-		hfi1_copy_sge(&qp->r_sge, data, tlen, false, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false);
 		rvt_put_ss(&qp->s_rdma_read_sge);
 last_imm:
 		wc.wr_id = qp->r_wr_id;
@@ -523,7 +523,7 @@ rdma_first:
 		qp->r_rcv_len += pmtu;
 		if (unlikely(qp->r_rcv_len > qp->r_len))
 			goto drop;
-		hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
 		break;
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -550,7 +550,7 @@ rdma_last_imm:
 		}
 		wc.byte_len = qp->r_len;
 		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-		hfi1_copy_sge(&qp->r_sge, data, tlen, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
 		rvt_put_ss(&qp->r_sge);
 		goto last_imm;
 
@@ -564,7 +564,7 @@ rdma_last:
 		tlen -= (hdrsize + extra_bytes);
 		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
 			goto drop;
-		hfi1_copy_sge(&qp->r_sge, data, tlen, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
 		rvt_put_ss(&qp->r_sge);
 		break;
 

+ 11 - 11
drivers/infiniband/hw/hfi1/ud.c

@@ -210,8 +210,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 		}
 
 		hfi1_make_grh(ibp, &grh, &grd, 0, 0);
-		hfi1_copy_sge(&qp->r_sge, &grh,
-			      sizeof(grh), true, false);
+		rvt_copy_sge(qp, &qp->r_sge, &grh,
+			     sizeof(grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 	} else {
 		rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true);
@@ -228,7 +228,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 		if (len > sge->sge_length)
 			len = sge->sge_length;
 		WARN_ON_ONCE(len == 0);
-		hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false);
+		rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false);
 		sge->vaddr += len;
 		sge->length -= len;
 		sge->sge_length -= len;
@@ -518,7 +518,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			goto bail;
 		}
 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
-		hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
 		goto done_free_tx;
 	}
 
@@ -560,7 +560,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			ud_loopback(qp, wqe);
 			spin_lock_irqsave(&qp->s_lock, tflags);
 			ps->flags = tflags;
-			hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+			rvt_send_complete(qp, wqe, IB_WC_SUCCESS);
 			goto done_free_tx;
 		}
 	}
@@ -1019,8 +1019,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
 		goto drop;
 	}
 	if (packet->grh) {
-		hfi1_copy_sge(&qp->r_sge, packet->grh,
-			      sizeof(struct ib_grh), true, false);
+		rvt_copy_sge(qp, &qp->r_sge, packet->grh,
+			     sizeof(struct ib_grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 	} else if (packet->etype == RHF_RCV_TYPE_BYPASS) {
 		struct ib_grh grh;
@@ -1030,14 +1030,14 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
 		 * out when creating 16B, add back the GRH here.
 		 */
 		hfi1_make_ext_grh(packet, &grh, slid, dlid);
-		hfi1_copy_sge(&qp->r_sge, &grh,
-			      sizeof(struct ib_grh), true, false);
+		rvt_copy_sge(qp, &qp->r_sge, &grh,
+			     sizeof(struct ib_grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 	} else {
 		rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true);
 	}
-	hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
-		      true, false);
+	rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
+		     true, false);
 	rvt_put_ss(&qp->r_sge);
 	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
 		return;

+ 64 - 73
drivers/infiniband/hw/hfi1/user_sdma.c

@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -76,8 +76,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
 
 static unsigned initial_pkt_count = 8;
 
-static int user_sdma_send_pkts(struct user_sdma_request *req,
-			       unsigned maxpkts);
+static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
@@ -101,7 +100,7 @@ static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
 
 static int defer_packet_queue(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *txreq,
 	uint seq,
 	bool pkts_sent);
@@ -124,13 +123,13 @@ static struct mmu_rb_ops sdma_rb_ops = {
 
 static int defer_packet_queue(
 	struct sdma_engine *sde,
-	struct iowait *wait,
+	struct iowait_work *wait,
 	struct sdma_txreq *txreq,
 	uint seq,
 	bool pkts_sent)
 {
 	struct hfi1_user_sdma_pkt_q *pq =
-		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
 	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
 	struct user_sdma_txreq *tx =
 		container_of(txreq, struct user_sdma_txreq, txreq);
@@ -187,13 +186,12 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 	pq->ctxt = uctxt->ctxt;
 	pq->subctxt = fd->subctxt;
 	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
-	pq->state = SDMA_PKT_Q_INACTIVE;
 	atomic_set(&pq->n_reqs, 0);
 	init_waitqueue_head(&pq->wait);
 	atomic_set(&pq->n_locked, 0);
 	pq->mm = fd->mm;
 
-	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
+	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 		    activate_packet_queue, NULL);
 	pq->reqidx = 0;
 
@@ -276,7 +274,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 		/* Wait until all requests have been freed. */
 		wait_event_interruptible(
 			pq->wait,
-			(READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
+			!atomic_read(&pq->n_reqs));
 		kfree(pq->reqs);
 		kfree(pq->req_in_use);
 		kmem_cache_destroy(pq->txreq_cache);
@@ -312,6 +310,13 @@ static u8 dlid_to_selector(u16 dlid)
 	return mapping[hash];
 }
 
+/**
+ * hfi1_user_sdma_process_request() - Process and start a user sdma request
+ * @fd: valid file descriptor
+ * @iovec: array of io vectors to process
+ * @dim: overall iovec array size
+ * @count: number of io vector array entries processed
+ */
 int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 				   struct iovec *iovec, unsigned long dim,
 				   unsigned long *count)
@@ -328,7 +333,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 	u8 opcode, sc, vl;
 	u16 pkey;
 	u32 slid;
-	int req_queued = 0;
 	u16 dlid;
 	u32 selector;
 
@@ -392,7 +396,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 	req->data_len  = 0;
 	req->pq = pq;
 	req->cq = cq;
-	req->status = -1;
 	req->ahg_idx = -1;
 	req->iov_idx = 0;
 	req->sent = 0;
@@ -400,12 +403,14 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 	req->seqcomp = 0;
 	req->seqsubmitted = 0;
 	req->tids = NULL;
-	req->done = 0;
 	req->has_error = 0;
 	INIT_LIST_HEAD(&req->txps);
 
 	memcpy(&req->info, &info, sizeof(info));
 
+	/* The request is initialized, count it */
+	atomic_inc(&pq->n_reqs);
+
 	if (req_opcode(info.ctrl) == EXPECTED) {
 		/* expected must have a TID info and at least one data vector */
 		if (req->data_iovs < 2) {
@@ -500,7 +505,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 		ret = pin_vector_pages(req, &req->iovs[i]);
 		if (ret) {
 			req->data_iovs = i;
-			req->status = ret;
 			goto free_req;
 		}
 		req->data_len += req->iovs[i].iov.iov_len;
@@ -561,23 +565,11 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 		req->ahg_idx = sdma_ahg_alloc(req->sde);
 
 	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
-	atomic_inc(&pq->n_reqs);
-	req_queued = 1;
+	pq->state = SDMA_PKT_Q_ACTIVE;
 	/* Send the first N packets in the request to buy us some time */
 	ret = user_sdma_send_pkts(req, pcount);
-	if (unlikely(ret < 0 && ret != -EBUSY)) {
-		req->status = ret;
+	if (unlikely(ret < 0 && ret != -EBUSY))
 		goto free_req;
-	}
-
-	/*
-	 * It is possible that the SDMA engine would have processed all the
-	 * submitted packets by the time we get here. Therefore, only set
-	 * packet queue state to ACTIVE if there are still uncompleted
-	 * requests.
-	 */
-	if (atomic_read(&pq->n_reqs))
-		xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 
 	/*
 	 * This is a somewhat blocking send implementation.
@@ -588,14 +580,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 	while (req->seqsubmitted != req->info.npkts) {
 		ret = user_sdma_send_pkts(req, pcount);
 		if (ret < 0) {
-			if (ret != -EBUSY) {
-				req->status = ret;
-				WRITE_ONCE(req->has_error, 1);
-				if (READ_ONCE(req->seqcomp) ==
-				    req->seqsubmitted - 1)
-					goto free_req;
-				return ret;
-			}
+			if (ret != -EBUSY)
+				goto free_req;
 			wait_event_interruptible_timeout(
 				pq->busy.wait_dma,
 				(pq->state == SDMA_PKT_Q_ACTIVE),
@@ -606,10 +592,19 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 	*count += idx;
 	return 0;
 free_req:
-	user_sdma_free_request(req, true);
-	if (req_queued)
+	/*
+	 * If seqsubmitted == npkts, the completion routine controls the
+	 * final state.  If seqsubmitted < npkts, wait for any
+	 * outstanding packets to finish before cleaning up.
+	 */
+	if (req->seqsubmitted < req->info.npkts) {
+		if (req->seqsubmitted)
+			wait_event(pq->busy.wait_dma,
+				   (req->seqcomp == req->seqsubmitted - 1));
+		user_sdma_free_request(req, true);
 		pq_update(pq);
-	set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
+		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
+	}
 	return ret;
 }
 
@@ -760,9 +755,10 @@ static int user_sdma_txadd(struct user_sdma_request *req,
 	return ret;
 }
 
-static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 {
-	int ret = 0, count;
+	int ret = 0;
+	u16 count;
 	unsigned npkts = 0;
 	struct user_sdma_txreq *tx = NULL;
 	struct hfi1_user_sdma_pkt_q *pq = NULL;
@@ -864,8 +860,10 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 
 				changes = set_txreq_header_ahg(req, tx,
 							       datalen);
-				if (changes < 0)
+				if (changes < 0) {
+					ret = changes;
 					goto free_tx;
+				}
 			}
 		} else {
 			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
@@ -914,10 +912,11 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 		npkts++;
 	}
 dosend:
-	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
+	ret = sdma_send_txlist(req->sde,
+			       iowait_get_ib_work(&pq->busy),
+			       &req->txps, &count);
 	req->seqsubmitted += count;
 	if (req->seqsubmitted == req->info.npkts) {
-		WRITE_ONCE(req->done, 1);
 		/*
 		 * The txreq has already been submitted to the HW queue
 		 * so we can free the AHG entry now. Corruption will not
@@ -1365,11 +1364,15 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 	return idx;
 }
 
-/*
- * SDMA tx request completion callback. Called when the SDMA progress
- * state machine gets notification that the SDMA descriptors for this
- * tx request have been processed by the DMA engine. Called in
- * interrupt context.
+/**
+ * user_sdma_txreq_cb() - SDMA tx request completion callback.
+ * @txreq: valid sdma tx request
+ * @status: success/failure of request
+ *
+ * Called when the SDMA progress state machine gets notification that
+ * the SDMA descriptors for this tx request have been processed by the
+ * DMA engine. Called in interrupt context.
+ * Only do work on completed sequences.
  */
 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
 {
@@ -1378,7 +1381,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
 	struct user_sdma_request *req;
 	struct hfi1_user_sdma_pkt_q *pq;
 	struct hfi1_user_sdma_comp_q *cq;
-	u16 idx;
+	enum hfi1_sdma_comp_state state = COMPLETE;
 
 	if (!tx->req)
 		return;
@@ -1391,39 +1394,25 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
 		SDMA_DBG(req, "SDMA completion with error %d",
 			 status);
 		WRITE_ONCE(req->has_error, 1);
+		state = ERROR;
 	}
 
 	req->seqcomp = tx->seqnum;
 	kmem_cache_free(pq->txreq_cache, tx);
-	tx = NULL;
-
-	idx = req->info.comp_idx;
-	if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
-		if (req->seqcomp == req->info.npkts - 1) {
-			req->status = 0;
-			user_sdma_free_request(req, false);
-			pq_update(pq);
-			set_comp_state(pq, cq, idx, COMPLETE, 0);
-		}
-	} else {
-		if (status != SDMA_TXREQ_S_OK)
-			req->status = status;
-		if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) &&
-		    (READ_ONCE(req->done) ||
-		     READ_ONCE(req->has_error))) {
-			user_sdma_free_request(req, false);
-			pq_update(pq);
-			set_comp_state(pq, cq, idx, ERROR, req->status);
-		}
-	}
+
+	/* sequence isn't complete?  We are done */
+	if (req->seqcomp != req->info.npkts - 1)
+		return;
+
+	user_sdma_free_request(req, false);
+	set_comp_state(pq, cq, req->info.comp_idx, state, status);
+	pq_update(pq);
 }
 
 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
 {
-	if (atomic_dec_and_test(&pq->n_reqs)) {
-		xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
+	if (atomic_dec_and_test(&pq->n_reqs))
 		wake_up(&pq->wait);
-	}
 }
 
 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
@@ -1448,6 +1437,8 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
 		if (!node)
 			continue;
 
+		req->iovs[i].node = NULL;
+
 		if (unpin)
 			hfi1_mmu_rb_remove(req->pq->handler,
 					   &node->rb);

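The user_sdma.c rework above removes the SDMA_PKT_Q_INACTIVE state and the per-request status/done fields: each request takes a reference on pq->n_reqs as soon as it is initialized, the completion path drops it in pq_update(), and hfi1_user_sdma_free_queues() simply waits for the count to reach zero. A minimal user-space sketch of that reference-count-plus-wait teardown, with pthreads standing in for the kernel waitqueue (all names here are illustrative):

#include <pthread.h>
#include <stdatomic.h>

struct pkt_q {
	atomic_int	n_reqs;		/* outstanding requests */
	pthread_mutex_t	lock;
	pthread_cond_t	idle;		/* signalled when n_reqs drops to 0 */
};

#define PKT_Q_INIT { 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

static void submit_request(struct pkt_q *pq)
{
	atomic_fetch_add(&pq->n_reqs, 1);	/* like atomic_inc(&pq->n_reqs) */
	/* ... hand the request to the send engine ... */
}

static void complete_request(struct pkt_q *pq)	/* completion callback */
{
	if (atomic_fetch_sub(&pq->n_reqs, 1) == 1) {	/* last request done */
		pthread_mutex_lock(&pq->lock);
		pthread_cond_broadcast(&pq->idle);	/* like wake_up(&pq->wait) */
		pthread_mutex_unlock(&pq->lock);
	}
}

static void free_queue(struct pkt_q *pq)
{
	pthread_mutex_lock(&pq->lock);
	while (atomic_load(&pq->n_reqs))	/* like wait_event(..., !n_reqs) */
		pthread_cond_wait(&pq->idle, &pq->lock);
	pthread_mutex_unlock(&pq->lock);
	/* now safe to free reqs, the txreq cache, etc. */
}

The kernel version gets the same no-missed-wakeup guarantee from wait_event_interruptible() ordering against wake_up(); the mutex/condvar pair above only exists because user space has no waitqueue.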
+ 9 - 11
drivers/infiniband/hw/hfi1/user_sdma.h

@@ -105,9 +105,10 @@ static inline int ahg_header_set(u32 *arr, int idx, size_t array_size,
 #define TXREQ_FLAGS_REQ_ACK   BIT(0)      /* Set the ACK bit in the header */
 #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */
 
-#define SDMA_PKT_Q_INACTIVE BIT(0)
-#define SDMA_PKT_Q_ACTIVE   BIT(1)
-#define SDMA_PKT_Q_DEFERRED BIT(2)
+enum pkt_q_sdma_state {
+	SDMA_PKT_Q_ACTIVE,
+	SDMA_PKT_Q_DEFERRED,
+};
 
 /*
  * Maximum retry attempts to submit a TX request
@@ -133,7 +134,7 @@ struct hfi1_user_sdma_pkt_q {
 	struct user_sdma_request *reqs;
 	unsigned long *req_in_use;
 	struct iowait busy;
-	unsigned state;
+	enum pkt_q_sdma_state state;
 	wait_queue_head_t wait;
 	unsigned long unpinned;
 	struct mmu_rb_handler *handler;
@@ -203,14 +204,12 @@ struct user_sdma_request {
 	s8 ahg_idx;
 
 	/* Writeable fields shared with interrupt */
-	u64 seqcomp ____cacheline_aligned_in_smp;
-	u64 seqsubmitted;
-	/* status of the last txreq completed */
-	int status;
+	u16 seqcomp ____cacheline_aligned_in_smp;
+	u16 seqsubmitted;
 
 	/* Send side fields */
 	struct list_head txps ____cacheline_aligned_in_smp;
-	u64 seqnum;
+	u16 seqnum;
 	/*
 	 * KDETH.OFFSET (TID) field
 	 * The offset can cover multiple packets, depending on the
@@ -228,7 +227,6 @@ struct user_sdma_request {
 	u16 tididx;
 	/* progress index moving along the iovs array */
 	u8 iov_idx;
-	u8 done;
 	u8 has_error;
 
 	struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
@@ -248,7 +246,7 @@ struct user_sdma_txreq {
 	struct user_sdma_request *req;
 	u16 flags;
 	unsigned int busycount;
-	u64 seqnum;
+	u16 seqnum;
 };
 
 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,

+ 19 - 232
drivers/infiniband/hw/hfi1/verbs.c

@@ -129,8 +129,6 @@ unsigned short piothreshold = 256;
 module_param(piothreshold, ushort, S_IRUGO);
 MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
 
-#define COPY_CACHELESS 1
-#define COPY_ADAPTIVE  2
 static unsigned int sge_copy_mode;
 module_param(sge_copy_mode, uint, S_IRUGO);
 MODULE_PARM_DESC(sge_copy_mode,
@@ -151,159 +149,13 @@ static int pio_wait(struct rvt_qp *qp,
 /* 16B trailing buffer */
 static const u8 trail_buf[MAX_16B_PADDING];
 
-static uint wss_threshold;
+static uint wss_threshold = 80;
 module_param(wss_threshold, uint, S_IRUGO);
 MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
 static uint wss_clean_period = 256;
 module_param(wss_clean_period, uint, S_IRUGO);
 MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
 
-/* memory working set size */
-struct hfi1_wss {
-	unsigned long *entries;
-	atomic_t total_count;
-	atomic_t clean_counter;
-	atomic_t clean_entry;
-
-	int threshold;
-	int num_entries;
-	long pages_mask;
-};
-
-static struct hfi1_wss wss;
-
-int hfi1_wss_init(void)
-{
-	long llc_size;
-	long llc_bits;
-	long table_size;
-	long table_bits;
-
-	/* check for a valid percent range - default to 80 if none or invalid */
-	if (wss_threshold < 1 || wss_threshold > 100)
-		wss_threshold = 80;
-	/* reject a wildly large period */
-	if (wss_clean_period > 1000000)
-		wss_clean_period = 256;
-	/* reject a zero period */
-	if (wss_clean_period == 0)
-		wss_clean_period = 1;
-
-	/*
-	 * Calculate the table size - the next power of 2 larger than the
-	 * LLC size.  LLC size is in KiB.
-	 */
-	llc_size = wss_llc_size() * 1024;
-	table_size = roundup_pow_of_two(llc_size);
-
-	/* one bit per page in rounded up table */
-	llc_bits = llc_size / PAGE_SIZE;
-	table_bits = table_size / PAGE_SIZE;
-	wss.pages_mask = table_bits - 1;
-	wss.num_entries = table_bits / BITS_PER_LONG;
-
-	wss.threshold = (llc_bits * wss_threshold) / 100;
-	if (wss.threshold == 0)
-		wss.threshold = 1;
-
-	atomic_set(&wss.clean_counter, wss_clean_period);
-
-	wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
-			      GFP_KERNEL);
-	if (!wss.entries) {
-		hfi1_wss_exit();
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-void hfi1_wss_exit(void)
-{
-	/* coded to handle partially initialized and repeat callers */
-	kfree(wss.entries);
-	wss.entries = NULL;
-}
-
-/*
- * Advance the clean counter.  When the clean period has expired,
- * clean an entry.
- *
- * This is implemented in atomics to avoid locking.  Because multiple
- * variables are involved, it can be racy which can lead to slightly
- * inaccurate information.  Since this is only a heuristic, this is
- * OK.  Any innaccuracies will clean themselves out as the counter
- * advances.  That said, it is unlikely the entry clean operation will
- * race - the next possible racer will not start until the next clean
- * period.
- *
- * The clean counter is implemented as a decrement to zero.  When zero
- * is reached an entry is cleaned.
- */
-static void wss_advance_clean_counter(void)
-{
-	int entry;
-	int weight;
-	unsigned long bits;
-
-	/* become the cleaner if we decrement the counter to zero */
-	if (atomic_dec_and_test(&wss.clean_counter)) {
-		/*
-		 * Set, not add, the clean period.  This avoids an issue
-		 * where the counter could decrement below the clean period.
-		 * Doing a set can result in lost decrements, slowing the
-		 * clean advance.  Since this a heuristic, this possible
-		 * slowdown is OK.
-		 *
-		 * An alternative is to loop, advancing the counter by a
-		 * clean period until the result is > 0. However, this could
-		 * lead to several threads keeping another in the clean loop.
-		 * This could be mitigated by limiting the number of times
-		 * we stay in the loop.
-		 */
-		atomic_set(&wss.clean_counter, wss_clean_period);
-
-		/*
-		 * Uniquely grab the entry to clean and move to next.
-		 * The current entry is always the lower bits of
-		 * wss.clean_entry.  The table size, wss.num_entries,
-		 * is always a power-of-2.
-		 */
-		entry = (atomic_inc_return(&wss.clean_entry) - 1)
-			& (wss.num_entries - 1);
-
-		/* clear the entry and count the bits */
-		bits = xchg(&wss.entries[entry], 0);
-		weight = hweight64((u64)bits);
-		/* only adjust the contended total count if needed */
-		if (weight)
-			atomic_sub(weight, &wss.total_count);
-	}
-}
-
-/*
- * Insert the given address into the working set array.
- */
-static void wss_insert(void *address)
-{
-	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
-	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
-	u32 nr = page & (BITS_PER_LONG - 1);
-
-	if (!test_and_set_bit(nr, &wss.entries[entry]))
-		atomic_inc(&wss.total_count);
-
-	wss_advance_clean_counter();
-}
-
-/*
- * Is the working set larger than the threshold?
- */
-static inline bool wss_exceeds_threshold(void)
-{
-	return atomic_read(&wss.total_count) >= wss.threshold;
-}
-
 /*
  * Translate ib_wr_opcode into ib_wc_opcode.
  */
@@ -438,79 +290,6 @@ static const u32 pio_opmask[BIT(3)] = {
  */
 __be64 ib_hfi1_sys_image_guid;
 
-/**
- * hfi1_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- * @release: boolean to release MR
- * @copy_last: do a separate copy of the last 8 bytes
- */
-void hfi1_copy_sge(
-	struct rvt_sge_state *ss,
-	void *data, u32 length,
-	bool release,
-	bool copy_last)
-{
-	struct rvt_sge *sge = &ss->sge;
-	int i;
-	bool in_last = false;
-	bool cacheless_copy = false;
-
-	if (sge_copy_mode == COPY_CACHELESS) {
-		cacheless_copy = length >= PAGE_SIZE;
-	} else if (sge_copy_mode == COPY_ADAPTIVE) {
-		if (length >= PAGE_SIZE) {
-			/*
-			 * NOTE: this *assumes*:
-			 * o The first vaddr is the dest.
-			 * o If multiple pages, then vaddr is sequential.
-			 */
-			wss_insert(sge->vaddr);
-			if (length >= (2 * PAGE_SIZE))
-				wss_insert(sge->vaddr + PAGE_SIZE);
-
-			cacheless_copy = wss_exceeds_threshold();
-		} else {
-			wss_advance_clean_counter();
-		}
-	}
-	if (copy_last) {
-		if (length > 8) {
-			length -= 8;
-		} else {
-			copy_last = false;
-			in_last = true;
-		}
-	}
-
-again:
-	while (length) {
-		u32 len = rvt_get_sge_length(sge, length);
-
-		WARN_ON_ONCE(len == 0);
-		if (unlikely(in_last)) {
-			/* enforce byte transfer ordering */
-			for (i = 0; i < len; i++)
-				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
-		} else if (cacheless_copy) {
-			cacheless_memcpy(sge->vaddr, data, len);
-		} else {
-			memcpy(sge->vaddr, data, len);
-		}
-		rvt_update_sge(ss, len, release);
-		data += len;
-		length -= len;
-	}
-
-	if (copy_last) {
-		copy_last = false;
-		in_last = true;
-		length = 8;
-		goto again;
-	}
-}
-
 /*
  * Make sure the QP is ready and able to accept the given opcode.
  */
@@ -713,7 +492,7 @@ static void verbs_sdma_complete(
 
 	spin_lock(&qp->s_lock);
 	if (tx->wqe) {
-		hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+		rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
 	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
 		struct hfi1_opa_header *hdr;
 
@@ -737,7 +516,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 		write_seqlock(&dev->iowait_lock);
 		list_add_tail(&ps->s_txreq->txreq.list,
-			      &priv->s_iowait.tx_head);
+			      &ps->wait->tx_head);
 		if (list_empty(&priv->s_iowait.list)) {
 			if (list_empty(&dev->memwait))
 				mod_timer(&dev->mem_timer, jiffies + 1);
@@ -748,7 +527,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
 			rvt_get_qp(qp);
 		}
 		write_sequnlock(&dev->iowait_lock);
-		qp->s_flags &= ~RVT_S_BUSY;
+		hfi1_qp_unbusy(qp, ps->wait);
 		ret = -EBUSY;
 	}
 	spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -950,8 +729,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 		if (unlikely(ret))
 			goto bail_build;
 	}
-	ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq,
-			       ps->pkts_sent);
+	ret =  sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent);
 	if (unlikely(ret < 0)) {
 		if (ret == -ECOMM)
 			goto bail_ecomm;
@@ -1001,7 +779,7 @@ static int pio_wait(struct rvt_qp *qp,
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 		write_seqlock(&dev->iowait_lock);
 		list_add_tail(&ps->s_txreq->txreq.list,
-			      &priv->s_iowait.tx_head);
+			      &ps->wait->tx_head);
 		if (list_empty(&priv->s_iowait.list)) {
 			struct hfi1_ibdev *dev = &dd->verbs_dev;
 			int was_empty;
@@ -1020,7 +798,7 @@ static int pio_wait(struct rvt_qp *qp,
 				hfi1_sc_wantpiobuf_intr(sc, 1);
 		}
 		write_sequnlock(&dev->iowait_lock);
-		qp->s_flags &= ~RVT_S_BUSY;
+		hfi1_qp_unbusy(qp, ps->wait);
 		ret = -EBUSY;
 	}
 	spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -1160,7 +938,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 pio_bail:
 	if (qp->s_wqe) {
 		spin_lock_irqsave(&qp->s_lock, flags);
-		hfi1_send_complete(qp, qp->s_wqe, wc_status);
+		rvt_send_complete(qp, qp->s_wqe, wc_status);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
 		spin_lock_irqsave(&qp->s_lock, flags);
@@ -1367,7 +1145,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			hfi1_cdbg(PIO, "%s() Failed. Completing with err",
 				  __func__);
 			spin_lock_irqsave(&qp->s_lock, flags);
-			hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+			rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
 			spin_unlock_irqrestore(&qp->s_lock, flags);
 		}
 		return -EINVAL;
@@ -1943,7 +1721,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
 	dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
 	dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
-	dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
+	dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
 	dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
 						hfi1_comp_vect_mappings_lookup;
 
@@ -1956,10 +1734,16 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
 	dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
 	dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+	dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
+	dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
+	dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
 
 	/* post send table */
 	dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
 
+	/* opcode translation table */
+	dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode;
+
 	ppd = dd->pport;
 	for (i = 0; i < dd->num_pports; i++, ppd++)
 		rvt_init_port(&dd->verbs_dev.rdi,
@@ -1967,6 +1751,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 			      i,
 			      ppd->pkeys);
 
+	rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
+				    &ib_hfi1_attr_group);
+
 	ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1);
 	if (ret)
 		goto err_verbs_txreq;

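The large deletion above is the hfi1-private working-set-size (wss) machinery behind the adaptive SGE copy mode; the same logic now lives in rdmavt, and hfi1 only forwards sge_copy_mode, wss_threshold and wss_clean_period through rdi.dparms. The heart of the removed heuristic, as a compilable user-space sketch (fixed table size, clean-period aging omitted, all names illustrative): mark one bit per recently written page, and once the number of marked pages exceeds a percentage of the last-level cache, prefer a cacheless (non-temporal) copy so large transfers stop evicting the cache.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define NUM_PAGES	4096				/* table slots, power of two */

static atomic_ulong wss_entries[NUM_PAGES / 64];	/* one bit per page slot */
static atomic_int wss_total;				/* pages currently marked */
static int wss_threshold_pages = 2048;			/* e.g. 80% of the LLC, in pages */

static void wss_insert(const void *addr)
{
	unsigned long page = ((uintptr_t)addr >> PAGE_SHIFT) & (NUM_PAGES - 1);
	unsigned long bit = 1UL << (page & 63);
	unsigned long old = atomic_fetch_or(&wss_entries[page / 64], bit);

	if (!(old & bit))			/* newly marked page */
		atomic_fetch_add(&wss_total, 1);
}

static bool wss_exceeds_threshold(void)
{
	return atomic_load(&wss_total) >= wss_threshold_pages;
}

/* Caller pattern, mirroring the removed adaptive mode: only large copies
 * feed the working set, and they go cacheless once it grows too big. */
static bool want_cacheless_copy(const void *dst, size_t len)
{
	if (len < PAGE_SIZE)
		return false;
	wss_insert(dst);
	if (len >= 2 * PAGE_SIZE)
		wss_insert((const char *)dst + PAGE_SIZE);
	return wss_exceeds_threshold();
}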
+ 5 - 30
drivers/infiniband/hw/hfi1/verbs.h

@@ -166,11 +166,13 @@ struct hfi1_qp_priv {
  * This structure is used to hold commonly lookedup and computed values during
  * the send engine progress.
  */
+struct iowait_work;
 struct hfi1_pkt_state {
 	struct hfi1_ibdev *dev;
 	struct hfi1_ibport *ibp;
 	struct hfi1_pportdata *ppd;
 	struct verbs_txreq *s_txreq;
+	struct iowait_work *wait;
 	unsigned long flags;
 	unsigned long timeout;
 	unsigned long timeout_int;
@@ -247,7 +249,7 @@ static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
 	return container_of(rdi, struct hfi1_ibdev, rdi);
 }
 
-static inline struct rvt_qp *iowait_to_qp(struct  iowait *s_iowait)
+static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait)
 {
 	struct hfi1_qp_priv *priv;
 
@@ -313,9 +315,6 @@ void hfi1_put_txreq(struct verbs_txreq *tx);
 
 int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
 
-void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
-		   bool release, bool copy_last);
-
 void hfi1_cnp_rcv(struct hfi1_packet *packet);
 
 void hfi1_uc_rcv(struct hfi1_packet *packet);
@@ -343,7 +342,8 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
 		    int attr_mask, struct ib_udata *udata);
 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait);
-int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe,
+		   bool *call_send);
 
 extern const u32 rc_only_opcode;
 extern const u32 uc_only_opcode;
@@ -363,9 +363,6 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp);
 
 void hfi1_do_send(struct rvt_qp *qp, bool in_thread);
 
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
-			enum ib_wc_status status);
-
 void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn);
 
 int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
@@ -390,28 +387,6 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			u64 pbc);
 
-int hfi1_wss_init(void);
-void hfi1_wss_exit(void);
-
-/* platform specific: return the lowest level cache (llc) size, in KiB */
-static inline int wss_llc_size(void)
-{
-	/* assume that the boot CPU value is universal for all CPUs */
-	return boot_cpu_data.x86_cache_size;
-}
-
-/* platform specific: cacheless copy */
-static inline void cacheless_memcpy(void *dst, void *src, size_t n)
-{
-	/*
-	 * Use the only available X64 cacheless copy.  Add a __user cast
-	 * to quiet sparse.  The src agument is already in the kernel so
-	 * there are no security issues.  The extra fault recovery machinery
-	 * is not invoked.
-	 */
-	__copy_user_nocache(dst, (void __user *)src, n, 0);
-}
-
 static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
 {
 	return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);

+ 4 - 7
drivers/infiniband/hw/hfi1/verbs_txreq.h

@@ -102,22 +102,19 @@ static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
 	return &tx->txreq;
 }
 
-static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
+static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w)
 {
 	struct sdma_txreq *stx;
-	struct hfi1_qp_priv *priv = qp->priv;
 
-	stx = iowait_get_txhead(&priv->s_iowait);
+	stx = iowait_get_txhead(w);
 	if (stx)
 		return container_of(stx, struct verbs_txreq, txreq);
 	return NULL;
 }
 
-static inline bool verbs_txreq_queued(struct rvt_qp *qp)
+static inline bool verbs_txreq_queued(struct iowait_work *w)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
-
-	return iowait_packet_queued(&priv->s_iowait);
+	return iowait_packet_queued(w);
 }
 
 void hfi1_put_txreq(struct verbs_txreq *tx);

+ 6 - 6
drivers/infiniband/hw/hfi1/vnic_main.c

@@ -120,7 +120,7 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd,
 	uctxt->seq_cnt = 1;
 	uctxt->is_vnic = true;
 
-	hfi1_set_vnic_msix_info(uctxt);
+	msix_request_rcd_irq(uctxt);
 
 	hfi1_stats.sps_ctxts++;
 	dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt);
@@ -135,8 +135,6 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
 	dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt);
 	flush_wc();
 
-	hfi1_reset_vnic_msix_info(uctxt);
-
 	/*
 	 * Disable receive context and interrupt available, reset all
 	 * RcvCtxtCtrl bits to default values.
@@ -148,6 +146,10 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
 		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
 
+	/* msix_intr will always be > 0, only clean up if this is true */
+	if (uctxt->msix_intr)
+		msix_free_irq(dd, uctxt->msix_intr);
+
 	uctxt->event_flags = 0;
 
 	hfi1_clear_tids(uctxt);
@@ -626,7 +628,7 @@ static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo)
 	idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id);
 
 	/* ensure irqs see the change */
-	hfi1_vnic_synchronize_irq(dd);
+	msix_vnic_synchronize_irq(dd);
 
 	/* remove unread skbs */
 	for (i = 0; i < vinfo->num_rx_q; i++) {
@@ -690,8 +692,6 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo)
 		rc = hfi1_vnic_txreq_init(dd);
 		if (rc)
 			goto txreq_fail;
-
-		dd->vnic.msix_idx = dd->first_dyn_msix_idx;
 	}
 
 	for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) {

+ 12 - 9
drivers/infiniband/hw/hfi1/vnic_sdma.c

@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2017 Intel Corporation.
+ * Copyright(c) 2017 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -198,8 +198,8 @@ int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx,
 		goto free_desc;
 	tx->retry_count = 0;
 
-	ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq,
-			      vnic_sdma->pkts_sent);
+	ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait),
+			      &tx->txreq, vnic_sdma->pkts_sent);
 	/* When -ECOMM, sdma callback will be called with ABORT status */
 	if (unlikely(ret && unlikely(ret != -ECOMM)))
 		goto free_desc;
@@ -230,13 +230,13 @@ tx_err:
  * become available.
  */
 static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
-				struct iowait *wait,
+				struct iowait_work *wait,
 				struct sdma_txreq *txreq,
 				uint seq,
 				bool pkts_sent)
 {
 	struct hfi1_vnic_sdma *vnic_sdma =
-		container_of(wait, struct hfi1_vnic_sdma, wait);
+		container_of(wait->iow, struct hfi1_vnic_sdma, wait);
 	struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev;
 	struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq);
 
@@ -247,7 +247,7 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
 	vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
 	write_seqlock(&dev->iowait_lock);
 	if (list_empty(&vnic_sdma->wait.list))
-		iowait_queue(pkts_sent, wait, &sde->dmawait);
+		iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
 	write_sequnlock(&dev->iowait_lock);
 	return -EBUSY;
 }
@@ -285,7 +285,8 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo)
 	for (i = 0; i < vinfo->num_tx_q; i++) {
 		struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i];
 
-		iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep,
+		iowait_init(&vnic_sdma->wait, 0, NULL, NULL,
+			    hfi1_vnic_sdma_sleep,
 			    hfi1_vnic_sdma_wakeup, NULL);
 		vnic_sdma->sde = &vinfo->dd->per_sdma[i];
 		vnic_sdma->dd = vinfo->dd;
@@ -295,10 +296,12 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo)
 
 		/* Add a free descriptor watermark for wakeups */
 		if (vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) {
+			struct iowait_work *work;
+
 			INIT_LIST_HEAD(&vnic_sdma->stx.list);
 			vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK;
-			list_add_tail(&vnic_sdma->stx.list,
-				      &vnic_sdma->wait.tx_head);
+			work = iowait_get_ib_work(&vnic_sdma->wait);
+			list_add_tail(&vnic_sdma->stx.list, &work->tx_head);
 		}
 	}
 }

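The vnic_sdma.c changes follow the iowait/iowait_work split used throughout this series: iowait_init() gains an extra callback slot, sdma_send_txreq()/sdma_send_txlist() now take an iowait_work obtained via iowait_get_ib_work(), and sleep callbacks receive the iowait_work and must hop through its back-pointer (wait->iow) before container_of(). A sketch of just that container_of change, with made-up user-space stand-ins for the hfi1 types:

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct iowait;

struct iowait_work {
	struct iowait *iow;		/* back-pointer to the parent iowait */
	/* ... per-work tx_head list, etc. ... */
};

struct iowait {
	struct iowait_work wait[2];	/* e.g. one entry per work type */
	/* ... wait_dma, list linkage, etc. ... */
};

struct vnic_sdma {			/* stand-in for hfi1_vnic_sdma */
	struct iowait wait;
	/* ... queue state ... */
};

static struct vnic_sdma *sleep_cb(struct iowait_work *work)
{
	/* old: container_of(wait, struct hfi1_vnic_sdma, wait)     */
	/* new: follow the back-pointer first, then container_of()  */
	return container_of(work->iow, struct vnic_sdma, wait);
}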
+ 1 - 0
drivers/infiniband/hw/hns/Kconfig

@@ -1,6 +1,7 @@
 config INFINIBAND_HNS
 	tristate "HNS RoCE Driver"
 	depends on NET_VENDOR_HISILICON
+	depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
 	depends on ARM64 || (COMPILE_TEST && 64BIT)
 	---help---
 	  This is a RoCE/RDMA driver for the Hisilicon RoCE engine. The engine

+ 5 - 1
drivers/infiniband/hw/hns/hns_roce_ah.c

@@ -49,6 +49,7 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd,
 	struct hns_roce_ah *ah;
 	u16 vlan_tag = 0xffff;
 	const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
+	bool vlan_en = false;
 
 	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
 	if (!ah)
@@ -58,8 +59,10 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd,
 	memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN);
 
 	gid_attr = ah_attr->grh.sgid_attr;
-	if (is_vlan_dev(gid_attr->ndev))
+	if (is_vlan_dev(gid_attr->ndev)) {
 		vlan_tag = vlan_dev_vlan_id(gid_attr->ndev);
+		vlan_en = true;
+	}
 
 	if (vlan_tag < 0x1000)
 		vlan_tag |= (rdma_ah_get_sl(ah_attr) &
@@ -71,6 +74,7 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd,
 				     HNS_ROCE_PORT_NUM_SHIFT));
 	ah->av.gid_index = grh->sgid_index;
 	ah->av.vlan = cpu_to_le16(vlan_tag);
+	ah->av.vlan_en = vlan_en;
 	dev_dbg(dev, "gid_index = 0x%x,vlan = 0x%x\n", ah->av.gid_index,
 		ah->av.vlan);
 

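hns_roce_create_ah() now remembers whether the source GID's netdev is a VLAN device, so the hw_v2.c hunks further down can set the new UD-WQE and QPC VLAN-enable bits instead of inferring it from the tag value. The surrounding code keeps 0xffff as the "no VLAN" sentinel (it fails the < 0x1000 check) and folds the service level into the upper bits of the tag. For reference, a hedged sketch of generic 802.1Q TCI packing, not the hns macros:

#include <stdint.h>

/* 802.1Q TCI layout: [15:13] PCP (priority), [12] DEI, [11:0] VLAN ID. */
static uint16_t make_vlan_tci(uint16_t vlan_id, uint8_t prio)
{
	return (uint16_t)((vlan_id & 0x0fff) | ((uint16_t)(prio & 0x7) << 13));
}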
+ 37 - 8
drivers/infiniband/hw/hns/hns_roce_device.h

@@ -88,8 +88,11 @@
 #define BITMAP_RR				1
 
 #define MR_TYPE_MR				0x00
+#define MR_TYPE_FRMR				0x01
 #define MR_TYPE_DMA				0x03
 
+#define HNS_ROCE_FRMR_MAX_PA			512
+
 #define PKEY_ID					0xffff
 #define GUID_LEN				8
 #define NODE_DESC_SIZE				64
@@ -193,6 +196,9 @@ enum {
 	HNS_ROCE_CAP_FLAG_RQ_INLINE		= BIT(2),
 	HNS_ROCE_CAP_FLAG_RECORD_DB		= BIT(3),
 	HNS_ROCE_CAP_FLAG_SQ_RECORD_DB		= BIT(4),
+	HNS_ROCE_CAP_FLAG_MW			= BIT(7),
+	HNS_ROCE_CAP_FLAG_FRMR                  = BIT(8),
+	HNS_ROCE_CAP_FLAG_ATOMIC		= BIT(10),
 };
 
 enum hns_roce_mtt_type {
@@ -219,19 +225,11 @@ struct hns_roce_uar {
 	unsigned long	logic_idx;
 };
 
-struct hns_roce_vma_data {
-	struct list_head list;
-	struct vm_area_struct *vma;
-	struct mutex *vma_list_mutex;
-};
-
 struct hns_roce_ucontext {
 	struct ib_ucontext	ibucontext;
 	struct hns_roce_uar	uar;
 	struct list_head	page_list;
 	struct mutex		page_mutex;
-	struct list_head	vma_list;
-	struct mutex		vma_list_mutex;
 };
 
 struct hns_roce_pd {
@@ -293,6 +291,16 @@ struct hns_roce_mtt {
 	enum hns_roce_mtt_type	mtt_type;
 };
 
+struct hns_roce_mw {
+	struct ib_mw		ibmw;
+	u32			pdn;
+	u32			rkey;
+	int			enabled; /* MW's active status */
+	u32			pbl_hop_num;
+	u32			pbl_ba_pg_sz;
+	u32			pbl_buf_pg_sz;
+};
+
 /* Only support 4K page size for mr register */
 #define MR_SIZE_4K 0
 
@@ -304,6 +312,7 @@ struct hns_roce_mr {
 	u32			key; /* Key of MR */
 	u32			pd;   /* PD num of MR */
 	u32			access;/* Access permission of MR */
+	u32			npages;
 	int			enabled; /* MR's active status */
 	int			type;	/* MR's register type */
 	u64			*pbl_buf;/* MR's PBL space */
@@ -457,6 +466,7 @@ struct hns_roce_av {
 	u8          dgid[HNS_ROCE_GID_SIZE];
 	u8          mac[6];
 	__le16      vlan;
+	bool	    vlan_en;
 };
 
 struct hns_roce_ah {
@@ -656,6 +666,7 @@ struct hns_roce_eq_table {
 };
 
 struct hns_roce_caps {
+	u64		fw_ver;
 	u8		num_ports;
 	int		gid_table_len[HNS_ROCE_MAX_PORTS];
 	int		pkey_table_len[HNS_ROCE_MAX_PORTS];
@@ -665,7 +676,9 @@ struct hns_roce_caps {
 	u32		max_sq_sg;	/* 2 */
 	u32		max_sq_inline;	/* 32 */
 	u32		max_rq_sg;	/* 2 */
+	u32		max_extend_sg;
 	int		num_qps;	/* 256k */
+	int             reserved_qps;
 	u32		max_wqes;	/* 16k */
 	u32		max_sq_desc_sz;	/* 64 */
 	u32		max_rq_desc_sz;	/* 64 */
@@ -738,6 +751,7 @@ struct hns_roce_work {
 	struct hns_roce_dev *hr_dev;
 	struct work_struct work;
 	u32 qpn;
+	u32 cqn;
 	int event_type;
 	int sub_type;
 };
@@ -764,6 +778,8 @@ struct hns_roce_hw {
 				struct hns_roce_mr *mr, int flags, u32 pdn,
 				int mr_access_flags, u64 iova, u64 size,
 				void *mb_buf);
+	int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr);
+	int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw);
 	void (*write_cqc)(struct hns_roce_dev *hr_dev,
 			  struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts,
 			  dma_addr_t dma_handle, int nent, u32 vector);
@@ -863,6 +879,11 @@ static inline struct hns_roce_mr *to_hr_mr(struct ib_mr *ibmr)
 	return container_of(ibmr, struct hns_roce_mr, ibmr);
 }
 
+static inline struct hns_roce_mw *to_hr_mw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct hns_roce_mw, ibmw);
+}
+
 static inline struct hns_roce_qp *to_hr_qp(struct ib_qp *ibqp)
 {
 	return container_of(ibqp, struct hns_roce_qp, ibqp);
@@ -968,12 +989,20 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 int hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length,
 			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
 			   struct ib_udata *udata);
+struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+				u32 max_num_sg);
+int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+		       unsigned int *sg_offset);
 int hns_roce_dereg_mr(struct ib_mr *ibmr);
 int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
 		       struct hns_roce_cmd_mailbox *mailbox,
 		       unsigned long mpt_index);
 unsigned long key_to_hw_index(u32 key);
 
+struct ib_mw *hns_roce_alloc_mw(struct ib_pd *pd, enum ib_mw_type,
+				struct ib_udata *udata);
+int hns_roce_dealloc_mw(struct ib_mw *ibmw);
+
 void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size,
 		       struct hns_roce_buf *buf);
 int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,

+ 2 - 2
drivers/infiniband/hw/hns/hns_roce_hw_v1.c

@@ -731,7 +731,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 	cq_init_attr.comp_vector	= 0;
 	cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL, NULL);
 	if (IS_ERR(cq)) {
-		dev_err(dev, "Create cq for reseved loop qp failed!");
+		dev_err(dev, "Create cq for reserved loop qp failed!");
 		return -ENOMEM;
 	}
 	free_mr->mr_free_cq = to_hr_cq(cq);
@@ -744,7 +744,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 
 	pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL);
 	if (IS_ERR(pd)) {
-		dev_err(dev, "Create pd for reseved loop qp failed!");
+		dev_err(dev, "Create pd for reserved loop qp failed!");
 		ret = -ENOMEM;
 		goto alloc_pd_failed;
 	}

+ 394 - 235
drivers/infiniband/hw/hns/hns_roce_hw_v2.c

@@ -54,6 +54,59 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
 	dseg->len  = cpu_to_le32(sg->length);
 }
 
+static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
+			 struct hns_roce_wqe_frmr_seg *fseg,
+			 const struct ib_reg_wr *wr)
+{
+	struct hns_roce_mr *mr = to_hr_mr(wr->mr);
+
+	/* use ib_access_flags */
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S,
+		     wr->access & IB_ACCESS_MW_BIND ? 1 : 0);
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S,
+		     wr->access & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0);
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_RR_S,
+		     wr->access & IB_ACCESS_REMOTE_READ ? 1 : 0);
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_RW_S,
+		     wr->access & IB_ACCESS_REMOTE_WRITE ? 1 : 0);
+	roce_set_bit(rc_sq_wqe->byte_4,
+		     V2_RC_FRMR_WQE_BYTE_4_LW_S,
+		     wr->access & IB_ACCESS_LOCAL_WRITE ? 1 : 0);
+
+	/* Data structure reuse may lead to confusion */
+	rc_sq_wqe->msg_len = cpu_to_le32(mr->pbl_ba & 0xffffffff);
+	rc_sq_wqe->inv_key = cpu_to_le32(mr->pbl_ba >> 32);
+
+	rc_sq_wqe->byte_16 = cpu_to_le32(wr->mr->length & 0xffffffff);
+	rc_sq_wqe->byte_20 = cpu_to_le32(wr->mr->length >> 32);
+	rc_sq_wqe->rkey = cpu_to_le32(wr->key);
+	rc_sq_wqe->va = cpu_to_le64(wr->mr->iova);
+
+	fseg->pbl_size = cpu_to_le32(mr->pbl_size);
+	roce_set_field(fseg->mode_buf_pg_sz,
+		       V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M,
+		       V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S,
+		       mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
+	roce_set_bit(fseg->mode_buf_pg_sz,
+		     V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S, 0);
+}
+
+static void set_atomic_seg(struct hns_roce_wqe_atomic_seg *aseg,
+			   const struct ib_atomic_wr *wr)
+{
+	if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+		aseg->fetchadd_swap_data = cpu_to_le64(wr->swap);
+		aseg->cmp_data  = cpu_to_le64(wr->compare_add);
+	} else {
+		aseg->fetchadd_swap_data = cpu_to_le64(wr->compare_add);
+		aseg->cmp_data  = 0;
+	}
+}
+
 static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr,
 			   unsigned int *sge_ind)
 {
@@ -121,6 +174,7 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		}
 
 		if (wr->opcode == IB_WR_RDMA_READ) {
+			*bad_wr =  wr;
 			dev_err(hr_dev->dev, "Not support inline data!\n");
 			return -EINVAL;
 		}
@@ -179,6 +233,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	struct hns_roce_v2_ud_send_wqe *ud_sq_wqe;
 	struct hns_roce_v2_rc_send_wqe *rc_sq_wqe;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
+	struct hns_roce_wqe_frmr_seg *fseg;
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_v2_db sq_db;
 	struct ib_qp_attr attr;
@@ -191,6 +246,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	int attr_mask;
 	u32 tmp_len;
 	int ret = 0;
+	u32 hr_op;
 	u8 *smac;
 	int nreq;
 	int i;
@@ -356,6 +412,9 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 				       V2_UD_SEND_WQE_BYTE_40_PORTN_S,
 				       qp->port);
 
+			roce_set_bit(ud_sq_wqe->byte_40,
+				     V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S,
+				     ah->av.vlan_en ? 1 : 0);
 			roce_set_field(ud_sq_wqe->byte_48,
 				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M,
 				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S,
@@ -406,99 +465,100 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 			roce_set_bit(rc_sq_wqe->byte_4,
 				     V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit);
 
+			wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
 			switch (wr->opcode) {
 			case IB_WR_RDMA_READ:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_RDMA_READ);
+				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_READ;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
 			case IB_WR_RDMA_WRITE:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_RDMA_WRITE);
+				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
 			case IB_WR_RDMA_WRITE_WITH_IMM:
-				roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM);
+				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
 			case IB_WR_SEND:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_SEND);
+				hr_op = HNS_ROCE_V2_WQE_OP_SEND;
 				break;
 			case IB_WR_SEND_WITH_INV:
-				roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_SEND_WITH_INV);
+				hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_INV;
 				break;
 			case IB_WR_SEND_WITH_IMM:
-				roce_set_field(rc_sq_wqe->byte_4,
-					      V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					      V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					      HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM);
+				hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM;
 				break;
 			case IB_WR_LOCAL_INV:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_LOCAL_INV);
+				hr_op = HNS_ROCE_V2_WQE_OP_LOCAL_INV;
+				roce_set_bit(rc_sq_wqe->byte_4,
+					       V2_RC_SEND_WQE_BYTE_4_SO_S, 1);
+				rc_sq_wqe->inv_key =
+					    cpu_to_le32(wr->ex.invalidate_rkey);
+				break;
+			case IB_WR_REG_MR:
+				hr_op = HNS_ROCE_V2_WQE_OP_FAST_REG_PMR;
+				fseg = wqe;
+				set_frmr_seg(rc_sq_wqe, fseg, reg_wr(wr));
 				break;
 			case IB_WR_ATOMIC_CMP_AND_SWP:
-				roce_set_field(rc_sq_wqe->byte_4,
-					  V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					  V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					  HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP);
+				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP;
+				rc_sq_wqe->rkey =
+					cpu_to_le32(atomic_wr(wr)->rkey);
+				rc_sq_wqe->va =
+					cpu_to_le64(atomic_wr(wr)->remote_addr);
 				break;
 			case IB_WR_ATOMIC_FETCH_AND_ADD:
-				roce_set_field(rc_sq_wqe->byte_4,
-					 V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					 V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					 HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD);
+				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD;
+				rc_sq_wqe->rkey =
+					cpu_to_le32(atomic_wr(wr)->rkey);
+				rc_sq_wqe->va =
+					cpu_to_le64(atomic_wr(wr)->remote_addr);
 				break;
 			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
-				roce_set_field(rc_sq_wqe->byte_4,
-				      V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				      V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				      HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP);
+				hr_op =
+				       HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP;
 				break;
 			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
-				roce_set_field(rc_sq_wqe->byte_4,
-				     V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				     V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				     HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD);
+				hr_op =
+				      HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD;
 				break;
 			default:
-				roce_set_field(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-					       HNS_ROCE_V2_WQE_OP_MASK);
+				hr_op = HNS_ROCE_V2_WQE_OP_MASK;
 				break;
 			}
 
-			wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
+			roce_set_field(rc_sq_wqe->byte_4,
+				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op);
+
+			if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+			    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+				struct hns_roce_v2_wqe_data_seg *dseg;
+
+				dseg = wqe;
+				set_data_seg_v2(dseg, wr->sg_list);
+				wqe += sizeof(struct hns_roce_v2_wqe_data_seg);
+				set_atomic_seg(wqe, atomic_wr(wr));
+				roce_set_field(rc_sq_wqe->byte_16,
+					       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
+					       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S,
+					       wr->num_sge);
+			} else if (wr->opcode != IB_WR_REG_MR) {
+				ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe,
+							wqe, &sge_ind, bad_wr);
+				if (ret)
+					goto out;
+			}
 
-			ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe,
-						&sge_ind, bad_wr);
-			if (ret)
-				goto out;
 			ind++;
 		} else {
 			dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type);
@@ -935,7 +995,24 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
 
 	resp = (struct hns_roce_query_version *)desc.data;
 	hr_dev->hw_rev = le32_to_cpu(resp->rocee_hw_version);
-	hr_dev->vendor_id = le32_to_cpu(resp->rocee_vendor_id);
+	hr_dev->vendor_id = hr_dev->pci_dev->vendor;
+
+	return 0;
+}
+
+static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_query_fw_info *resp;
+	struct hns_roce_cmq_desc desc;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_QUERY_FW_VER, true);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret)
+		return ret;
+
+	resp = (struct hns_roce_query_fw_info *)desc.data;
+	hr_dev->caps.fw_ver = (u64)(le32_to_cpu(resp->fw_ver));
 
 	return 0;
 }
@@ -1157,6 +1234,13 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 	int ret;
 
 	ret = hns_roce_cmq_query_hw_info(hr_dev);
+	if (ret) {
+		dev_err(hr_dev->dev, "Query hardware version fail, ret = %d.\n",
+			ret);
+		return ret;
+	}
+
+	ret = hns_roce_query_fw_ver(hr_dev);
 	if (ret) {
 		dev_err(hr_dev->dev, "Query firmware version fail, ret = %d.\n",
 			ret);
@@ -1185,14 +1269,16 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 		return ret;
 	}
 
-	hr_dev->vendor_part_id = 0;
-	hr_dev->sys_image_guid = 0;
+
+	hr_dev->vendor_part_id = hr_dev->pci_dev->device;
+	hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
 
 	caps->num_qps		= HNS_ROCE_V2_MAX_QP_NUM;
 	caps->max_wqes		= HNS_ROCE_V2_MAX_WQE_NUM;
 	caps->num_cqs		= HNS_ROCE_V2_MAX_CQ_NUM;
 	caps->max_cqes		= HNS_ROCE_V2_MAX_CQE_NUM;
 	caps->max_sq_sg		= HNS_ROCE_V2_MAX_SQ_SGE_NUM;
+	caps->max_extend_sg	= HNS_ROCE_V2_MAX_EXTEND_SGE_NUM;
 	caps->max_rq_sg		= HNS_ROCE_V2_MAX_RQ_SGE_NUM;
 	caps->max_sq_inline	= HNS_ROCE_V2_MAX_SQ_INLINE;
 	caps->num_uars		= HNS_ROCE_V2_UAR_NUM;
@@ -1222,6 +1308,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 	caps->reserved_mrws	= 1;
 	caps->reserved_uars	= 0;
 	caps->reserved_cqs	= 0;
+	caps->reserved_qps	= HNS_ROCE_V2_RSV_QPS;
 
 	caps->qpc_ba_pg_sz	= 0;
 	caps->qpc_buf_pg_sz	= 0;
@@ -1255,6 +1342,11 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 				  HNS_ROCE_CAP_FLAG_RQ_INLINE |
 				  HNS_ROCE_CAP_FLAG_RECORD_DB |
 				  HNS_ROCE_CAP_FLAG_SQ_RECORD_DB;
+
+	if (hr_dev->pci_dev->revision == 0x21)
+		caps->flags |= HNS_ROCE_CAP_FLAG_MW |
+			       HNS_ROCE_CAP_FLAG_FRMR;
+
 	caps->pkey_table_len[0] = 1;
 	caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM;
 	caps->ceqe_depth	= HNS_ROCE_V2_COMP_EQE_NUM;
@@ -1262,6 +1354,9 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 	caps->local_ca_ack_delay = 0;
 	caps->max_mtu = IB_MTU_4096;
 
+	if (hr_dev->pci_dev->revision == 0x21)
+		caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC;
+
 	ret = hns_roce_v2_set_bt(hr_dev);
 	if (ret)
 		dev_err(hr_dev->dev, "Configure bt attribute fail, ret = %d.\n",
@@ -1690,10 +1785,11 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
 
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 0);
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1);
-	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 0);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1);
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_BIND_EN_S,
 		     (mr->access & IB_ACCESS_MW_BIND ? 1 : 0));
-	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S, 0);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S,
+		     mr->access & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0);
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RR_EN_S,
 		     (mr->access & IB_ACCESS_REMOTE_READ ? 1 : 0));
 	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RW_EN_S,
@@ -1817,6 +1913,88 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev,
 	return 0;
 }
 
+static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr)
+{
+	struct hns_roce_v2_mpt_entry *mpt_entry;
+
+	mpt_entry = mb_buf;
+	memset(mpt_entry, 0, sizeof(*mpt_entry));
+
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M,
+		       V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M,
+		       V2_MPT_BYTE_4_PBL_HOP_NUM_S, 1);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st,
+		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_M,
+		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_S,
+		       mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
+		       V2_MPT_BYTE_4_PD_S, mr->pd);
+
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 1);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1);
+
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_FRE_S, 1);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 0);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1);
+
+	mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size);
+
+	mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3));
+	roce_set_field(mpt_entry->byte_48_mode_ba, V2_MPT_BYTE_48_PBL_BA_H_M,
+		       V2_MPT_BYTE_48_PBL_BA_H_S,
+		       upper_32_bits(mr->pbl_ba >> 3));
+
+	roce_set_field(mpt_entry->byte_64_buf_pa1,
+		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M,
+		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S,
+		       mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
+
+	return 0;
+}
+
+static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw)
+{
+	struct hns_roce_v2_mpt_entry *mpt_entry;
+
+	mpt_entry = mb_buf;
+	memset(mpt_entry, 0, sizeof(*mpt_entry));
+
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M,
+		       V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
+		       V2_MPT_BYTE_4_PD_S, mw->pdn);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st,
+		       V2_MPT_BYTE_4_PBL_HOP_NUM_M,
+		       V2_MPT_BYTE_4_PBL_HOP_NUM_S,
+		       mw->pbl_hop_num == HNS_ROCE_HOP_NUM_0 ?
+		       0 : mw->pbl_hop_num);
+	roce_set_field(mpt_entry->byte_4_pd_hop_st,
+		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_M,
+		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_S,
+		       mw->pbl_ba_pg_sz + PG_SHIFT_OFFSET);
+
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1);
+	roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1);
+
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 1);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1);
+	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BQP_S,
+		     mw->ibmw.type == IB_MW_TYPE_1 ? 0 : 1);
+
+	roce_set_field(mpt_entry->byte_64_buf_pa1,
+		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M,
+		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S,
+		       mw->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
+
+	mpt_entry->lkey = cpu_to_le32(mw->rkey);
+
+	return 0;
+}
+
 static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n)
 {
 	return hns_roce_buf_offset(&hr_cq->hr_buf.hr_buf,
@@ -2274,6 +2452,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		wc->src_qp = (u8)roce_get_field(cqe->byte_32,
 						V2_CQE_BYTE_32_RMT_QPN_M,
 						V2_CQE_BYTE_32_RMT_QPN_S);
+		wc->slid = 0;
 		wc->wc_flags |= (roce_get_bit(cqe->byte_32,
 					      V2_CQE_BYTE_32_GRH_S) ?
 					      IB_WC_GRH : 0);
@@ -2287,7 +2466,14 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		wc->smac[5] = roce_get_field(cqe->byte_28,
 					     V2_CQE_BYTE_28_SMAC_5_M,
 					     V2_CQE_BYTE_28_SMAC_5_S);
-		wc->vlan_id = 0xffff;
+		if (roce_get_bit(cqe->byte_28, V2_CQE_BYTE_28_VID_VLD_S)) {
+			wc->vlan_id = (u16)roce_get_field(cqe->byte_28,
+							  V2_CQE_BYTE_28_VID_M,
+							  V2_CQE_BYTE_28_VID_S);
+		} else {
+			wc->vlan_id = 0xffff;
+		}
+
 		wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
 		wc->network_hdr_type = roce_get_field(cqe->byte_28,
 						    V2_CQE_BYTE_28_PORT_TYPE_M,
@@ -2589,21 +2775,16 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_TX_ERR_S, 0);
 	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_RX_ERR_S, 0);
 
-	roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_MAPID_M,
-		       V2_QPC_BYTE_60_MAPID_S, 0);
+	roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_TEMPID_M,
+		       V2_QPC_BYTE_60_TEMPID_S, 0);
 
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid,
-		     V2_QPC_BYTE_60_INNER_MAP_IND_S, 0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_MAP_IND_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_RQ_MAP_IND_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_EXT_MAP_IND_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_RLS_IND_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_EXT_IND_S,
-		     0);
+	roce_set_field(qpc_mask->byte_60_qpst_tempid,
+		       V2_QPC_BYTE_60_SCC_TOKEN_M, V2_QPC_BYTE_60_SCC_TOKEN_S,
+		       0);
+	roce_set_bit(qpc_mask->byte_60_qpst_tempid,
+		     V2_QPC_BYTE_60_SQ_DB_DOING_S, 0);
+	roce_set_bit(qpc_mask->byte_60_qpst_tempid,
+		     V2_QPC_BYTE_60_RQ_DB_DOING_S, 0);
 	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0);
 	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0);
 
@@ -2685,7 +2866,8 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M,
 		       V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0);
 
-	roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RSVD_RAQ_MAP_S, 0);
+	roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S,
+		     0);
 	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M,
 		       V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S, 0);
 	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_TAIL_M,
@@ -2694,8 +2876,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_144_raq,
 		       V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M,
 		       V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S, 0);
-	roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S,
-		     0);
 	roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_CREDIT_M,
 		       V2_QPC_BYTE_144_RAQ_CREDIT_S, 0);
 	roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RESP_RTY_FLG_S, 0);
@@ -2721,14 +2901,12 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 		       V2_QPC_BYTE_160_SQ_CONSUMER_IDX_M,
 		       V2_QPC_BYTE_160_SQ_CONSUMER_IDX_S, 0);
 
-	roce_set_field(context->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_M,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_M,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0);
-
+	roce_set_bit(qpc_mask->byte_168_irrl_idx,
+		     V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S, 0);
+	roce_set_bit(qpc_mask->byte_168_irrl_idx,
+		     V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S, 0);
+	roce_set_bit(qpc_mask->byte_168_irrl_idx,
+		     V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S, 0);
 	roce_set_bit(qpc_mask->byte_168_irrl_idx,
 		     V2_QPC_BYTE_168_MSG_RTY_LP_FLG_S, 0);
 	roce_set_bit(qpc_mask->byte_168_irrl_idx,
@@ -2746,6 +2924,9 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_MSG_RNR_FLG_S,
 		     0);
 
+	roce_set_bit(context->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 1);
+	roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 0);
+
 	roce_set_field(qpc_mask->byte_176_msg_pktn,
 		       V2_QPC_BYTE_176_MSG_USE_PKTN_M,
 		       V2_QPC_BYTE_176_MSG_USE_PKTN_S, 0);
@@ -2790,6 +2971,13 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 		       V2_QPC_BYTE_232_IRRL_SGE_IDX_M,
 		       V2_QPC_BYTE_232_IRRL_SGE_IDX_S, 0);
 
+	roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_SO_LP_VLD_S,
+		     0);
+	roce_set_bit(qpc_mask->byte_232_irrl_sge,
+		     V2_QPC_BYTE_232_FENCE_LP_VLD_S, 0);
+	roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_IRRL_LP_VLD_S,
+		     0);
+
 	qpc_mask->irrl_cur_sge_offset = 0;
 
 	roce_set_field(qpc_mask->byte_240_irrl_tail,
@@ -2955,13 +3143,6 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
 		roce_set_field(qpc_mask->byte_56_dqpn_err,
 			       V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S, 0);
 	}
-	roce_set_field(context->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_M,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_M,
-		       V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0);
 }
 
 static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
@@ -3271,13 +3452,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
 	 * we should set all bits of the relevant fields in context mask to
 	 * 0 at the same time, else set them to 0x1.
 	 */
-	roce_set_field(context->byte_60_qpst_mapid,
-		       V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M,
-		       V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, attr->retry_cnt);
-	roce_set_field(qpc_mask->byte_60_qpst_mapid,
-		       V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M,
-		       V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, 0);
-
 	context->sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT);
 	roce_set_field(context->byte_168_irrl_idx,
 		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
@@ -3538,6 +3712,17 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 			memcpy(src_mac, gid_attr->ndev->dev_addr, ETH_ALEN);
 		}
 
+		if (is_vlan_dev(gid_attr->ndev)) {
+			roce_set_bit(context->byte_76_srqn_op_en,
+				     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 1);
+			roce_set_bit(qpc_mask->byte_76_srqn_op_en,
+				     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 0);
+			roce_set_bit(context->byte_168_irrl_idx,
+				     V2_QPC_BYTE_168_SQ_VLAN_EN_S, 1);
+			roce_set_bit(qpc_mask->byte_168_irrl_idx,
+				     V2_QPC_BYTE_168_SQ_VLAN_EN_S, 0);
+		}
+
 		roce_set_field(context->byte_24_mtu_tc,
 			       V2_QPC_BYTE_24_VLAN_ID_M,
 			       V2_QPC_BYTE_24_VLAN_ID_S, vlan);
@@ -3584,8 +3769,15 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 			       V2_QPC_BYTE_24_HOP_LIMIT_M,
 			       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
 
-		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
-			       V2_QPC_BYTE_24_TC_S, grh->traffic_class);
+		if (hr_dev->pci_dev->revision == 0x21 &&
+		    gid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+			roce_set_field(context->byte_24_mtu_tc,
+				       V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S,
+				       grh->traffic_class >> 2);
+		else
+			roce_set_field(context->byte_24_mtu_tc,
+				       V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S,
+				       grh->traffic_class);
 		roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
 			       V2_QPC_BYTE_24_TC_S, 0);
 		roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
@@ -3606,9 +3798,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 		set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask);
 
 	/* Every status migrate must change state */
-	roce_set_field(context->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M,
+	roce_set_field(context->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M,
 		       V2_QPC_BYTE_60_QP_ST_S, new_state);
-	roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M,
+	roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M,
 		       V2_QPC_BYTE_60_QP_ST_S, 0);
 
 	/* SW pass context to HW */
@@ -3728,7 +3920,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 		goto out;
 	}
 
-	state = roce_get_field(context->byte_60_qpst_mapid,
+	state = roce_get_field(context->byte_60_qpst_tempid,
 			       V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S);
 	tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state);
 	if (tmp_qp_state == -1) {
@@ -3995,13 +4187,103 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
 {
 	struct hns_roce_work *irq_work =
 				container_of(work, struct hns_roce_work, work);
+	struct device *dev = irq_work->hr_dev->dev;
 	u32 qpn = irq_work->qpn;
+	u32 cqn = irq_work->cqn;
 
 	switch (irq_work->event_type) {
+	case HNS_ROCE_EVENT_TYPE_PATH_MIG:
+		dev_info(dev, "Path migrated succeeded.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
+		dev_warn(dev, "Path migration failed.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_COMM_EST:
+		dev_info(dev, "Communication established.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
+		dev_warn(dev, "Send queue drained.\n");
+		break;
 	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
+		dev_err(dev, "Local work queue catastrophic error.\n");
+		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
+		switch (irq_work->sub_type) {
+		case HNS_ROCE_LWQCE_QPC_ERROR:
+			dev_err(dev, "QP %d, QPC error.\n", qpn);
+			break;
+		case HNS_ROCE_LWQCE_MTU_ERROR:
+			dev_err(dev, "QP %d, MTU error.\n", qpn);
+			break;
+		case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR:
+			dev_err(dev, "QP %d, WQE BA addr error.\n", qpn);
+			break;
+		case HNS_ROCE_LWQCE_WQE_ADDR_ERROR:
+			dev_err(dev, "QP %d, WQE addr error.\n", qpn);
+			break;
+		case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR:
+			dev_err(dev, "QP %d, WQE shift error.\n", qpn);
+			break;
+		default:
+			dev_err(dev, "Unhandled sub_event type %d.\n",
+				irq_work->sub_type);
+			break;
+		}
+		break;
 	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
+		dev_err(dev, "Invalid request local work queue error.\n");
+		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
+		break;
 	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+		dev_err(dev, "Local access violation work queue error.\n");
 		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
+		switch (irq_work->sub_type) {
+		case HNS_ROCE_LAVWQE_R_KEY_VIOLATION:
+			dev_err(dev, "QP %d, R_key violation.\n", qpn);
+			break;
+		case HNS_ROCE_LAVWQE_LENGTH_ERROR:
+			dev_err(dev, "QP %d, length error.\n", qpn);
+			break;
+		case HNS_ROCE_LAVWQE_VA_ERROR:
+			dev_err(dev, "QP %d, VA error.\n", qpn);
+			break;
+		case HNS_ROCE_LAVWQE_PD_ERROR:
+			dev_err(dev, "QP %d, PD error.\n", qpn);
+			break;
+		case HNS_ROCE_LAVWQE_RW_ACC_ERROR:
+			dev_err(dev, "QP %d, rw acc error.\n", qpn);
+			break;
+		case HNS_ROCE_LAVWQE_KEY_STATE_ERROR:
+			dev_err(dev, "QP %d, key state error.\n", qpn);
+			break;
+		case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR:
+			dev_err(dev, "QP %d, MR operation error.\n", qpn);
+			break;
+		default:
+			dev_err(dev, "Unhandled sub_event type %d.\n",
+				irq_work->sub_type);
+			break;
+		}
+		break;
+	case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
+		dev_warn(dev, "SRQ limit reach.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
+		dev_warn(dev, "SRQ last wqe reach.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+		dev_err(dev, "SRQ catas error.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
+		dev_err(dev, "CQ 0x%x access err.\n", cqn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
+		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
+		dev_warn(dev, "DB overflow.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_FLR:
+		dev_warn(dev, "Function level reset.\n");
 		break;
 	default:
 		break;
@@ -4011,7 +4293,8 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
 }
 
 static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev,
-				      struct hns_roce_eq *eq, u32 qpn)
+				      struct hns_roce_eq *eq,
+				      u32 qpn, u32 cqn)
 {
 	struct hns_roce_work *irq_work;
 
@@ -4022,6 +4305,7 @@ static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev,
 	INIT_WORK(&(irq_work->work), hns_roce_irq_work_handle);
 	irq_work->hr_dev = hr_dev;
 	irq_work->qpn = qpn;
+	irq_work->cqn = cqn;
 	irq_work->event_type = eq->event_type;
 	irq_work->sub_type = eq->sub_type;
 	queue_work(hr_dev->irq_workq, &(irq_work->work));
@@ -4058,124 +4342,6 @@ static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
 	hns_roce_write64_k(doorbell, eq->doorbell);
 }
 
-static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev,
-						  struct hns_roce_aeqe *aeqe,
-						  u32 qpn)
-{
-	struct device *dev = hr_dev->dev;
-	int sub_type;
-
-	dev_warn(dev, "Local work queue catastrophic error.\n");
-	sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M,
-				  HNS_ROCE_V2_AEQE_SUB_TYPE_S);
-	switch (sub_type) {
-	case HNS_ROCE_LWQCE_QPC_ERROR:
-		dev_warn(dev, "QP %d, QPC error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_MTU_ERROR:
-		dev_warn(dev, "QP %d, MTU error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR:
-		dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_WQE_ADDR_ERROR:
-		dev_warn(dev, "QP %d, WQE addr error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR:
-		dev_warn(dev, "QP %d, WQE shift error.\n", qpn);
-		break;
-	default:
-		dev_err(dev, "Unhandled sub_event type %d.\n", sub_type);
-		break;
-	}
-}
-
-static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev,
-					    struct hns_roce_aeqe *aeqe, u32 qpn)
-{
-	struct device *dev = hr_dev->dev;
-	int sub_type;
-
-	dev_warn(dev, "Local access violation work queue error.\n");
-	sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M,
-				  HNS_ROCE_V2_AEQE_SUB_TYPE_S);
-	switch (sub_type) {
-	case HNS_ROCE_LAVWQE_R_KEY_VIOLATION:
-		dev_warn(dev, "QP %d, R_key violation.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_LENGTH_ERROR:
-		dev_warn(dev, "QP %d, length error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_VA_ERROR:
-		dev_warn(dev, "QP %d, VA error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_PD_ERROR:
-		dev_err(dev, "QP %d, PD error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_RW_ACC_ERROR:
-		dev_warn(dev, "QP %d, rw acc error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_KEY_STATE_ERROR:
-		dev_warn(dev, "QP %d, key state error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR:
-		dev_warn(dev, "QP %d, MR operation error.\n", qpn);
-		break;
-	default:
-		dev_err(dev, "Unhandled sub_event type %d.\n", sub_type);
-		break;
-	}
-}
-
-static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev,
-				      struct hns_roce_aeqe *aeqe,
-				      int event_type, u32 qpn)
-{
-	struct device *dev = hr_dev->dev;
-
-	switch (event_type) {
-	case HNS_ROCE_EVENT_TYPE_COMM_EST:
-		dev_warn(dev, "Communication established.\n");
-		break;
-	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
-		dev_warn(dev, "Send queue drained.\n");
-		break;
-	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
-		hns_roce_v2_wq_catas_err_handle(hr_dev, aeqe, qpn);
-		break;
-	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
-		dev_warn(dev, "Invalid request local work queue error.\n");
-		break;
-	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
-		hns_roce_v2_local_wq_access_err_handle(hr_dev, aeqe, qpn);
-		break;
-	default:
-		break;
-	}
-
-	hns_roce_qp_event(hr_dev, qpn, event_type);
-}
-
-static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev,
-				      struct hns_roce_aeqe *aeqe,
-				      int event_type, u32 cqn)
-{
-	struct device *dev = hr_dev->dev;
-
-	switch (event_type) {
-	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
-		dev_warn(dev, "CQ 0x%x access err.\n", cqn);
-		break;
-	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
-		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
-		break;
-	default:
-		break;
-	}
-
-	hns_roce_cq_event(hr_dev, cqn, event_type);
-}
-
 static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry)
 {
 	u32 buf_chk_sz;
@@ -4251,31 +4417,23 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
 
 		switch (event_type) {
 		case HNS_ROCE_EVENT_TYPE_PATH_MIG:
-			dev_warn(dev, "Path migrated succeeded.\n");
-			break;
 		case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
-			dev_warn(dev, "Path migration failed.\n");
-			break;
 		case HNS_ROCE_EVENT_TYPE_COMM_EST:
 		case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
 		case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
 		case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
 		case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
-			hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type,
-						  qpn);
+			hns_roce_qp_event(hr_dev, qpn, event_type);
 			break;
 		case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
 		case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
 		case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
-			dev_warn(dev, "SRQ not support.\n");
 			break;
 		case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
 		case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
-			hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type,
-						  cqn);
+			hns_roce_cq_event(hr_dev, cqn, event_type);
 			break;
 		case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
-			dev_warn(dev, "DB overflow.\n");
 			break;
 		case HNS_ROCE_EVENT_TYPE_MB:
 			hns_roce_cmd_event(hr_dev,
@@ -4284,10 +4442,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
 					le64_to_cpu(aeqe->event.cmd.out_param));
 			break;
 		case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW:
-			dev_warn(dev, "CEQ overflow.\n");
 			break;
 		case HNS_ROCE_EVENT_TYPE_FLR:
-			dev_warn(dev, "Function level reset.\n");
 			break;
 		default:
 			dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n",
@@ -4304,7 +4460,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
 			dev_warn(dev, "cons_index overflow, set back to 0.\n");
 			eq->cons_index = 0;
 		}
-		hns_roce_v2_init_irq_work(hr_dev, eq, qpn);
+		hns_roce_v2_init_irq_work(hr_dev, eq, qpn, cqn);
 	}
 
 	set_eq_cons_index_v2(eq);
@@ -5125,6 +5281,7 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 		create_singlethread_workqueue("hns_roce_irq_workqueue");
 	if (!hr_dev->irq_workq) {
 		dev_err(dev, "Create irq workqueue failed!\n");
+		ret = -ENOMEM;
 		goto err_request_irq_fail;
 	}
 
@@ -5195,6 +5352,8 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
 	.set_mac = hns_roce_v2_set_mac,
 	.write_mtpt = hns_roce_v2_write_mtpt,
 	.rereg_write_mtpt = hns_roce_v2_rereg_write_mtpt,
+	.frmr_write_mtpt = hns_roce_v2_frmr_write_mtpt,
+	.mw_write_mtpt = hns_roce_v2_mw_write_mtpt,
 	.write_cqc = hns_roce_v2_write_cqc,
 	.set_hem = hns_roce_v2_set_hem,
 	.clear_hem = hns_roce_v2_clear_hem,
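
The poll path above now reports the VLAN ID carried in the CQE (when the VID-valid bit is set) instead of hard-coding wc->vlan_id to 0xffff. A minimal consumer-side sketch of how that surfaces through the standard ib_wc fields; the function and variable names are illustrative only, not part of the patch:

	/*
	 * Hedged sketch: drain a CQ and pick up the VLAN ID the driver
	 * now fills in.  0xffff still means "no VLAN tag present".
	 */
	static void drain_cq_sketch(struct ib_cq *cq)
	{
		struct ib_wc wc;

		while (ib_poll_cq(cq, 1, &wc) > 0) {
			u16 vid = 0xffff;

			if ((wc.wc_flags & IB_WC_WITH_VLAN) &&
			    wc.vlan_id != 0xffff)
				vid = wc.vlan_id;	/* 12-bit VID from the CQE */
			/* ... hand wc (and vid) to the consumer ... */
		}
	}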

+ 72 - 24
drivers/infiniband/hw/hns/hns_roce_hw_v2.h

@@ -50,6 +50,7 @@
 #define HNS_ROCE_V2_MAX_CQE_NUM			0x10000
 #define HNS_ROCE_V2_MAX_RQ_SGE_NUM		0x100
 #define HNS_ROCE_V2_MAX_SQ_SGE_NUM		0xff
+#define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM		0x200000
 #define HNS_ROCE_V2_MAX_SQ_INLINE		0x20
 #define HNS_ROCE_V2_UAR_NUM			256
 #define HNS_ROCE_V2_PHY_UAR_NUM			1
@@ -78,6 +79,7 @@
 #define HNS_ROCE_INVALID_LKEY			0x100
 #define HNS_ROCE_CMQ_TX_TIMEOUT			30000
 #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE	2
+#define HNS_ROCE_V2_RSV_QPS			8
 
 #define HNS_ROCE_CONTEXT_HOP_NUM		1
 #define HNS_ROCE_MTT_HOP_NUM			1
@@ -201,6 +203,7 @@ enum {
 
 /* CMQ command */
 enum hns_roce_opcode_type {
+	HNS_QUERY_FW_VER				= 0x0001,
 	HNS_ROCE_OPC_QUERY_HW_VER			= 0x8000,
 	HNS_ROCE_OPC_CFG_GLOBAL_PARAM			= 0x8001,
 	HNS_ROCE_OPC_ALLOC_PF_RES			= 0x8004,
@@ -324,6 +327,7 @@ struct hns_roce_v2_cq_context {
 
 enum{
 	V2_MPT_ST_VALID = 0x1,
+	V2_MPT_ST_FREE	= 0x2,
 };
 
 enum hns_roce_v2_qp_state {
@@ -350,7 +354,7 @@ struct hns_roce_v2_qp_context {
 	__le32	dmac;
 	__le32	byte_52_udpspn_dmac;
 	__le32	byte_56_dqpn_err;
-	__le32	byte_60_qpst_mapid;
+	__le32	byte_60_qpst_tempid;
 	__le32	qkey_xrcd;
 	__le32	byte_68_rq_db;
 	__le32	rq_db_record_addr;
@@ -492,26 +496,15 @@ struct hns_roce_v2_qp_context {
 #define	V2_QPC_BYTE_56_LP_PKTN_INI_S 28
 #define V2_QPC_BYTE_56_LP_PKTN_INI_M GENMASK(31, 28)
 
-#define	V2_QPC_BYTE_60_MAPID_S 0
-#define V2_QPC_BYTE_60_MAPID_M GENMASK(12, 0)
+#define	V2_QPC_BYTE_60_TEMPID_S 0
+#define V2_QPC_BYTE_60_TEMPID_M GENMASK(7, 0)
 
-#define	V2_QPC_BYTE_60_INNER_MAP_IND_S 13
+#define V2_QPC_BYTE_60_SCC_TOKEN_S 8
+#define V2_QPC_BYTE_60_SCC_TOKEN_M GENMASK(26, 8)
 
-#define	V2_QPC_BYTE_60_SQ_MAP_IND_S 14
+#define	V2_QPC_BYTE_60_SQ_DB_DOING_S 27
 
-#define	V2_QPC_BYTE_60_RQ_MAP_IND_S 15
-
-#define	V2_QPC_BYTE_60_TEMPID_S 16
-#define V2_QPC_BYTE_60_TEMPID_M  GENMASK(22, 16)
-
-#define	V2_QPC_BYTE_60_EXT_MAP_IND_S 23
-
-#define	V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S 24
-#define V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M GENMASK(26, 24)
-
-#define V2_QPC_BYTE_60_SQ_RLS_IND_S 27
-
-#define	V2_QPC_BYTE_60_SQ_EXT_IND_S 28
+#define	V2_QPC_BYTE_60_RQ_DB_DOING_S 28
 
 #define	V2_QPC_BYTE_60_QP_ST_S 29
 #define V2_QPC_BYTE_60_QP_ST_M GENMASK(31, 29)
@@ -534,6 +527,7 @@ struct hns_roce_v2_qp_context {
 
 #define	V2_QPC_BYTE_76_RQIE_S 28
 
+#define	V2_QPC_BYTE_76_RQ_VLAN_EN_S 30
 #define	V2_QPC_BYTE_80_RX_CQN_S 0
 #define V2_QPC_BYTE_80_RX_CQN_M GENMASK(23, 0)
 
@@ -588,7 +582,7 @@ struct hns_roce_v2_qp_context {
 #define	V2_QPC_BYTE_140_RR_MAX_S 12
 #define V2_QPC_BYTE_140_RR_MAX_M GENMASK(14, 12)
 
-#define	V2_QPC_BYTE_140_RSVD_RAQ_MAP_S 15
+#define	V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S 15
 
 #define	V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S 16
 #define V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M GENMASK(23, 16)
@@ -599,8 +593,6 @@ struct hns_roce_v2_qp_context {
 #define	V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S 0
 #define V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M GENMASK(23, 0)
 
-#define V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S 24
-
 #define V2_QPC_BYTE_144_RAQ_CREDIT_S 25
 #define V2_QPC_BYTE_144_RAQ_CREDIT_M GENMASK(29, 25)
 
@@ -637,9 +629,10 @@ struct hns_roce_v2_qp_context {
 #define	V2_QPC_BYTE_168_LP_SGEN_INI_S 22
 #define V2_QPC_BYTE_168_LP_SGEN_INI_M GENMASK(23, 22)
 
-#define	V2_QPC_BYTE_168_SQ_SHIFT_BAK_S 24
-#define V2_QPC_BYTE_168_SQ_SHIFT_BAK_M GENMASK(27, 24)
-
+#define V2_QPC_BYTE_168_SQ_VLAN_EN_S 24
+#define V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S 25
+#define V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S 26
+#define V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S 27
 #define	V2_QPC_BYTE_168_IRRL_IDX_LSB_S 28
 #define V2_QPC_BYTE_168_IRRL_IDX_LSB_M GENMASK(31, 28)
 
@@ -725,6 +718,10 @@ struct hns_roce_v2_qp_context {
 #define	V2_QPC_BYTE_232_IRRL_SGE_IDX_S 20
 #define V2_QPC_BYTE_232_IRRL_SGE_IDX_M GENMASK(28, 20)
 
+#define V2_QPC_BYTE_232_SO_LP_VLD_S 29
+#define V2_QPC_BYTE_232_FENCE_LP_VLD_S 30
+#define V2_QPC_BYTE_232_IRRL_LP_VLD_S 31
+
 #define	V2_QPC_BYTE_240_IRRL_TAIL_REAL_S 0
 #define V2_QPC_BYTE_240_IRRL_TAIL_REAL_M GENMASK(7, 0)
 
@@ -743,6 +740,9 @@ struct hns_roce_v2_qp_context {
 #define	V2_QPC_BYTE_244_RNR_CNT_S 27
 #define V2_QPC_BYTE_244_RNR_CNT_M GENMASK(29, 27)
 
+#define V2_QPC_BYTE_244_LCL_OP_FLG_S 30
+#define V2_QPC_BYTE_244_IRRL_RD_FLG_S 31
+
 #define	V2_QPC_BYTE_248_IRRL_PSN_S 0
 #define V2_QPC_BYTE_248_IRRL_PSN_M GENMASK(23, 0)
 
@@ -818,6 +818,11 @@ struct hns_roce_v2_cqe {
 #define	V2_CQE_BYTE_28_PORT_TYPE_S 16
 #define V2_CQE_BYTE_28_PORT_TYPE_M GENMASK(17, 16)
 
+#define V2_CQE_BYTE_28_VID_S 18
+#define V2_CQE_BYTE_28_VID_M GENMASK(29, 18)
+
+#define V2_CQE_BYTE_28_VID_VLD_S 30
+
 #define	V2_CQE_BYTE_32_RMT_QPN_S 0
 #define V2_CQE_BYTE_32_RMT_QPN_M GENMASK(23, 0)
 
@@ -878,8 +883,19 @@ struct hns_roce_v2_mpt_entry {
 
 #define V2_MPT_BYTE_8_LW_EN_S 7
 
+#define V2_MPT_BYTE_8_MW_CNT_S 8
+#define V2_MPT_BYTE_8_MW_CNT_M GENMASK(31, 8)
+
+#define V2_MPT_BYTE_12_FRE_S 0
+
 #define V2_MPT_BYTE_12_PA_S 1
 
+#define V2_MPT_BYTE_12_MR_MW_S 4
+
+#define V2_MPT_BYTE_12_BPD_S 5
+
+#define V2_MPT_BYTE_12_BQP_S 6
+
 #define V2_MPT_BYTE_12_INNER_PA_VLD_S 7
 
 #define V2_MPT_BYTE_12_MW_BIND_QPN_S 8
@@ -988,6 +1004,8 @@ struct hns_roce_v2_ud_send_wqe {
 #define	V2_UD_SEND_WQE_BYTE_40_PORTN_S 24
 #define V2_UD_SEND_WQE_BYTE_40_PORTN_M GENMASK(26, 24)
 
+#define V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S 30
+
 #define	V2_UD_SEND_WQE_BYTE_40_LBI_S 31
 
 #define	V2_UD_SEND_WQE_DMAC_0_S 0
@@ -1042,6 +1060,16 @@ struct hns_roce_v2_rc_send_wqe {
 
 #define V2_RC_SEND_WQE_BYTE_4_INLINE_S 12
 
+#define V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S 19
+
+#define V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S 20
+
+#define V2_RC_FRMR_WQE_BYTE_4_RR_S 21
+
+#define V2_RC_FRMR_WQE_BYTE_4_RW_S 22
+
+#define V2_RC_FRMR_WQE_BYTE_4_LW_S 23
+
 #define	V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_S 0
 #define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_M GENMASK(23, 0)
 
@@ -1051,6 +1079,16 @@ struct hns_roce_v2_rc_send_wqe {
 #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0
 #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0)
 
+struct hns_roce_wqe_frmr_seg {
+	__le32	pbl_size;
+	__le32	mode_buf_pg_sz;
+};
+
+#define V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S	4
+#define V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M	GENMASK(7, 4)
+
+#define V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S 8
+
 struct hns_roce_v2_wqe_data_seg {
 	__le32    len;
 	__le32    lkey;
@@ -1068,6 +1106,11 @@ struct hns_roce_query_version {
 	__le32 rsv[5];
 };
 
+struct hns_roce_query_fw_info {
+	__le32 fw_ver;
+	__le32 rsv[5];
+};
+
 struct hns_roce_cfg_llm_a {
 	__le32 base_addr_l;
 	__le32 base_addr_h;
@@ -1564,4 +1607,9 @@ struct hns_roce_eq_context {
 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0
 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0)
 
+struct hns_roce_wqe_atomic_seg {
+	__le64          fetchadd_swap_data;
+	__le64          cmp_data;
+};
+
 #endif

+ 40 - 83
drivers/infiniband/hw/hns/hns_roce_main.c

@@ -196,6 +196,7 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
 
 	memset(props, 0, sizeof(*props));
 
+	props->fw_ver = hr_dev->caps.fw_ver;
 	props->sys_image_guid = cpu_to_be64(hr_dev->sys_image_guid);
 	props->max_mr_size = (u64)(~(0ULL));
 	props->page_size_cap = hr_dev->caps.page_size_cap;
@@ -215,7 +216,8 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
 	props->max_pd = hr_dev->caps.num_pds;
 	props->max_qp_rd_atom = hr_dev->caps.max_qp_dest_rdma;
 	props->max_qp_init_rd_atom = hr_dev->caps.max_qp_init_rdma;
-	props->atomic_cap = IB_ATOMIC_NONE;
+	props->atomic_cap = hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_ATOMIC ?
+			    IB_ATOMIC_HCA : IB_ATOMIC_NONE;
 	props->max_pkeys = 1;
 	props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay;
 
@@ -344,8 +346,6 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev,
 	if (ret)
 		goto error_fail_uar_alloc;
 
-	INIT_LIST_HEAD(&context->vma_list);
-	mutex_init(&context->vma_list_mutex);
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
 		INIT_LIST_HEAD(&context->page_list);
 		mutex_init(&context->page_mutex);
@@ -376,76 +376,34 @@ static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext)
 	return 0;
 }
 
-static void hns_roce_vma_open(struct vm_area_struct *vma)
-{
-	vma->vm_ops = NULL;
-}
-
-static void hns_roce_vma_close(struct vm_area_struct *vma)
-{
-	struct hns_roce_vma_data *vma_data;
-
-	vma_data = (struct hns_roce_vma_data *)vma->vm_private_data;
-	vma_data->vma = NULL;
-	mutex_lock(vma_data->vma_list_mutex);
-	list_del(&vma_data->list);
-	mutex_unlock(vma_data->vma_list_mutex);
-	kfree(vma_data);
-}
-
-static const struct vm_operations_struct hns_roce_vm_ops = {
-	.open = hns_roce_vma_open,
-	.close = hns_roce_vma_close,
-};
-
-static int hns_roce_set_vma_data(struct vm_area_struct *vma,
-				 struct hns_roce_ucontext *context)
-{
-	struct list_head *vma_head = &context->vma_list;
-	struct hns_roce_vma_data *vma_data;
-
-	vma_data = kzalloc(sizeof(*vma_data), GFP_KERNEL);
-	if (!vma_data)
-		return -ENOMEM;
-
-	vma_data->vma = vma;
-	vma_data->vma_list_mutex = &context->vma_list_mutex;
-	vma->vm_private_data = vma_data;
-	vma->vm_ops = &hns_roce_vm_ops;
-
-	mutex_lock(&context->vma_list_mutex);
-	list_add(&vma_data->list, vma_head);
-	mutex_unlock(&context->vma_list_mutex);
-
-	return 0;
-}
-
 static int hns_roce_mmap(struct ib_ucontext *context,
 			 struct vm_area_struct *vma)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(context->device);
 
-	if (((vma->vm_end - vma->vm_start) % PAGE_SIZE) != 0)
-		return -EINVAL;
+	switch (vma->vm_pgoff) {
+	case 0:
+		return rdma_user_mmap_io(context, vma,
+					 to_hr_ucontext(context)->uar.pfn,
+					 PAGE_SIZE,
+					 pgprot_noncached(vma->vm_page_prot));
+
+	/* vm_pgoff: 1 -- TPTR */
+	case 1:
+		if (!hr_dev->tptr_dma_addr || !hr_dev->tptr_size)
+			return -EINVAL;
+		/*
+		 * FIXME: using io_remap_pfn_range on the dma address returned
+		 * by dma_alloc_coherent is totally wrong.
+		 */
+		return rdma_user_mmap_io(context, vma,
+					 hr_dev->tptr_dma_addr >> PAGE_SHIFT,
+					 hr_dev->tptr_size,
+					 vma->vm_page_prot);
 
-	if (vma->vm_pgoff == 0) {
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       to_hr_ucontext(context)->uar.pfn,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-	} else if (vma->vm_pgoff == 1 && hr_dev->tptr_dma_addr &&
-		   hr_dev->tptr_size) {
-		/* vm_pgoff: 1 -- TPTR */
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       hr_dev->tptr_dma_addr >> PAGE_SHIFT,
-				       hr_dev->tptr_size,
-				       vma->vm_page_prot))
-			return -EAGAIN;
-	} else
+	default:
 		return -EINVAL;
-
-	return hns_roce_set_vma_data(vma, to_hr_ucontext(context));
+	}
 }
 
 static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num,
@@ -471,21 +429,6 @@ static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num,
 
 static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext)
 {
-	struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext);
-	struct hns_roce_vma_data *vma_data, *n;
-	struct vm_area_struct *vma;
-
-	mutex_lock(&context->vma_list_mutex);
-	list_for_each_entry_safe(vma_data, n, &context->vma_list, list) {
-		vma = vma_data->vma;
-		zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE);
-
-		vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
-		vma->vm_ops = NULL;
-		list_del(&vma_data->list);
-		kfree(vma_data);
-	}
-	mutex_unlock(&context->vma_list_mutex);
 }
 
 static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev)
@@ -508,7 +451,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 	spin_lock_init(&iboe->lock);
 
 	ib_dev = &hr_dev->ib_dev;
-	strlcpy(ib_dev->name, "hns_%d", IB_DEVICE_NAME_MAX);
 
 	ib_dev->owner			= THIS_MODULE;
 	ib_dev->node_type		= RDMA_NODE_IB_CA;
@@ -584,12 +526,27 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
 		ib_dev->uverbs_cmd_mask |= (1ULL << IB_USER_VERBS_CMD_REREG_MR);
 	}
 
+	/* MW */
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_MW) {
+		ib_dev->alloc_mw = hns_roce_alloc_mw;
+		ib_dev->dealloc_mw = hns_roce_dealloc_mw;
+		ib_dev->uverbs_cmd_mask |=
+					(1ULL << IB_USER_VERBS_CMD_ALLOC_MW) |
+					(1ULL << IB_USER_VERBS_CMD_DEALLOC_MW);
+	}
+
+	/* FRMR */
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR) {
+		ib_dev->alloc_mr		= hns_roce_alloc_mr;
+		ib_dev->map_mr_sg		= hns_roce_map_mr_sg;
+	}
+
 	/* OTHERS */
 	ib_dev->get_port_immutable	= hns_roce_port_immutable;
 	ib_dev->disassociate_ucontext	= hns_roce_disassociate_ucontext;
 
 	ib_dev->driver_id = RDMA_DRIVER_HNS;
-	ret = ib_register_device(ib_dev, NULL);
+	ret = ib_register_device(ib_dev, "hns_%d", NULL);
 	if (ret) {
 		dev_err(dev, "ib_register_device failed!\n");
 		return ret;

+ 204 - 8
drivers/infiniband/hw/hns/hns_roce_mr.c

@@ -329,7 +329,7 @@ static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages,
 	u64 bt_idx;
 	u64 size;
 
-	mhop_num = hr_dev->caps.pbl_hop_num;
+	mhop_num = (mr->type == MR_TYPE_FRMR ? 1 : hr_dev->caps.pbl_hop_num);
 	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
 	pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8);
 
@@ -351,7 +351,7 @@ static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages,
 
 		mr->pbl_size = npages;
 		mr->pbl_ba = mr->pbl_dma_addr;
-		mr->pbl_hop_num = hr_dev->caps.pbl_hop_num;
+		mr->pbl_hop_num = mhop_num;
 		mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
 		mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
 		return 0;
@@ -511,7 +511,6 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova,
 	mr->key = hw_index_to_key(index);	/* MR key */
 
 	if (size == ~0ull) {
-		mr->type = MR_TYPE_DMA;
 		mr->pbl_buf = NULL;
 		mr->pbl_dma_addr = 0;
 		/* PBL multi-hop addressing parameters */
@@ -522,7 +521,6 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova,
 		mr->pbl_l1_dma_addr = NULL;
 		mr->pbl_l0_dma_addr = 0;
 	} else {
-		mr->type = MR_TYPE_MR;
 		if (!hr_dev->caps.pbl_hop_num) {
 			mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
 							 &(mr->pbl_dma_addr),
@@ -548,9 +546,9 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev,
 	u32 mhop_num;
 	u64 bt_idx;
 
-	npages = ib_umem_page_count(mr->umem);
+	npages = mr->pbl_size;
 	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-	mhop_num = hr_dev->caps.pbl_hop_num;
+	mhop_num = (mr->type == MR_TYPE_FRMR) ? 1 : hr_dev->caps.pbl_hop_num;
 
 	if (mhop_num == HNS_ROCE_HOP_NUM_0)
 		return;
@@ -636,7 +634,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev,
 	}
 
 	if (mr->size != ~0ULL) {
-		npages = ib_umem_page_count(mr->umem);
+		if (mr->type == MR_TYPE_MR)
+			npages = ib_umem_page_count(mr->umem);
 
 		if (!hr_dev->caps.pbl_hop_num)
 			dma_free_coherent(dev, (unsigned int)(npages * 8),
@@ -674,7 +673,10 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev,
 		goto err_table;
 	}
 
-	ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx);
+	if (mr->type != MR_TYPE_FRMR)
+		ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx);
+	else
+		ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr);
 	if (ret) {
 		dev_err(dev, "Write mtpt fail!\n");
 		goto err_page;
@@ -855,6 +857,8 @@ struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc)
 	if (mr == NULL)
 		return  ERR_PTR(-ENOMEM);
 
+	mr->type = MR_TYPE_DMA;
+
 	/* Allocate memory region key */
 	ret = hns_roce_mr_alloc(to_hr_dev(pd->device), to_hr_pd(pd)->pdn, 0,
 				~0ULL, acc, 0, mr);
@@ -1031,6 +1035,8 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		}
 	}
 
+	mr->type = MR_TYPE_MR;
+
 	ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, virt_addr, length,
 				access_flags, n, mr);
 	if (ret)
@@ -1201,3 +1207,193 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr)
 
 	return ret;
 }
+
+struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+				u32 max_num_sg)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_mr *mr;
+	u64 length;
+	u32 page_size;
+	int ret;
+
+	page_size = 1 << (hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT);
+	length = max_num_sg * page_size;
+
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EINVAL);
+
+	if (max_num_sg > HNS_ROCE_FRMR_MAX_PA) {
+		dev_err(dev, "max_num_sg larger than %d\n",
+			HNS_ROCE_FRMR_MAX_PA);
+		return ERR_PTR(-EINVAL);
+	}
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	mr->type = MR_TYPE_FRMR;
+
+	/* Allocate memory region key */
+	ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, 0, length,
+				0, max_num_sg, mr);
+	if (ret)
+		goto err_free;
+
+	ret = hns_roce_mr_enable(hr_dev, mr);
+	if (ret)
+		goto err_mr;
+
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_mr:
+	hns_roce_mr_free(to_hr_dev(pd->device), mr);
+
+err_free:
+	kfree(mr);
+	return ERR_PTR(ret);
+}
+
+static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr)
+{
+	struct hns_roce_mr *mr = to_hr_mr(ibmr);
+
+	mr->pbl_buf[mr->npages++] = cpu_to_le64(addr);
+
+	return 0;
+}
+
+int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+		       unsigned int *sg_offset)
+{
+	struct hns_roce_mr *mr = to_hr_mr(ibmr);
+
+	mr->npages = 0;
+
+	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page);
+}
+
+static void hns_roce_mw_free(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_mw *mw)
+{
+	struct device *dev = hr_dev->dev;
+	int ret;
+
+	if (mw->enabled) {
+		ret = hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mw->rkey)
+					 & (hr_dev->caps.num_mtpts - 1));
+		if (ret)
+			dev_warn(dev, "MW HW2SW_MPT failed (%d)\n", ret);
+
+		hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table,
+				   key_to_hw_index(mw->rkey));
+	}
+
+	hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap,
+			     key_to_hw_index(mw->rkey), BITMAP_NO_RR);
+}
+
+static int hns_roce_mw_enable(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_mw *mw)
+{
+	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
+	struct hns_roce_cmd_mailbox *mailbox;
+	struct device *dev = hr_dev->dev;
+	unsigned long mtpt_idx = key_to_hw_index(mw->rkey);
+	int ret;
+
+	/* prepare HEM entry memory */
+	ret = hns_roce_table_get(hr_dev, &mr_table->mtpt_table, mtpt_idx);
+	if (ret)
+		return ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox)) {
+		ret = PTR_ERR(mailbox);
+		goto err_table;
+	}
+
+	ret = hr_dev->hw->mw_write_mtpt(mailbox->buf, mw);
+	if (ret) {
+		dev_err(dev, "MW write mtpt fail!\n");
+		goto err_page;
+	}
+
+	ret = hns_roce_sw2hw_mpt(hr_dev, mailbox,
+				 mtpt_idx & (hr_dev->caps.num_mtpts - 1));
+	if (ret) {
+		dev_err(dev, "MW sw2hw_mpt failed (%d)\n", ret);
+		goto err_page;
+	}
+
+	mw->enabled = 1;
+
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+	return 0;
+
+err_page:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+err_table:
+	hns_roce_table_put(hr_dev, &mr_table->mtpt_table, mtpt_idx);
+
+	return ret;
+}
+
+struct ib_mw *hns_roce_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
+				struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ib_pd->device);
+	struct hns_roce_mw *mw;
+	unsigned long index = 0;
+	int ret;
+
+	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
+	if (!mw)
+		return ERR_PTR(-ENOMEM);
+
+	/* Allocate a key for mw from bitmap */
+	ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index);
+	if (ret)
+		goto err_bitmap;
+
+	mw->rkey = hw_index_to_key(index);
+
+	mw->ibmw.rkey = mw->rkey;
+	mw->ibmw.type = type;
+	mw->pdn = to_hr_pd(ib_pd)->pdn;
+	mw->pbl_hop_num = hr_dev->caps.pbl_hop_num;
+	mw->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
+	mw->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
+
+	ret = hns_roce_mw_enable(hr_dev, mw);
+	if (ret)
+		goto err_mw;
+
+	return &mw->ibmw;
+
+err_mw:
+	hns_roce_mw_free(hr_dev, mw);
+
+err_bitmap:
+	kfree(mw);
+
+	return ERR_PTR(ret);
+}
+
+int hns_roce_dealloc_mw(struct ib_mw *ibmw)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibmw->device);
+	struct hns_roce_mw *mw = to_hr_mw(ibmw);
+
+	hns_roce_mw_free(hr_dev, mw);
+	kfree(mw);
+
+	return 0;
+}
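
The new alloc_mr/map_mr_sg entry points above are what the core fast-registration (FRMR) flow calls into. A hedged sketch of generic verbs usage that would exercise them from a kernel ULP; this is illustrative boilerplate, not code from this patch, and error unwinding is elided:

	static int frmr_reg_sketch(struct ib_pd *pd, struct ib_qp *qp,
				   struct scatterlist *sg, int sg_nents)
	{
		const struct ib_send_wr *bad_wr;
		struct ib_reg_wr reg_wr = {};
		struct ib_mr *mr;
		int n;

		mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, sg_nents);
		if (IS_ERR(mr))
			return PTR_ERR(mr);

		/* Fills the MR's page list via the driver's .map_mr_sg hook. */
		n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
		if (n != sg_nents)
			return -EINVAL;		/* sketch: mr is leaked here */

		reg_wr.wr.opcode = IB_WR_REG_MR;
		reg_wr.mr = mr;
		reg_wr.key = mr->rkey;
		reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;

		/* Posts the IB_WR_REG_MR work request on the send queue. */
		return ib_post_send(qp, &reg_wr.wr, &bad_wr);
	}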

+ 36 - 5
drivers/infiniband/hw/hns/hns_roce_qp.c

@@ -31,6 +31,7 @@
  * SOFTWARE.
  */
 
+#include <linux/pci.h>
 #include <linux/platform_device.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_umem.h>
@@ -343,6 +344,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
 {
 	u32 roundup_sq_stride = roundup_pow_of_two(hr_dev->caps.max_sq_desc_sz);
 	u8 max_sq_stride = ilog2(roundup_sq_stride);
+	u32 ex_sge_num;
 	u32 page_size;
 	u32 max_cnt;
 
@@ -372,7 +374,18 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
 	if (hr_qp->sq.max_gs > 2)
 		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
 							(hr_qp->sq.max_gs - 2));
+
+	if ((hr_qp->sq.max_gs > 2) && (hr_dev->pci_dev->revision == 0x20)) {
+		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
+			dev_err(hr_dev->dev,
+				"The extended sge cnt error! sge_cnt=%d\n",
+				hr_qp->sge.sge_cnt);
+			return -EINVAL;
+		}
+	}
+
 	hr_qp->sge.sge_shift = 4;
+	ex_sge_num = hr_qp->sge.sge_cnt;
 
 	/* Get buf size, SQ and RQ  are aligned to page_szie */
 	if (hr_dev->caps.max_sq_sg <= 2) {
@@ -386,6 +399,8 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
 					     hr_qp->sq.wqe_shift), PAGE_SIZE);
 	} else {
 		page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
+		hr_qp->sge.sge_cnt =
+		       max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num);
 		hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt <<
 					     hr_qp->rq.wqe_shift), page_size) +
 				   HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt <<
@@ -394,7 +409,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
 					     hr_qp->sq.wqe_shift), page_size);
 
 		hr_qp->sq.offset = 0;
-		if (hr_qp->sge.sge_cnt) {
+		if (ex_sge_num) {
 			hr_qp->sge.offset = HNS_ROCE_ALOGN_UP(
 							(hr_qp->sq.wqe_cnt <<
 							hr_qp->sq.wqe_shift),
@@ -465,6 +480,14 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 		hr_qp->sge.sge_shift = 4;
 	}
 
+	if ((hr_qp->sq.max_gs > 2) && hr_dev->pci_dev->revision == 0x20) {
+		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
+			dev_err(dev, "The extended sge cnt error! sge_cnt=%d\n",
+				hr_qp->sge.sge_cnt);
+			return -EINVAL;
+		}
+	}
+
 	/* Get buf size, SQ and RQ are aligned to PAGE_SIZE */
 	page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
 	hr_qp->sq.offset = 0;
@@ -472,6 +495,8 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 				 page_size);
 
 	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->sge.sge_cnt) {
+		hr_qp->sge.sge_cnt = max(page_size/(1 << hr_qp->sge.sge_shift),
+					(u32)hr_qp->sge.sge_cnt);
 		hr_qp->sge.offset = size;
 		size += HNS_ROCE_ALOGN_UP(hr_qp->sge.sge_cnt <<
 					  hr_qp->sge.sge_shift, page_size);
@@ -952,8 +977,8 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		}
 	}
 
-	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
-				IB_LINK_LAYER_ETHERNET)) {
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask)) {
 		dev_err(dev, "ib_modify_qp_is_ok failed\n");
 		goto out;
 	}
@@ -1106,14 +1131,20 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 	int reserved_from_top = 0;
+	int reserved_from_bot;
 	int ret;
 
 	spin_lock_init(&qp_table->lock);
 	INIT_RADIX_TREE(&hr_dev->qp_table_tree, GFP_ATOMIC);
 
-	/* A port include two SQP, six port total 12 */
+	/* In hw v1, a port include two SQP, six ports total 12 */
+	if (hr_dev->caps.max_sq_sg <= 2)
+		reserved_from_bot = SQP_NUM;
+	else
+		reserved_from_bot = hr_dev->caps.reserved_qps;
+
 	ret = hns_roce_bitmap_init(&qp_table->bitmap, hr_dev->caps.num_qps,
-				   hr_dev->caps.num_qps - 1, SQP_NUM,
+				   hr_dev->caps.num_qps - 1, reserved_from_bot,
 				   reserved_from_top);
 	if (ret) {
 		dev_err(hr_dev->dev, "qp bitmap init failed!error=%d\n",

+ 1 - 1
drivers/infiniband/hw/i40iw/i40iw_cm.c

@@ -1689,7 +1689,7 @@ static enum i40iw_status_code i40iw_add_mqh_6(struct i40iw_device *iwdev,
 	unsigned long flags;
 
 	rtnl_lock();
-	for_each_netdev_rcu(&init_net, ip_dev) {
+	for_each_netdev(&init_net, ip_dev) {
 		if ((((rdma_vlan_dev_vlan_id(ip_dev) < I40IW_NO_VLAN) &&
 		      (rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev)) ||
 		     (ip_dev == iwdev->netdev)) && (ip_dev->flags & IFF_UP)) {

+ 24 - 49
drivers/infiniband/hw/i40iw/i40iw_verbs.c

@@ -2135,10 +2135,10 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr)
 }
 
 /**
- * i40iw_show_rev
+ * hw_rev_show
  */
-static ssize_t i40iw_show_rev(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+static ssize_t hw_rev_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
 {
 	struct i40iw_ib_device *iwibdev = container_of(dev,
 						       struct i40iw_ib_device,
@@ -2147,34 +2147,37 @@ static ssize_t i40iw_show_rev(struct device *dev,
 
 	return sprintf(buf, "%x\n", hw_rev);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
 /**
- * i40iw_show_hca
+ * hca_type_show
  */
-static ssize_t i40iw_show_hca(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+static ssize_t hca_type_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "I40IW\n");
 }
+static DEVICE_ATTR_RO(hca_type);
 
 /**
- * i40iw_show_board
+ * board_id_show
  */
-static ssize_t i40iw_show_board(struct device *dev,
-				struct device_attribute *attr,
-				char *buf)
+static ssize_t board_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%.*s\n", 32, "I40IW Board ID");
 }
+static DEVICE_ATTR_RO(board_id);
 
-static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, i40iw_show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL);
+static struct attribute *i40iw_dev_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *i40iw_dev_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id
+static const struct attribute_group i40iw_attr_group = {
+	.attrs = i40iw_dev_attributes,
 };
 
 /**
@@ -2752,7 +2755,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
 		i40iw_pr_err("iwdev == NULL\n");
 		return NULL;
 	}
-	strlcpy(iwibdev->ibdev.name, "i40iw%d", IB_DEVICE_NAME_MAX);
 	iwibdev->ibdev.owner = THIS_MODULE;
 	iwdev->iwibdev = iwibdev;
 	iwibdev->iwdev = iwdev;
@@ -2850,20 +2852,6 @@ void i40iw_port_ibevent(struct i40iw_device *iwdev)
 	ib_dispatch_event(&event);
 }
 
-/**
- * i40iw_unregister_rdma_device - unregister of iwarp from IB
- * @iwibdev: rdma device ptr
- */
-static void i40iw_unregister_rdma_device(struct i40iw_ib_device *iwibdev)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i)
-		device_remove_file(&iwibdev->ibdev.dev,
-				   i40iw_dev_attributes[i]);
-	ib_unregister_device(&iwibdev->ibdev);
-}
-
 /**
  * i40iw_destroy_rdma_device - destroy rdma device and free resources
  * @iwibdev: IB device ptr
@@ -2873,7 +2861,7 @@ void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev)
 	if (!iwibdev)
 		return;
 
-	i40iw_unregister_rdma_device(iwibdev);
+	ib_unregister_device(&iwibdev->ibdev);
 	kfree(iwibdev->ibdev.iwcm);
 	iwibdev->ibdev.iwcm = NULL;
 	wait_event_timeout(iwibdev->iwdev->close_wq,
@@ -2888,32 +2876,19 @@ void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev)
  */
 int i40iw_register_rdma_device(struct i40iw_device *iwdev)
 {
-	int i, ret;
+	int ret;
 	struct i40iw_ib_device *iwibdev;
 
 	iwdev->iwibdev = i40iw_init_rdma_device(iwdev);
 	if (!iwdev->iwibdev)
 		return -ENOMEM;
 	iwibdev = iwdev->iwibdev;
-
+	rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group);
 	iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW;
-	ret = ib_register_device(&iwibdev->ibdev, NULL);
+	ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL);
 	if (ret)
 		goto error;
 
-	for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i) {
-		ret =
-		    device_create_file(&iwibdev->ibdev.dev,
-				       i40iw_dev_attributes[i]);
-		if (ret) {
-			while (i > 0) {
-				i--;
-				device_remove_file(&iwibdev->ibdev.dev, i40iw_dev_attributes[i]);
-			}
-			ib_unregister_device(&iwibdev->ibdev);
-			goto error;
-		}
-	}
 	return 0;
 error:
 	kfree(iwdev->iwibdev->ibdev.iwcm);
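
The i40iw conversion above (mlx4 follows the same recipe below) replaces the per-driver device_create_file() loop with DEVICE_ATTR_RO() attributes gathered into an attribute_group, handed to the core via rdma_set_device_sysfs_group() before ib_register_device(); the core then creates and removes the files, so the explicit unregister helper disappears. A condensed, driver-agnostic sketch of the pattern, with "mydrv" as a placeholder name:

	static ssize_t hw_rev_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
	{
		/* A real driver reads its hardware revision here. */
		return sprintf(buf, "%x\n", 0);
	}
	static DEVICE_ATTR_RO(hw_rev);

	static struct attribute *mydrv_attrs[] = {
		&dev_attr_hw_rev.attr,
		NULL
	};

	static const struct attribute_group mydrv_attr_group = {
		.attrs = mydrv_attrs,
	};

	static int mydrv_register(struct ib_device *ibdev)
	{
		rdma_set_device_sysfs_group(ibdev, &mydrv_attr_group);
		return ib_register_device(ibdev, "mydrv%d", NULL);
	}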

+ 1 - 0
drivers/infiniband/hw/mlx4/Kconfig

@@ -1,6 +1,7 @@
 config MLX4_INFINIBAND
 	tristate "Mellanox ConnectX HCA support"
 	depends on NETDEVICES && ETHERNET && PCI && INET
+	depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
 	depends on MAY_USE_DEVLINK
 	select NET_VENDOR_MELLANOX
 	select MLX4_CORE

+ 11 - 9
drivers/infiniband/hw/mlx4/mad.c

@@ -807,15 +807,17 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 	int err;
 	struct ib_port_attr pattr;
 
-	if (in_wc && in_wc->qp->qp_num) {
-		pr_debug("received MAD: slid:%d sqpn:%d "
-			"dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n",
-			in_wc->slid, in_wc->src_qp,
-			in_wc->dlid_path_bits,
-			in_wc->qp->qp_num,
-			in_wc->wc_flags,
-			in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
-			be16_to_cpu(in_mad->mad_hdr.attr_id));
+	if (in_wc && in_wc->qp) {
+		pr_debug("received MAD: port:%d slid:%d sqpn:%d "
+			 "dlid_bits:%d dqpn:%d wc_flags:0x%x tid:%016llx cls:%x mtd:%x atr:%x\n",
+			 port_num,
+			 in_wc->slid, in_wc->src_qp,
+			 in_wc->dlid_path_bits,
+			 in_wc->qp->qp_num,
+			 in_wc->wc_flags,
+			 be64_to_cpu(in_mad->mad_hdr.tid),
+			 in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
+			 be16_to_cpu(in_mad->mad_hdr.attr_id));
 		if (in_wc->wc_flags & IB_WC_GRH) {
 			pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
 				 be64_to_cpu(in_grh->sgid.global.subnet_prefix),

+ 43 - 139
drivers/infiniband/hw/mlx4/main.c

@@ -1140,144 +1140,50 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 	return 0;
 }
 
-static void  mlx4_ib_vma_open(struct vm_area_struct *area)
-{
-	/* vma_open is called when a new VMA is created on top of our VMA.
-	 * This is done through either mremap flow or split_vma (usually due
-	 * to mlock, madvise, munmap, etc.). We do not support a clone of the
-	 * vma, as this VMA is strongly hardware related. Therefore we set the
-	 * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
-	 * calling us again and trying to do incorrect actions. We assume that
-	 * the original vma size is exactly a single page that there will be no
-	 * "splitting" operations on.
-	 */
-	area->vm_ops = NULL;
-}
-
-static void  mlx4_ib_vma_close(struct vm_area_struct *area)
-{
-	struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data;
-
-	/* It's guaranteed that all VMAs opened on a FD are closed before the
-	 * file itself is closed, therefore no sync is needed with the regular
-	 * closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync
-	 * with accessing the vma as part of mlx4_ib_disassociate_ucontext.
-	 * The close operation is usually called under mm->mmap_sem except when
-	 * process is exiting.  The exiting case is handled explicitly as part
-	 * of mlx4_ib_disassociate_ucontext.
-	 */
-	mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *)
-				area->vm_private_data;
-
-	/* set the vma context pointer to null in the mlx4_ib driver's private
-	 * data to protect against a race condition in mlx4_ib_dissassociate_ucontext().
-	 */
-	mlx4_ib_vma_priv_data->vma = NULL;
-}
-
-static const struct vm_operations_struct mlx4_ib_vm_ops = {
-	.open = mlx4_ib_vma_open,
-	.close = mlx4_ib_vma_close
-};
-
 static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
 {
-	int i;
-	struct vm_area_struct *vma;
-	struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
-
-	/* need to protect from a race on closing the vma as part of
-	 * mlx4_ib_vma_close().
-	 */
-	for (i = 0; i < HW_BAR_COUNT; i++) {
-		vma = context->hw_bar_info[i].vma;
-		if (!vma)
-			continue;
-
-		zap_vma_ptes(context->hw_bar_info[i].vma,
-			     context->hw_bar_info[i].vma->vm_start, PAGE_SIZE);
-
-		context->hw_bar_info[i].vma->vm_flags &=
-			~(VM_SHARED | VM_MAYSHARE);
-		/* context going to be destroyed, should not access ops any more */
-		context->hw_bar_info[i].vma->vm_ops = NULL;
-	}
-}
-
-static void mlx4_ib_set_vma_data(struct vm_area_struct *vma,
-				 struct mlx4_ib_vma_private_data *vma_private_data)
-{
-	vma_private_data->vma = vma;
-	vma->vm_private_data = vma_private_data;
-	vma->vm_ops =  &mlx4_ib_vm_ops;
 }
 
 static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 {
 	struct mlx4_ib_dev *dev = to_mdev(context->device);
-	struct mlx4_ib_ucontext *mucontext = to_mucontext(context);
 
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
-		return -EINVAL;
-
-	if (vma->vm_pgoff == 0) {
-		/* We prevent double mmaping on same context */
-		if (mucontext->hw_bar_info[HW_BAR_DB].vma)
-			return -EINVAL;
-
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       to_mucontext(context)->uar.pfn,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-
-		mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]);
+	switch (vma->vm_pgoff) {
+	case 0:
+		return rdma_user_mmap_io(context, vma,
+					 to_mucontext(context)->uar.pfn,
+					 PAGE_SIZE,
+					 pgprot_noncached(vma->vm_page_prot));
 
-	} else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
-		/* We prevent double mmaping on same context */
-		if (mucontext->hw_bar_info[HW_BAR_BF].vma)
+	case 1:
+		if (dev->dev->caps.bf_reg_size == 0)
 			return -EINVAL;
+		return rdma_user_mmap_io(
+			context, vma,
+			to_mucontext(context)->uar.pfn +
+				dev->dev->caps.num_uars,
+			PAGE_SIZE, pgprot_writecombine(vma->vm_page_prot));
 
-		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       to_mucontext(context)->uar.pfn +
-				       dev->dev->caps.num_uars,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-
-		mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]);
-
-	} else if (vma->vm_pgoff == 3) {
+	case 3: {
 		struct mlx4_clock_params params;
 		int ret;
 
-		/* We prevent double mmaping on same context */
-		if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma)
-			return -EINVAL;
-
 		ret = mlx4_get_internal_clock_params(dev->dev, &params);
-
 		if (ret)
 			return ret;
 
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		if (io_remap_pfn_range(vma, vma->vm_start,
-				       (pci_resource_start(dev->dev->persist->pdev,
-							   params.bar) +
-					params.offset)
-				       >> PAGE_SHIFT,
-				       PAGE_SIZE, vma->vm_page_prot))
-			return -EAGAIN;
-
-		mlx4_ib_set_vma_data(vma,
-				     &mucontext->hw_bar_info[HW_BAR_CLOCK]);
-	} else {
-		return -EINVAL;
+		return rdma_user_mmap_io(
+			context, vma,
+			(pci_resource_start(dev->dev->persist->pdev,
+					    params.bar) +
+			 params.offset) >>
+				PAGE_SHIFT,
+			PAGE_SIZE, pgprot_noncached(vma->vm_page_prot));
 	}
 
-	return 0;
+	default:
+		return -EINVAL;
+	}
 }
 
 static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
@@ -2133,39 +2039,43 @@ out:
 	return err;
 }
 
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hca_type_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
 		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
 	return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
 }
+static DEVICE_ATTR_RO(hca_type);
 
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-			char *buf)
+static ssize_t hw_rev_show(struct device *device,
+			   struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
 		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
 	return sprintf(buf, "%x\n", dev->dev->rev_id);
 }
+static DEVICE_ATTR_RO(hw_rev);
 
-static ssize_t show_board(struct device *device, struct device_attribute *attr,
-			  char *buf)
+static ssize_t board_id_show(struct device *device,
+			     struct device_attribute *attr, char *buf)
 {
 	struct mlx4_ib_dev *dev =
 		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
 	return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN,
 		       dev->dev->board_id);
 }
+static DEVICE_ATTR_RO(board_id);
 
-static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+static struct attribute *mlx4_class_attributes[] = {
+	&dev_attr_hw_rev.attr,
+	&dev_attr_hca_type.attr,
+	&dev_attr_board_id.attr,
+	NULL
+};
 
-static struct device_attribute *mlx4_class_attributes[] = {
-	&dev_attr_hw_rev,
-	&dev_attr_hca_type,
-	&dev_attr_board_id
+static const struct attribute_group mlx4_attr_group = {
+	.attrs = mlx4_class_attributes,
 };
 
 struct diag_counter {
@@ -2636,7 +2546,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->dev = dev;
 	ibdev->bond_next_port	= 0;
 
-	strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
 	ibdev->ib_dev.owner		= THIS_MODULE;
 	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
 	ibdev->ib_dev.local_dma_lkey	= dev->caps.reserved_lkey;
@@ -2898,8 +2807,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	if (mlx4_ib_alloc_diag_counters(ibdev))
 		goto err_steer_free_bitmap;
 
+	rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group);
 	ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4;
-	if (ib_register_device(&ibdev->ib_dev, NULL))
+	if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL))
 		goto err_diag_counters;
 
 	if (mlx4_ib_mad_init(ibdev))
@@ -2922,12 +2832,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 			goto err_notif;
 	}
 
-	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
-		if (device_create_file(&ibdev->ib_dev.dev,
-				       mlx4_class_attributes[j]))
-			goto err_notif;
-	}
-
 	ibdev->ib_active = true;
 	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
 		devlink_port_type_ib_set(mlx4_get_devlink_port(dev, i),

+ 1 - 1
drivers/infiniband/hw/mlx4/mcg.c

@@ -673,7 +673,7 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work)
 			if (!list_empty(&group->pending_list))
 				req = list_first_entry(&group->pending_list,
 						struct mcast_req, group_list);
-			if ((method == IB_MGMT_METHOD_GET_RESP)) {
+			if (method == IB_MGMT_METHOD_GET_RESP) {
 					if (req) {
 						send_reply_to_slave(req->func, group, &req->sa_mad, status);
 						--group->func[req->func].num_pend_reqs;

+ 0 - 5
drivers/infiniband/hw/mlx4/mlx4_ib.h

@@ -80,16 +80,11 @@ enum hw_bar_type {
 	HW_BAR_COUNT
 };
 
-struct mlx4_ib_vma_private_data {
-	struct vm_area_struct *vma;
-};
-
 struct mlx4_ib_ucontext {
 	struct ib_ucontext	ibucontext;
 	struct mlx4_uar		uar;
 	struct list_head	db_page_list;
 	struct mutex		db_page_mutex;
-	struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
 	struct list_head	wqn_ranges_list;
 	struct mutex		wqn_ranges_mutex; /* protect wqn_ranges_list */
 };

+ 1 - 7
drivers/infiniband/hw/mlx4/qp.c

@@ -2629,7 +2629,6 @@ enum {
 static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 			      int attr_mask, struct ib_udata *udata)
 {
-	enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
 	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
 	enum ib_qp_state cur_state, new_state;
@@ -2639,13 +2638,8 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
 	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
 
-	if (cur_state != new_state || cur_state != IB_QPS_RESET) {
-		int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
-		ll = rdma_port_get_link_layer(&dev->ib_dev, port);
-	}
-
 	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
-				attr_mask, ll)) {
+				attr_mask)) {
 		pr_debug("qpn 0x%x: invalid attribute mask specified "
 			 "for transition %d to %d. qp_type %d,"
 			 " attr_mask 0x%x\n",

Some files were not shown because too many files have changed