
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull RDMA subsystem updates from Jason Gunthorpe:
 "Overall this cycle did not have any major excitement, and did not
  require any shared branch with netdev.

  Lots of driver updates, particularly of the scale-up and performance
  variety. The largest body of core work was Parav's patches fixing and
  restructuring some of the core code to make way for future RDMA
  containerization.

  Summary:

   - misc small driver fixups to
     bnxt_re/hfi1/qib/hns/ocrdma/rdmavt/vmw_pvrdma/nes

   - several major feature adds to bnxt_re driver: SRIOV VF RoCE
     support, HugePages support, extended hardware stats support, and
     SRQ support

   - a notable number of fixes to the i40iw driver from debugging
     scale-up testing

   - more work to enable the new hip08 chip in the hns driver

   - misc small ULP fixups to srp/srpt/ipoib

   - preparation for srp initiator and target to support the RDMA-CM
     protocol for connections

   - add RDMA-CM support to the srp initiator; the srp target is still a WIP

   - fixes for a couple of places where ipoib could spam the dmesg log

   - fix encode/decode of FDR/EDR data rates in the core

   - many patches from Parav with ongoing work to clean up
     inconsistencies and bugs in RoCE support around the rdma_cm

   - mlx5 driver support for the userspace features 'thread domain',
     'wallclock timestamps' and 'DV Direct Connected transport'. Support
     for the firmware dual-port RoCE capability

   - core support for more than 32 rdma devices in the char dev
     allocation

   - kernel doc updates from Randy Dunlap

   - new netlink uAPI for inspecting RDMA objects similar in spirit to 'ss'

   - one minor change to the kobject code acked by Greg KH"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (259 commits)
  RDMA/nldev: Provide detailed QP information
  RDMA/nldev: Provide global resource utilization
  RDMA/core: Add resource tracking for create and destroy PDs
  RDMA/core: Add resource tracking for create and destroy CQs
  RDMA/core: Add resource tracking for create and destroy QPs
  RDMA/restrack: Add general infrastructure to track RDMA resources
  RDMA/core: Save kernel caller name when creating PD and CQ objects
  RDMA/core: Use the MODNAME instead of the function name for pd callers
  RDMA: Move enum ib_cq_creation_flags to uapi headers
  IB/rxe: Change RDMA_RXE kconfig to use select
  IB/qib: remove qib_keys.c
  IB/mthca: remove mthca_user.h
  RDMA/cm: Fix access to uninitialized variable
  RDMA/cma: Use existing netif_is_bond_master function
  IB/core: Avoid SGID attributes query while converting GID from OPA to IB
  RDMA/mlx5: Avoid memory leak in case of XRCD dealloc failure
  IB/umad: Fix use of unprotected device pointer
  IB/iser: Combine substrings for three messages
  IB/iser: Delete an unnecessary variable initialisation in iser_send_data_out()
  IB/iser: Delete an error message for a failed memory allocation in iser_send_data_out()
  ...
Linus Torvalds, 7 years ago (commit 7b1cd95d65)
100 changed files with 7806 additions and 2806 deletions
  1. MAINTAINERS (+4, -3)
  2. drivers/infiniband/core/Makefile (+1, -1)
  3. drivers/infiniband/core/addr.c (+8, -57)
  4. drivers/infiniband/core/cache.c (+7, -16)
  5. drivers/infiniband/core/cm.c (+159, -68)
  6. drivers/infiniband/core/cma.c (+156, -96)
  7. drivers/infiniband/core/cma_configfs.c (+1, -1)
  8. drivers/infiniband/core/core_priv.h (+31, -21)
  9. drivers/infiniband/core/cq.c (+25, -14)
  10. drivers/infiniband/core/device.c (+22, -20)
  11. drivers/infiniband/core/fmr_pool.c (+5, -7)
  12. drivers/infiniband/core/iwpm_util.c (+1, -0)
  13. drivers/infiniband/core/mad.c (+0, -1)
  14. drivers/infiniband/core/netlink.c (+3, -7)
  15. drivers/infiniband/core/nldev.c (+388, -6)
  16. drivers/infiniband/core/restrack.c (+164, -0)
  17. drivers/infiniband/core/roce_gid_mgmt.c (+8, -5)
  18. drivers/infiniband/core/sa_query.c (+14, -4)
  19. drivers/infiniband/core/security.c (+4, -6)
  20. drivers/infiniband/core/sysfs.c (+0, -1)
  21. drivers/infiniband/core/ucm.c (+28, -45)
  22. drivers/infiniband/core/ucma.c (+10, -9)
  23. drivers/infiniband/core/umem.c (+1, -1)
  24. drivers/infiniband/core/user_mad.c (+57, -66)
  25. drivers/infiniband/core/uverbs_cmd.c (+7, -7)
  26. drivers/infiniband/core/uverbs_ioctl.c (+13, -6)
  27. drivers/infiniband/core/uverbs_main.c (+34, -61)
  28. drivers/infiniband/core/uverbs_std_types.c (+3, -0)
  29. drivers/infiniband/core/verbs.c (+177, -135)
  30. drivers/infiniband/hw/bnxt_re/bnxt_re.h (+34, -9)
  31. drivers/infiniband/hw/bnxt_re/hw_counters.c (+135, -10)
  32. drivers/infiniband/hw/bnxt_re/hw_counters.h (+39, -0)
  33. drivers/infiniband/hw/bnxt_re/ib_verbs.c (+348, -56)
  34. drivers/infiniband/hw/bnxt_re/ib_verbs.h (+20, -0)
  35. drivers/infiniband/hw/bnxt_re/main.c (+209, -42)
  36. drivers/infiniband/hw/bnxt_re/qplib_fp.c (+400, -63)
  37. drivers/infiniband/hw/bnxt_re/qplib_fp.h (+54, -24)
  38. drivers/infiniband/hw/bnxt_re/qplib_rcfw.c (+3, -2)
  39. drivers/infiniband/hw/bnxt_re/qplib_rcfw.h (+3, -4)
  40. drivers/infiniband/hw/bnxt_re/qplib_res.c (+4, -5)
  41. drivers/infiniband/hw/bnxt_re/qplib_sp.c (+136, -5)
  42. drivers/infiniband/hw/bnxt_re/qplib_sp.h (+88, -3)
  43. drivers/infiniband/hw/bnxt_re/roce_hsi.h (+126, -1)
  44. drivers/infiniband/hw/cxgb4/cm.c (+16, -11)
  45. drivers/infiniband/hw/cxgb4/device.c (+16, -20)
  46. drivers/infiniband/hw/cxgb4/ev.c (+1, -1)
  47. drivers/infiniband/hw/cxgb4/iw_cxgb4.h (+2, -2)
  48. drivers/infiniband/hw/cxgb4/qp.c (+3, -3)
  49. drivers/infiniband/hw/cxgb4/t4.h (+2, -2)
  50. drivers/infiniband/hw/hfi1/chip.c (+31, -56)
  51. drivers/infiniband/hw/hfi1/chip.h (+1, -1)
  52. drivers/infiniband/hw/hfi1/driver.c (+0, -16)
  53. drivers/infiniband/hw/hfi1/firmware.c (+16, -48)
  54. drivers/infiniband/hw/hfi1/hfi.h (+13, -12)
  55. drivers/infiniband/hw/hfi1/init.c (+2, -0)
  56. drivers/infiniband/hw/hfi1/mad.c (+1, -5)
  57. drivers/infiniband/hw/hfi1/qp.c (+8, -2)
  58. drivers/infiniband/hw/hfi1/rc.c (+4, -4)
  59. drivers/infiniband/hw/hfi1/verbs.c (+1, -5)
  60. drivers/infiniband/hw/hns/Makefile (+1, -1)
  61. drivers/infiniband/hw/hns/hns_roce_cmd.c (+1, -0)
  62. drivers/infiniband/hw/hns/hns_roce_cmd.h (+10, -0)
  63. drivers/infiniband/hw/hns/hns_roce_common.h (+11, -0)
  64. drivers/infiniband/hw/hns/hns_roce_cq.c (+10, -9)
  65. drivers/infiniband/hw/hns/hns_roce_device.h (+96, -7)
  66. drivers/infiniband/hw/hns/hns_roce_eq.c (+0, -759)
  67. drivers/infiniband/hw/hns/hns_roce_eq.h (+0, -134)
  68. drivers/infiniband/hw/hns/hns_roce_hw_v1.c (+741, -17)
  69. drivers/infiniband/hw/hns/hns_roce_hw_v1.h (+42, -2)
  70. drivers/infiniband/hw/hns/hns_roce_hw_v2.c (+1655, -182)
  71. drivers/infiniband/hw/hns/hns_roce_hw_v2.h (+278, -5)
  72. drivers/infiniband/hw/hns/hns_roce_main.c (+6, -10)
  73. drivers/infiniband/hw/hns/hns_roce_qp.c (+63, -9)
  74. drivers/infiniband/hw/i40iw/Kconfig (+0, -1)
  75. drivers/infiniband/hw/i40iw/i40iw.h (+3, -0)
  76. drivers/infiniband/hw/i40iw/i40iw_cm.c (+39, -29)
  77. drivers/infiniband/hw/i40iw/i40iw_cm.h (+4, -4)
  78. drivers/infiniband/hw/i40iw/i40iw_ctrl.c (+8, -17)
  79. drivers/infiniband/hw/i40iw/i40iw_d.h (+1, -0)
  80. drivers/infiniband/hw/i40iw/i40iw_hw.c (+2, -1)
  81. drivers/infiniband/hw/i40iw/i40iw_main.c (+10, -3)
  82. drivers/infiniband/hw/i40iw/i40iw_puda.c (+2, -3)
  83. drivers/infiniband/hw/i40iw/i40iw_puda.h (+1, -0)
  84. drivers/infiniband/hw/i40iw/i40iw_uk.c (+2, -16)
  85. drivers/infiniband/hw/i40iw/i40iw_user.h (+1, -2)
  86. drivers/infiniband/hw/i40iw/i40iw_utils.c (+47, -3)
  87. drivers/infiniband/hw/i40iw/i40iw_verbs.c (+2, -3)
  88. drivers/infiniband/hw/mlx4/cq.c (+2, -2)
  89. drivers/infiniband/hw/mlx4/main.c (+11, -8)
  90. drivers/infiniband/hw/mlx4/qp.c (+18, -2)
  91. drivers/infiniband/hw/mlx5/cong.c (+59, -24)
  92. drivers/infiniband/hw/mlx5/cq.c (+1, -1)
  93. drivers/infiniband/hw/mlx5/mad.c (+15, -8)
  94. drivers/infiniband/hw/mlx5/main.c (+1115, -296)
  95. drivers/infiniband/hw/mlx5/mlx5_ib.h (+91, -20)
  96. drivers/infiniband/hw/mlx5/mr.c (+3, -0)
  97. drivers/infiniband/hw/mlx5/odp.c (+0, -9)
  98. drivers/infiniband/hw/mlx5/qp.c (+405, -27)
  99. drivers/infiniband/hw/mthca/mthca_memfree.c (+3, -4)
  100. drivers/infiniband/hw/mthca/mthca_user.h (+0, -112)

+ 4 - 3
MAINTAINERS

@@ -6892,7 +6892,7 @@ M:	Jason Gunthorpe <jgg@mellanox.com>
 L:	linux-rdma@vger.kernel.org
 W:	http://www.openfabrics.org/
 Q:	http://patchwork.kernel.org/project/linux-rdma/list/
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git
 S:	Supported
 F:	Documentation/devicetree/bindings/infiniband/
 F:	Documentation/infiniband/
@@ -11218,7 +11218,8 @@ S:	Maintained
 F:	drivers/firmware/qemu_fw_cfg.c
 
 QIB DRIVER
-M:	Mike Marciniszyn <infinipath@intel.com>
+M:	Dennis Dalessandro <dennis.dalessandro@intel.com>
+M:	Mike Marciniszyn <mike.marciniszyn@intel.com>
 L:	linux-rdma@vger.kernel.org
 S:	Supported
 F:	drivers/infiniband/hw/qib/
@@ -11245,7 +11246,6 @@ F:	include/linux/qed/
 F:	drivers/net/ethernet/qlogic/qede/
 
 QLOGIC QL4xxx RDMA DRIVER
-M:	Ram Amrani <Ram.Amrani@cavium.com>
 M:	Michal Kalderon <Michal.Kalderon@cavium.com>
 M:	Ariel Elior <Ariel.Elior@cavium.com>
 L:	linux-rdma@vger.kernel.org
@@ -11507,6 +11507,7 @@ F:	drivers/net/ethernet/rdc/r6040.c
 
 RDMAVT - RDMA verbs software
 M:	Dennis Dalessandro <dennis.dalessandro@intel.com>
+M:	Mike Marciniszyn <mike.marciniszyn@intel.com>
 L:	linux-rdma@vger.kernel.org
 S:	Supported
 F:	drivers/infiniband/sw/rdmavt

+ 1 - 1
drivers/infiniband/core/Makefile

@@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
-				security.o nldev.o
+				security.o nldev.o restrack.o
 
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o

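The new restrack.o object wired into ib_core above is the "RDMA/restrack" infrastructure from the shortlog: a per-device registry of live PDs, CQs and QPs that the new nldev netlink code (the 'ss'-like uAPI mentioned in the summary) walks to report resource utilization. A rough sketch of that embedded-entry pattern, with illustrative names only, not the exact kernel API:

/* Illustrative stand-ins for the restrack pattern; the real API
 * lives in drivers/infiniband/core/restrack.c. */
#include <linux/list.h>
#include <linux/spinlock.h>

enum res_type { RES_PD, RES_CQ, RES_QP, RES_NUM };

struct res_entry {			/* embedded in each tracked object */
	struct list_head node;
	enum res_type type;
	const char *kern_name;		/* kernel module that created it */
};

struct res_root {			/* one instance per ib_device */
	rwlock_t lock;
	struct list_head heads[RES_NUM];
};

static void res_add(struct res_root *root, struct res_entry *e)
{
	write_lock(&root->lock);
	list_add_tail(&e->node, &root->heads[e->type]);
	write_unlock(&root->lock);
}

static void res_del(struct res_root *root, struct res_entry *e)
{
	write_lock(&root->lock);
	list_del(&e->node);
	write_unlock(&root->lock);
}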
+ 8 - 57
drivers/infiniband/core/addr.c

@@ -243,8 +243,7 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr,
 EXPORT_SYMBOL(rdma_copy_addr);
 
 int rdma_translate_ip(const struct sockaddr *addr,
-		      struct rdma_dev_addr *dev_addr,
-		      u16 *vlan_id)
+		      struct rdma_dev_addr *dev_addr)
 {
 	struct net_device *dev;
 
@@ -266,9 +265,6 @@ int rdma_translate_ip(const struct sockaddr *addr,
 			return -EADDRNOTAVAIL;
 
 		rdma_copy_addr(dev_addr, dev, NULL);
-		dev_addr->bound_dev_if = dev->ifindex;
-		if (vlan_id)
-			*vlan_id = rdma_vlan_dev_vlan_id(dev);
 		dev_put(dev);
 		break;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -279,9 +275,6 @@
 					  &((const struct sockaddr_in6 *)addr)->sin6_addr,
 					  dev, 1)) {
 				rdma_copy_addr(dev_addr, dev, NULL);
-				dev_addr->bound_dev_if = dev->ifindex;
-				if (vlan_id)
-					*vlan_id = rdma_vlan_dev_vlan_id(dev);
 				break;
 			}
 		}
@@ -481,7 +474,7 @@ static int addr_resolve_neigh(struct dst_entry *dst,
 	if (dst->dev->flags & IFF_LOOPBACK) {
 		int ret;
 
-		ret = rdma_translate_ip(dst_in, addr, NULL);
+		ret = rdma_translate_ip(dst_in, addr);
 		if (!ret)
 			memcpy(addr->dst_dev_addr, addr->src_dev_addr,
 			       MAX_ADDR_LEN);
@@ -558,7 +551,7 @@ static int addr_resolve(struct sockaddr *src_in,
 	}
 
 	if (ndev->flags & IFF_LOOPBACK) {
-		ret = rdma_translate_ip(dst_in, addr, NULL);
+		ret = rdma_translate_ip(dst_in, addr);
 		/*
 		 * Put the loopback device and get the translated
 		 * device instead.
@@ -744,7 +737,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr)
 EXPORT_SYMBOL(rdma_addr_cancel);
 
 struct resolve_cb_context {
-	struct rdma_dev_addr *addr;
 	struct completion comp;
 	int status;
 };
@@ -752,39 +744,31 @@ struct resolve_cb_context {
 static void resolve_cb(int status, struct sockaddr *src_addr,
 	     struct rdma_dev_addr *addr, void *context)
 {
-	if (!status)
-		memcpy(((struct resolve_cb_context *)context)->addr,
-		       addr, sizeof(struct rdma_dev_addr));
 	((struct resolve_cb_context *)context)->status = status;
 	complete(&((struct resolve_cb_context *)context)->comp);
 }
 
 int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
 				 const union ib_gid *dgid,
-				 u8 *dmac, u16 *vlan_id, int *if_index,
+				 u8 *dmac, const struct net_device *ndev,
 				 int *hoplimit)
 {
-	int ret = 0;
 	struct rdma_dev_addr dev_addr;
 	struct resolve_cb_context ctx;
-	struct net_device *dev;
-
 	union {
 		struct sockaddr     _sockaddr;
 		struct sockaddr_in  _sockaddr_in;
 		struct sockaddr_in6 _sockaddr_in6;
 	} sgid_addr, dgid_addr;
-
+	int ret;
 
 	rdma_gid2ip(&sgid_addr._sockaddr, sgid);
 	rdma_gid2ip(&dgid_addr._sockaddr, dgid);
 
 	memset(&dev_addr, 0, sizeof(dev_addr));
-	if (if_index)
-		dev_addr.bound_dev_if = *if_index;
+	dev_addr.bound_dev_if = ndev->ifindex;
 	dev_addr.net = &init_net;
 
-	ctx.addr = &dev_addr;
 	init_completion(&ctx.comp);
 	ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr,
 			&dev_addr, 1000, resolve_cb, &ctx);
@@ -798,42 +782,9 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
 		return ret;
 
 	memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
-	dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
-	if (!dev)
-		return -ENODEV;
-	if (if_index)
-		*if_index = dev_addr.bound_dev_if;
-	if (vlan_id)
-		*vlan_id = rdma_vlan_dev_vlan_id(dev);
-	if (hoplimit)
-		*hoplimit = dev_addr.hoplimit;
-	dev_put(dev);
-	return ret;
-}
-EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh);
-
-int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
-{
-	int ret = 0;
-	struct rdma_dev_addr dev_addr;
-	union {
-		struct sockaddr     _sockaddr;
-		struct sockaddr_in  _sockaddr_in;
-		struct sockaddr_in6 _sockaddr_in6;
-	} gid_addr;
-
-	rdma_gid2ip(&gid_addr._sockaddr, sgid);
-
-	memset(&dev_addr, 0, sizeof(dev_addr));
-	dev_addr.net = &init_net;
-	ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
-	if (ret)
-		return ret;
-
-	memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN);
-	return ret;
+	*hoplimit = dev_addr.hoplimit;
+	return 0;
 }
-EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid);
 
 static int netevent_callback(struct notifier_block *self, unsigned long event,
 	void *ctx)

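Note the signature change above: rdma_translate_ip() no longer takes a vlan_id out-parameter, so a call site is now a plain two-argument call, and callers that still need VLAN information must derive it from the bound netdev themselves. A minimal hypothetical caller under the new signature (only the rdma_translate_ip() prototype is taken from the patch):

/* Hypothetical caller of the two-argument rdma_translate_ip(). */
static int example_translate(const struct sockaddr *addr,
			     struct rdma_dev_addr *dev_addr)
{
	int ret;

	ret = rdma_translate_ip(addr, dev_addr);	/* was: (..., NULL) */
	if (ret)
		return ret;	/* e.g. -EADDRNOTAVAIL: no matching netdev */

	/* dev_addr->bound_dev_if now identifies the resolved interface */
	return 0;
}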
+ 7 - 16
drivers/infiniband/core/cache.c

@@ -573,27 +573,24 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
 		struct ib_gid_attr attr;
 
 		if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
-			goto next;
+			continue;
 
 		if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
-			goto next;
+			continue;
 
 		memcpy(&attr, &table->data_vec[i].attr, sizeof(attr));
 
-		if (filter(gid, &attr, context))
+		if (filter(gid, &attr, context)) {
 			found = true;
-
-next:
-		if (found)
+			if (index)
+				*index = i;
 			break;
+		}
 	}
 	read_unlock_irqrestore(&table->rwlock, flags);
 
 	if (!found)
 		return -ENOENT;
-
-	if (index)
-		*index = i;
 	return 0;
 }
 
@@ -824,12 +821,7 @@ static int gid_table_setup_one(struct ib_device *ib_dev)
 	if (err)
 		return err;
 
-	err = roce_rescan_device(ib_dev);
-
-	if (err) {
-		gid_table_cleanup_one(ib_dev);
-		gid_table_release_one(ib_dev);
-	}
+	rdma_roce_rescan_device(ib_dev);
 
 	return err;
 }
@@ -883,7 +875,6 @@ int ib_find_gid_by_filter(struct ib_device *device,
 					   port_num, filter,
 					   context, index);
 }
-EXPORT_SYMBOL(ib_find_gid_by_filter);
 
 int ib_get_cached_pkey(struct ib_device *device,
 		       u8                port_num,

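The first cache.c hunk above also tightens the search loop in ib_cache_gid_find_by_filter(): the goto-plus-flag exit is replaced with direct continue/break, and the match index is recorded while the table lock is still held. The same control-flow shape, reduced to a self-contained hypothetical helper:

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical find-first-match helper; *index is written only on
 * success, mirroring the reworked kernel loop. */
static int find_first(const int *vec, size_t len,
		      bool (*match)(int v), size_t *index)
{
	for (size_t i = 0; i < len; i++) {
		if (!match(vec[i]))
			continue;	/* was: goto next */
		if (index)
			*index = i;	/* record while entry is still valid */
		return 0;
	}
	return -1;			/* the kernel version returns -ENOENT */
}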
+ 159 - 68
drivers/infiniband/core/cm.c

@@ -452,13 +452,14 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv,
 	cm_id_priv->private_data_len = private_data_len;
 }
 
-static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
-				    struct ib_grh *grh, struct cm_av *av)
+static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+				   struct ib_grh *grh, struct cm_av *av)
 {
 	av->port = port;
 	av->pkey_index = wc->pkey_index;
-	ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc,
-			   grh, &av->ah_attr);
+	return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
+				       port->port_num, wc,
+				       grh, &av->ah_attr);
 }
 
 static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av,
@@ -494,8 +495,11 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av,
 		return ret;
 
 	av->port = port;
-	ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
-			     &av->ah_attr);
+	ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path,
+					&av->ah_attr);
+	if (ret)
+		return ret;
+
 	av->timeout = path->packet_life_time + 1;
 
 	spin_lock_irqsave(&cm.lock, flags);
@@ -1560,6 +1564,35 @@ static u16 cm_get_bth_pkey(struct cm_work *work)
 	return pkey;
 }
 
+/**
+ * Convert OPA SGID to IB SGID
+ * ULPs (such as IPoIB) do not understand OPA GIDs and will
+ * reject them as the local_gid will not match the sgid. Therefore,
+ * change the pathrec's SGID to an IB SGID.
+ *
+ * @work: Work completion
+ * @path: Path record
+ */
+static void cm_opa_to_ib_sgid(struct cm_work *work,
+			      struct sa_path_rec *path)
+{
+	struct ib_device *dev = work->port->cm_dev->ib_device;
+	u8 port_num = work->port->port_num;
+
+	if (rdma_cap_opa_ah(dev, port_num) &&
+	    (ib_is_opa_gid(&path->sgid))) {
+		union ib_gid sgid;
+
+		if (ib_get_cached_gid(dev, port_num, 0, &sgid, NULL)) {
+			dev_warn(&dev->dev,
+				 "Error updating sgid in CM request\n");
+			return;
+		}
+
+		path->sgid = sgid;
+	}
+}
+
 static void cm_format_req_event(struct cm_work *work,
 				struct cm_id_private *cm_id_priv,
 				struct ib_cm_id *listen_id)
@@ -1573,10 +1606,13 @@ static void cm_format_req_event(struct cm_work *work,
 	param->bth_pkey = cm_get_bth_pkey(work);
 	param->port = cm_id_priv->av.port->port_num;
 	param->primary_path = &work->path[0];
-	if (cm_req_has_alt_path(req_msg))
+	cm_opa_to_ib_sgid(work, param->primary_path);
+	if (cm_req_has_alt_path(req_msg)) {
 		param->alternate_path = &work->path[1];
-	else
+		cm_opa_to_ib_sgid(work, param->alternate_path);
+	} else {
 		param->alternate_path = NULL;
+	}
 	param->remote_ca_guid = req_msg->local_ca_guid;
 	param->remote_qkey = be32_to_cpu(req_msg->local_qkey);
 	param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg));
@@ -1826,9 +1862,11 @@ static int cm_req_handler(struct cm_work *work)
 
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	cm_id_priv->id.remote_id = req_msg->local_comm_id;
-	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
-				work->mad_recv_wc->recv_buf.grh,
-				&cm_id_priv->av);
+	ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				      work->mad_recv_wc->recv_buf.grh,
+				      &cm_id_priv->av);
+	if (ret)
+		goto destroy;
 	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
 							    id.local_id);
 	if (IS_ERR(cm_id_priv->timewait_info)) {
@@ -1841,9 +1879,10 @@ static int cm_req_handler(struct cm_work *work)
 
 	listen_cm_id_priv = cm_match_req(work, cm_id_priv);
 	if (!listen_cm_id_priv) {
+		pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__,
+			 be32_to_cpu(cm_id->local_id));
 		ret = -EINVAL;
-		kfree(cm_id_priv->timewait_info);
-		goto destroy;
+		goto free_timeinfo;
 	}
 
 	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
@@ -1861,56 +1900,50 @@ static int cm_req_handler(struct cm_work *work)
 				work->port->port_num,
 				grh->sgid_index,
 				&gid, &gid_attr);
-	if (!ret) {
-		if (gid_attr.ndev) {
-			work->path[0].rec_type =
-				sa_conv_gid_to_pathrec_type(gid_attr.gid_type);
-			sa_path_set_ifindex(&work->path[0],
-					    gid_attr.ndev->ifindex);
-			sa_path_set_ndev(&work->path[0],
-					 dev_net(gid_attr.ndev));
-			dev_put(gid_attr.ndev);
-		} else {
-			cm_path_set_rec_type(work->port->cm_dev->ib_device,
-					     work->port->port_num,
-					     &work->path[0],
-					     &req_msg->primary_local_gid);
-		}
-		if (cm_req_has_alt_path(req_msg))
-			work->path[1].rec_type = work->path[0].rec_type;
-		cm_format_paths_from_req(req_msg, &work->path[0],
-					 &work->path[1]);
-		if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
-			sa_path_set_dmac(&work->path[0],
-					 cm_id_priv->av.ah_attr.roce.dmac);
-		work->path[0].hop_limit = grh->hop_limit;
-		ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av,
-					 cm_id_priv);
+	if (ret) {
+		ib_send_cm_rej(cm_id, IB_CM_REJ_UNSUPPORTED, NULL, 0, NULL, 0);
+		goto rejected;
+	}
+
+	if (gid_attr.ndev) {
+		work->path[0].rec_type =
+			sa_conv_gid_to_pathrec_type(gid_attr.gid_type);
+		sa_path_set_ifindex(&work->path[0],
+				    gid_attr.ndev->ifindex);
+		sa_path_set_ndev(&work->path[0],
+				 dev_net(gid_attr.ndev));
+		dev_put(gid_attr.ndev);
+	} else {
+		cm_path_set_rec_type(work->port->cm_dev->ib_device,
+				     work->port->port_num,
+				     &work->path[0],
+				     &req_msg->primary_local_gid);
 	}
+	if (cm_req_has_alt_path(req_msg))
+		work->path[1].rec_type = work->path[0].rec_type;
+	cm_format_paths_from_req(req_msg, &work->path[0],
+				 &work->path[1]);
+	if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
+		sa_path_set_dmac(&work->path[0],
+				 cm_id_priv->av.ah_attr.roce.dmac);
+	work->path[0].hop_limit = grh->hop_limit;
+	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av,
+				 cm_id_priv);
 	if (ret) {
-		int err = ib_get_cached_gid(work->port->cm_dev->ib_device,
-					    work->port->port_num, 0,
-					    &work->path[0].sgid,
-					    &gid_attr);
-		if (!err && gid_attr.ndev) {
-			work->path[0].rec_type =
-				sa_conv_gid_to_pathrec_type(gid_attr.gid_type);
-			sa_path_set_ifindex(&work->path[0],
-					    gid_attr.ndev->ifindex);
-			sa_path_set_ndev(&work->path[0],
-					 dev_net(gid_attr.ndev));
-			dev_put(gid_attr.ndev);
-		} else {
-			cm_path_set_rec_type(work->port->cm_dev->ib_device,
-					     work->port->port_num,
-					     &work->path[0],
-					     &req_msg->primary_local_gid);
-		}
-		if (cm_req_has_alt_path(req_msg))
-			work->path[1].rec_type = work->path[0].rec_type;
-		ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
-			       &work->path[0].sgid, sizeof work->path[0].sgid,
-			       NULL, 0);
+		int err;
+
+		err = ib_get_cached_gid(work->port->cm_dev->ib_device,
+					work->port->port_num, 0,
+					&work->path[0].sgid,
+					NULL);
+		if (err)
+			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+				       NULL, 0, NULL, 0);
+		else
+			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+				       &work->path[0].sgid,
+				       sizeof(work->path[0].sgid),
+				       NULL, 0);
 		goto rejected;
 	}
 	if (cm_req_has_alt_path(req_msg)) {
@@ -1919,7 +1952,7 @@ static int cm_req_handler(struct cm_work *work)
 		if (ret) {
 			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
 				       &work->path[0].sgid,
-				       sizeof work->path[0].sgid, NULL, 0);
+				       sizeof(work->path[0].sgid), NULL, 0);
 			goto rejected;
 		}
 	}
@@ -1945,6 +1978,8 @@ static int cm_req_handler(struct cm_work *work)
 rejected:
 	atomic_dec(&cm_id_priv->refcount);
 	cm_deref_id(listen_cm_id_priv);
+free_timeinfo:
+	kfree(cm_id_priv->timewait_info);
destroy:
 	ib_destroy_cm_id(cm_id);
 	return ret;
@@ -1997,6 +2032,8 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	if (cm_id->state != IB_CM_REQ_RCVD &&
 	    cm_id->state != IB_CM_MRA_REQ_SENT) {
+		pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__,
+			 be32_to_cpu(cm_id_priv->id.local_id), cm_id->state);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2063,6 +2100,8 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id,
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	if (cm_id->state != IB_CM_REP_RCVD &&
 	    cm_id->state != IB_CM_MRA_REP_SENT) {
+		pr_debug("%s: local_id %d, cm_id->state %d\n", __func__,
+			 be32_to_cpu(cm_id->local_id), cm_id->state);
 		ret = -EINVAL;
 		goto error;
 	}
@@ -2170,6 +2209,8 @@ static int cm_rep_handler(struct cm_work *work)
 	cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
 	if (!cm_id_priv) {
 		cm_dup_rep_handler(work);
+		pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__,
+			 be32_to_cpu(rep_msg->remote_comm_id));
 		return -EINVAL;
 	}
 
@@ -2183,6 +2224,10 @@ static int cm_rep_handler(struct cm_work *work)
 	default:
 		spin_unlock_irq(&cm_id_priv->lock);
 		ret = -EINVAL;
+		pr_debug("%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n",
+			 __func__, cm_id_priv->id.state,
+			 be32_to_cpu(rep_msg->local_comm_id),
+			 be32_to_cpu(rep_msg->remote_comm_id));
 		goto error;
 	}
 
@@ -2196,6 +2241,8 @@ static int cm_rep_handler(struct cm_work *work)
 		spin_unlock(&cm.lock);
 		spin_unlock_irq(&cm_id_priv->lock);
 		ret = -EINVAL;
+		pr_debug("%s: Failed to insert remote id %d\n", __func__,
+			 be32_to_cpu(rep_msg->remote_comm_id));
 		goto error;
 	}
 	/* Check for a stale connection. */
@@ -2213,6 +2260,10 @@ static int cm_rep_handler(struct cm_work *work)
 			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
 			     NULL, 0);
 		ret = -EINVAL;
+		pr_debug("%s: Stale connection. local_comm_id %d, remote_comm_id %d\n",
+			 __func__, be32_to_cpu(rep_msg->local_comm_id),
+			 be32_to_cpu(rep_msg->remote_comm_id));
+
 		if (cur_cm_id_priv) {
 			cm_id = &cur_cm_id_priv->id;
 			ib_send_cm_dreq(cm_id, NULL, 0);
@@ -2359,6 +2410,8 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id,
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	if (cm_id->state != IB_CM_ESTABLISHED) {
+		pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
+			 be32_to_cpu(cm_id->local_id), cm_id->state);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2428,6 +2481,8 @@ int ib_send_cm_drep(struct ib_cm_id *cm_id,
 	if (cm_id->state != IB_CM_DREQ_RCVD) {
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		kfree(data);
+		pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n",
+			 __func__, be32_to_cpu(cm_id->local_id), cm_id->state);
 		return -EINVAL;
 	}
 
@@ -2493,6 +2548,9 @@ static int cm_dreq_handler(struct cm_work *work)
 		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
 				counter[CM_DREQ_COUNTER]);
 		cm_issue_drep(work->port, work->mad_recv_wc);
+		pr_debug("%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n",
+			 __func__, be32_to_cpu(dreq_msg->local_comm_id),
+			 be32_to_cpu(dreq_msg->remote_comm_id));
 		return -EINVAL;
 	}
 
@@ -2535,6 +2593,9 @@ static int cm_dreq_handler(struct cm_work *work)
 				counter[CM_DREQ_COUNTER]);
 		goto unlock;
 	default:
+		pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+			 __func__, be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
 		goto unlock;
 	}
 	cm_id_priv->id.state = IB_CM_DREQ_RCVD;
@@ -2638,6 +2699,8 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
 		cm_enter_timewait(cm_id_priv);
 		break;
 	default:
+		pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
+			 be32_to_cpu(cm_id_priv->id.local_id), cm_id->state);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2748,7 +2811,9 @@ static int cm_rej_handler(struct cm_work *work)
 		/* fall through */
 	default:
 		spin_unlock_irq(&cm_id_priv->lock);
+		pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+			 __func__, be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2811,6 +2877,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
 		}
 		/* fall through */
 	default:
+		pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+			 __func__, be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
 		ret = -EINVAL;
 		goto error1;
 	}
@@ -2912,6 +2981,9 @@ static int cm_mra_handler(struct cm_work *work)
 				counter[CM_MRA_COUNTER]);
 		/* fall through */
 	default:
+		pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n",
+			 __func__, be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
 		goto out;
 	}
 
@@ -3085,6 +3157,12 @@ static int cm_lap_handler(struct cm_work *work)
 	if (!cm_id_priv)
 		return -EINVAL;
 
+	ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				      work->mad_recv_wc->recv_buf.grh,
+				      &cm_id_priv->av);
+	if (ret)
+		goto deref;
+
 	param = &work->cm_event.param.lap_rcvd;
 	memset(&work->path[0], 0, sizeof(work->path[1]));
 	cm_path_set_rec_type(work->port->cm_dev->ib_device,
@@ -3131,9 +3209,6 @@ static int cm_lap_handler(struct cm_work *work)
 
 	cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
 	cm_id_priv->tid = lap_msg->hdr.tid;
-	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
-				work->mad_recv_wc->recv_buf.grh,
-				&cm_id_priv->av);
 	cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av,
 			   cm_id_priv);
 	ret = atomic_inc_and_test(&cm_id_priv->work_count);
@@ -3386,6 +3461,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
 	struct cm_sidr_req_msg *sidr_req_msg;
 	struct ib_wc *wc;
+	int ret;
 
 	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
 	if (IS_ERR(cm_id))
@@ -3398,9 +3474,12 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	wc = work->mad_recv_wc->wc;
 	cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
 	cm_id_priv->av.dgid.global.interface_id = 0;
-	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
-				work->mad_recv_wc->recv_buf.grh,
-				&cm_id_priv->av);
+	ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				      work->mad_recv_wc->recv_buf.grh,
+				      &cm_id_priv->av);
+	if (ret)
+		goto out;
+
 	cm_id_priv->id.remote_id = sidr_req_msg->request_id;
 	cm_id_priv->tid = sidr_req_msg->hdr.tid;
 	atomic_inc(&cm_id_priv->work_count);
@@ -3692,6 +3771,7 @@ static void cm_work_handler(struct work_struct *_work)
 		ret = cm_timewait_handler(work);
 		break;
 	default:
+		pr_debug("cm_event.event: 0x%x\n", work->cm_event.event);
 		ret = -EINVAL;
 		break;
 	}
@@ -3727,6 +3807,8 @@ static int cm_establish(struct ib_cm_id *cm_id)
 		ret = -EISCONN;
 		break;
 	default:
+		pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
+			 be32_to_cpu(cm_id->local_id), cm_id->state);
 		ret = -EINVAL;
 		break;
 	}
@@ -3924,6 +4006,9 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
 		ret = 0;
 		break;
 	default:
+		pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+			 __func__, be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
 		ret = -EINVAL;
 		break;
 	}
@@ -3971,6 +4056,9 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
 		ret = 0;
 		break;
 	default:
+		pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+			 __func__, be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
 		ret = -EINVAL;
 		break;
 	}
@@ -4030,6 +4118,9 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
 		ret = 0;
 		break;
 	default:
+		pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
+			 __func__, be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
 		ret = -EINVAL;
 		break;
 	}

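A recurring change in the cm.c diff above is converting void helpers such as cm_init_av_for_response() to return int and making every call site check the result and unwind. That shape, reduced to a self-contained sketch (the function names here are illustrative, not the kernel's):

#include <stdio.h>

static int init_av(int good)
{
	return good ? 0 : -22;		/* -EINVAL on failure */
}

static int req_handler(int good)
{
	int ret;

	ret = init_av(good);		/* was: a void call with no check */
	if (ret)
		goto destroy;		/* unwind instead of continuing */
	/* ... normal request processing ... */
	return 0;

destroy:
	fprintf(stderr, "init_av failed: %d\n", ret);
	return ret;
}

int main(void)
{
	return req_handler(0) ? 1 : 0;
}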
+ 156 - 96
drivers/infiniband/core/cma.c

@@ -601,7 +601,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a
 	int ret;
 
 	if (addr->sa_family != AF_IB) {
-		ret = rdma_translate_ip(addr, dev_addr, NULL);
+		ret = rdma_translate_ip(addr, dev_addr);
 	} else {
 		cma_translate_ib((struct sockaddr_ib *) addr, dev_addr);
 		ret = 0;
@@ -612,11 +612,14 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a
 
 static inline int cma_validate_port(struct ib_device *device, u8 port,
 				    enum ib_gid_type gid_type,
-				      union ib_gid *gid, int dev_type,
-				      int bound_if_index)
+				    union ib_gid *gid,
+				    struct rdma_id_private *id_priv)
 {
-	int ret = -ENODEV;
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	int bound_if_index = dev_addr->bound_dev_if;
+	int dev_type = dev_addr->dev_type;
 	struct net_device *ndev = NULL;
+	int ret = -ENODEV;
 
 	if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
 		return ret;
@@ -624,11 +627,13 @@ static inline int cma_validate_port(struct ib_device *device, u8 port,
 	if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
 		return ret;
 
-	if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port))
-		ndev = dev_get_by_index(&init_net, bound_if_index);
-	else
+	if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
+		ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+		if (!ndev)
+			return ret;
+	} else {
 		gid_type = IB_GID_TYPE_IB;
-
+	}
 
 	ret = ib_find_cached_gid_by_port(device, gid, gid_type, port,
 					 ndev, NULL);
@@ -669,8 +674,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
 					rdma_protocol_ib(cma_dev->device, port) ?
 					IB_GID_TYPE_IB :
 					listen_id_priv->gid_type, gidp,
-					dev_addr->dev_type,
-					dev_addr->bound_dev_if);
+					id_priv);
 		if (!ret) {
 			id_priv->id.port_num = port;
 			goto out;
@@ -691,8 +695,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
 						rdma_protocol_ib(cma_dev->device, port) ?
 						IB_GID_TYPE_IB :
 						cma_dev->default_gid_type[port - 1],
-						gidp, dev_addr->dev_type,
-						dev_addr->bound_dev_if);
+						gidp, id_priv);
 			if (!ret) {
 				id_priv->id.port_num = port;
 				goto out;
@@ -2036,6 +2039,33 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
 }
 EXPORT_SYMBOL(rdma_get_service_id);
 
+void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid,
+		    union ib_gid *dgid)
+{
+	struct rdma_addr *addr = &cm_id->route.addr;
+
+	if (!cm_id->device) {
+		if (sgid)
+			memset(sgid, 0, sizeof(*sgid));
+		if (dgid)
+			memset(dgid, 0, sizeof(*dgid));
+		return;
+	}
+
+	if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) {
+		if (sgid)
+			rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid);
+		if (dgid)
+			rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid);
+	} else {
+		if (sgid)
+			rdma_addr_get_sgid(&addr->dev_addr, sgid);
+		if (dgid)
+			rdma_addr_get_dgid(&addr->dev_addr, dgid);
+	}
+}
+EXPORT_SYMBOL(rdma_read_gids);
+
 static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 {
 	struct rdma_id_private *id_priv = iw_id->context;
@@ -2132,7 +2162,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
 	conn_id->state = RDMA_CM_CONNECT;
 
-	ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL);
+	ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr);
 	if (ret) {
 		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
@@ -2414,6 +2444,26 @@ out:
 	kfree(work);
 }
 
+static void cma_init_resolve_route_work(struct cma_work *work,
+					struct rdma_id_private *id_priv)
+{
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+}
+
+static void cma_init_resolve_addr_work(struct cma_work *work,
+				       struct rdma_id_private *id_priv)
+{
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = RDMA_CM_ADDR_QUERY;
+	work->new_state = RDMA_CM_ADDR_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+}
+
 static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
 {
 	struct rdma_route *route = &id_priv->id.route;
@@ -2424,11 +2474,7 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
 	if (!work)
 		return -ENOMEM;
 
-	work->id = id_priv;
-	INIT_WORK(&work->work, cma_work_handler);
-	work->old_state = RDMA_CM_ROUTE_QUERY;
-	work->new_state = RDMA_CM_ROUTE_RESOLVED;
-	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	cma_init_resolve_route_work(work, id_priv);
 
 	route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
 	if (!route->path_rec) {
@@ -2449,10 +2495,63 @@ err1:
 	return ret;
 }
 
-int rdma_set_ib_paths(struct rdma_cm_id *id,
-		      struct sa_path_rec *path_rec, int num_paths)
+static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type,
+					   unsigned long supported_gids,
+					   enum ib_gid_type default_gid)
+{
+	if ((network_type == RDMA_NETWORK_IPV4 ||
+	     network_type == RDMA_NETWORK_IPV6) &&
+	    test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids))
+		return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+	return default_gid;
+}
+
+/*
+ * cma_iboe_set_path_rec_l2_fields() is helper function which sets
+ * path record type based on GID type.
+ * It also sets up other L2 fields which includes destination mac address
+ * netdev ifindex, of the path record.
+ * It returns the netdev of the bound interface for this path record entry.
+ */
+static struct net_device *
+cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	enum ib_gid_type gid_type = IB_GID_TYPE_ROCE;
+	struct rdma_addr *addr = &route->addr;
+	unsigned long supported_gids;
+	struct net_device *ndev;
+
+	if (!addr->dev_addr.bound_dev_if)
+		return NULL;
+
+	ndev = dev_get_by_index(addr->dev_addr.net,
+				addr->dev_addr.bound_dev_if);
+	if (!ndev)
+		return NULL;
+
+	supported_gids = roce_gid_type_mask_support(id_priv->id.device,
+						    id_priv->id.port_num);
+	gid_type = cma_route_gid_type(addr->dev_addr.network,
+				      supported_gids,
+				      id_priv->gid_type);
+	/* Use the hint from IP Stack to select GID Type */
+	if (gid_type < ib_network_to_gid_type(addr->dev_addr.network))
+		gid_type = ib_network_to_gid_type(addr->dev_addr.network);
+	route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type);
+
+	sa_path_set_ndev(route->path_rec, addr->dev_addr.net);
+	sa_path_set_ifindex(route->path_rec, ndev->ifindex);
+	sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr);
+	return ndev;
+}
+
+int rdma_set_ib_path(struct rdma_cm_id *id,
+		     struct sa_path_rec *path_rec)
 {
 	struct rdma_id_private *id_priv;
+	struct net_device *ndev;
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
@@ -2460,20 +2559,33 @@ int rdma_set_ib_paths(struct rdma_cm_id *id,
 			   RDMA_CM_ROUTE_RESOLVED))
 		return -EINVAL;
 
-	id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths,
+	id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec),
 				     GFP_KERNEL);
 	if (!id->route.path_rec) {
 		ret = -ENOMEM;
 		goto err;
 	}
 
-	id->route.num_paths = num_paths;
+	if (rdma_protocol_roce(id->device, id->port_num)) {
+		ndev = cma_iboe_set_path_rec_l2_fields(id_priv);
+		if (!ndev) {
+			ret = -ENODEV;
+			goto err_free;
+		}
+		dev_put(ndev);
+	}
+
+	id->route.num_paths = 1;
 	return 0;
+
+err_free:
+	kfree(id->route.path_rec);
+	id->route.path_rec = NULL;
 err:
 	cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
 	return ret;
 }
-EXPORT_SYMBOL(rdma_set_ib_paths);
+EXPORT_SYMBOL(rdma_set_ib_path);
 
 static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
 {
@@ -2483,11 +2595,7 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
 	if (!work)
 		return -ENOMEM;
 
-	work->id = id_priv;
-	INIT_WORK(&work->work, cma_work_handler);
-	work->old_state = RDMA_CM_ROUTE_QUERY;
-	work->new_state = RDMA_CM_ROUTE_RESOLVED;
-	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	cma_init_resolve_route_work(work, id_priv);
 	queue_work(cma_wq, &work->work);
 	return 0;
 }
@@ -2510,26 +2618,14 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos)
 	return 0;
 }
 
-static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type,
-					   unsigned long supported_gids,
-					   enum ib_gid_type default_gid)
-{
-	if ((network_type == RDMA_NETWORK_IPV4 ||
-	     network_type == RDMA_NETWORK_IPV6) &&
-	    test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids))
-		return IB_GID_TYPE_ROCE_UDP_ENCAP;
-
-	return default_gid;
-}
-
 static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 {
 	struct rdma_route *route = &id_priv->id.route;
 	struct rdma_addr *addr = &route->addr;
 	struct cma_work *work;
 	int ret;
-	struct net_device *ndev = NULL;
-	enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+	struct net_device *ndev;
+
 	u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num -
 					rdma_start_port(id_priv->cma_dev->device)];
 	u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos;
@@ -2539,9 +2635,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 	if (!work)
 		return -ENOMEM;
 
-	work->id = id_priv;
-	INIT_WORK(&work->work, cma_work_handler);
-
 	route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
 	if (!route->path_rec) {
 		ret = -ENOMEM;
@@ -2550,42 +2643,17 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 
 	route->num_paths = 1;
 
-	if (addr->dev_addr.bound_dev_if) {
-		unsigned long supported_gids;
-
-		ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
-		if (!ndev) {
-			ret = -ENODEV;
-			goto err2;
-		}
-
-		supported_gids = roce_gid_type_mask_support(id_priv->id.device,
-							    id_priv->id.port_num);
-		gid_type = cma_route_gid_type(addr->dev_addr.network,
-					      supported_gids,
-					      id_priv->gid_type);
-		route->path_rec->rec_type =
-			sa_conv_gid_to_pathrec_type(gid_type);
-		sa_path_set_ndev(route->path_rec, &init_net);
-		sa_path_set_ifindex(route->path_rec, ndev->ifindex);
-	}
+	ndev = cma_iboe_set_path_rec_l2_fields(id_priv);
 	if (!ndev) {
 		ret = -ENODEV;
 		goto err2;
 	}
 
-	sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr);
-
 	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
 		    &route->path_rec->sgid);
 	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
 		    &route->path_rec->dgid);
 
-	/* Use the hint from IP Stack to select GID Type */
-	if (gid_type < ib_network_to_gid_type(addr->dev_addr.network))
-		gid_type = ib_network_to_gid_type(addr->dev_addr.network);
-	route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type);
-
 	if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB)
 		/* TODO: get the hoplimit from the inet/inet6 device */
 		route->path_rec->hop_limit = addr->dev_addr.hoplimit;
@@ -2607,11 +2675,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 		goto err2;
 	}
 
-	work->old_state = RDMA_CM_ROUTE_QUERY;
-	work->new_state = RDMA_CM_ROUTE_RESOLVED;
-	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
-	work->event.status = 0;
-
+	cma_init_resolve_route_work(work, id_priv);
 	queue_work(cma_wq, &work->work);
 
 	return 0;
@@ -2791,11 +2855,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv)
 	rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
 	rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
 
-	work->id = id_priv;
-	INIT_WORK(&work->work, cma_work_handler);
-	work->old_state = RDMA_CM_ADDR_QUERY;
-	work->new_state = RDMA_CM_ADDR_RESOLVED;
-	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	cma_init_resolve_addr_work(work, id_priv);
 	queue_work(cma_wq, &work->work);
 	return 0;
 err:
@@ -2821,11 +2881,7 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
 	rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *)
 		&(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr));
 
-	work->id = id_priv;
-	INIT_WORK(&work->work, cma_work_handler);
-	work->old_state = RDMA_CM_ADDR_QUERY;
-	work->new_state = RDMA_CM_ADDR_RESOLVED;
-	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	cma_init_resolve_addr_work(work, id_priv);
 	queue_work(cma_wq, &work->work);
 	return 0;
 err:
@@ -3404,9 +3460,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
 			event.status = ret;
 			break;
 		}
-		ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
-				     id_priv->id.route.path_rec,
-				     &event.param.ud.ah_attr);
+		ib_init_ah_attr_from_path(id_priv->id.device,
+					  id_priv->id.port_num,
+					  id_priv->id.route.path_rec,
+					  &event.param.ud.ah_attr);
 		event.param.ud.qp_num = rep->qpn;
 		event.param.ud.qkey = rep->qkey;
 		event.event = RDMA_CM_EVENT_ESTABLISHED;
@@ -3873,7 +3930,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
 		struct rdma_dev_addr *dev_addr =
 			&id_priv->id.route.addr.dev_addr;
 		struct net_device *ndev =
-			dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+			dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
 		enum ib_gid_type gid_type =
 			id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
 			rdma_start_port(id_priv->cma_dev->device)];
@@ -4010,8 +4067,10 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
 	} else if (addr->sa_family == AF_INET6) {
 		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
 	} else {
-		mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0;
-		mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0;
+		mgid->raw[0] =
+			(gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff;
+		mgid->raw[1] =
+			(gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e;
 		mgid->raw[2] = 0;
 		mgid->raw[3] = 0;
 		mgid->raw[4] = 0;
@@ -4061,7 +4120,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
 		mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
 
 	if (dev_addr->bound_dev_if)
-		ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+		ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
 	if (!ndev) {
 		err = -ENODEV;
 		goto out2;
@@ -4179,7 +4238,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
 					struct net_device *ndev = NULL;
 
 					if (dev_addr->bound_dev_if)
-						ndev = dev_get_by_index(&init_net,
+						ndev = dev_get_by_index(dev_addr->net,
 									dev_addr->bound_dev_if);
 					if (ndev) {
 						cma_igmp_send(ndev,
@@ -4235,7 +4294,7 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
 	if (event != NETDEV_BONDING_FAILOVER)
 		return NOTIFY_DONE;
 
-	if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
+	if (!netif_is_bond_master(ndev))
 		return NOTIFY_DONE;
 
 	mutex_lock(&lock);
@@ -4432,7 +4491,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
 					  RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
 				goto out;
 			if (ibnl_put_attr(skb, nlh,
-					  rdma_addr_size(cma_src_addr(id_priv)),
+					  rdma_addr_size(cma_dst_addr(id_priv)),
 					  cma_dst_addr(id_priv),
 					  RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
 				goto out;
@@ -4444,6 +4503,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
 			id_stats->qp_type	= id->qp_type;
 
 			i_id++;
+			nlmsg_end(skb, nlh);
 		}
 
 		cb->args[1] = 0;

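The three resolve paths above now share two small initialisation helpers whose bodies fall outside this hunk. A hedged sketch, reconstructed from the open-coded field assignments the deleted lines used to perform (the actual cma.c implementation may differ slightly):

static void cma_init_resolve_route_work(struct cma_work *work,
					struct rdma_id_private *id_priv)
{
	/* consolidates the fields the removed lines set by hand */
	work->id = id_priv;
	INIT_WORK(&work->work, cma_work_handler);
	work->old_state = RDMA_CM_ROUTE_QUERY;
	work->new_state = RDMA_CM_ROUTE_RESOLVED;
	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
}

static void cma_init_resolve_addr_work(struct cma_work *work,
				       struct rdma_id_private *id_priv)
{
	work->id = id_priv;
	INIT_WORK(&work->work, cma_work_handler);
	work->old_state = RDMA_CM_ADDR_QUERY;
	work->new_state = RDMA_CM_ADDR_RESOLVED;
	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
}
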
+ 1 - 1
drivers/infiniband/core/cma_configfs.c

@@ -295,7 +295,7 @@ static struct config_group *make_cma_dev(struct config_group *group,
 		goto fail;
 	}
 
-	strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name));
+	strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name));
 
 	config_group_init_type_name(&cma_dev_group->ports_group, "ports",
 				    &cma_ports_group_type);

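The strncpy() to strlcpy() switch matters because strncpy() does not NUL-terminate when the source fills the destination. A minimal illustration, not from the patch itself:

	char name[4];

	strncpy(name, "abcd", sizeof(name));	/* name holds "abcd", no NUL */
	strlcpy(name, "abcd", sizeof(name));	/* name holds "abc\0", always terminated */
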
+ 31 - 21
drivers/infiniband/core/core_priv.h

@@ -40,8 +40,12 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/opa_addr.h>
 #include <rdma/ib_mad.h>
+#include <rdma/restrack.h>
 #include "mad_priv.h"
 
+/* Total number of ports combined across all struct ib_devices */
+#define RDMA_MAX_PORTS 1024
+
 struct pkey_index_qp_list {
 	struct list_head    pkey_index_list;
 	u16                 pkey_index;
@@ -137,7 +141,6 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
 int roce_gid_mgmt_init(void);
 void roce_gid_mgmt_cleanup(void);
 
-int roce_rescan_device(struct ib_device *ib_dev);
 unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port);
 
 int ib_cache_setup_one(struct ib_device *device);
@@ -191,13 +194,6 @@ void ib_sa_cleanup(void);
 int rdma_nl_init(void);
 void rdma_nl_exit(void);
 
-/**
- * Check if there are any listeners to the netlink group
- * @group: the netlink group ID
- * Returns 0 on success or a negative for no listeners.
- */
-int ibnl_chk_listeners(unsigned int group);
-
 int ib_nl_handle_resolve_resp(struct sk_buff *skb,
 			      struct nlmsghdr *nlh,
 			      struct netlink_ext_ack *extack);
@@ -213,11 +209,6 @@ int ib_get_cached_subnet_prefix(struct ib_device *device,
 				u64              *sn_pfx);
 
 #ifdef CONFIG_SECURITY_INFINIBAND
-int ib_security_pkey_access(struct ib_device *dev,
-			    u8 port_num,
-			    u16 pkey_index,
-			    void *sec);
-
 void ib_security_destroy_port_pkey_list(struct ib_device *device);
 
 void ib_security_cache_change(struct ib_device *device,
@@ -240,14 +231,6 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
 void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent);
 int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index);
 #else
-static inline int ib_security_pkey_access(struct ib_device *dev,
-					  u8 port_num,
-					  u16 pkey_index,
-					  void *sec)
-{
-	return 0;
-}
-
 static inline void ib_security_destroy_port_pkey_list(struct ib_device *device)
 {
 }
@@ -318,4 +301,31 @@ struct ib_device *ib_device_get_by_index(u32 ifindex);
 /* RDMA device netlink */
 void nldev_init(void);
 void nldev_exit(void);
+
+static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
+					  struct ib_pd *pd,
+					  struct ib_qp_init_attr *attr,
+					  struct ib_udata *udata)
+{
+	struct ib_qp *qp;
+
+	qp = dev->create_qp(pd, attr, udata);
+	if (IS_ERR(qp))
+		return qp;
+
+	qp->device = dev;
+	qp->pd = pd;
+	/*
+	 * We don't track XRC QPs for now, because they don't have a PD
+	 * and, more importantly, they are created internally by the
+	 * driver; see mlx5 create_dev_resources() as an example.
+	 */
+	if (attr->qp_type < IB_QPT_XRC_INI) {
+		qp->res.type = RDMA_RESTRACK_QP;
+		rdma_restrack_add(&qp->res);
+	} else
+		qp->res.valid = false;
+
+	return qp;
+}
 #endif /* _CORE_PRIV_H */

+ 25 - 14
drivers/infiniband/core/cq.c

@@ -25,9 +25,10 @@
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
-static int __ib_process_cq(struct ib_cq *cq, int budget)
+static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *poll_wc)
 {
 	int i, n, completed = 0;
+	struct ib_wc *wcs = poll_wc ? : cq->wc;
 
 	/*
 	 * budget might be (-1) if the caller does not
@@ -35,9 +36,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget)
 	 * minimum here.
 	 */
 	while ((n = ib_poll_cq(cq, min_t(u32, IB_POLL_BATCH,
-			budget - completed), cq->wc)) > 0) {
+			budget - completed), wcs)) > 0) {
 		for (i = 0; i < n; i++) {
-			struct ib_wc *wc = &cq->wc[i];
+			struct ib_wc *wc = &wcs[i];
 
 			if (wc->wr_cqe)
 				wc->wr_cqe->done(cq, wc);
@@ -60,18 +61,20 @@ static int __ib_process_cq(struct ib_cq *cq, int budget)
  * @cq:		CQ to process
  * @budget:	number of CQEs to poll for
  *
- * This function is used to process all outstanding CQ entries on a
- * %IB_POLL_DIRECT CQ.  It does not offload CQ processing to a different
- * context and does not ask for completion interrupts from the HCA.
+ * This function is used to process all outstanding CQ entries.
+ * It does not offload CQ processing to a different context and does
+ * not ask for completion interrupts from the HCA.
+ * Using direct processing on a CQ with a non-IB_POLL_DIRECT type may
+ * trigger concurrent processing.
 *
 * Note: do not pass -1 as %budget unless it is guaranteed that the number
 * of completions that will be processed is small.
 */
 int ib_process_cq_direct(struct ib_cq *cq, int budget)
 {
-	WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
+	struct ib_wc wcs[IB_POLL_BATCH];
 
-	return __ib_process_cq(cq, budget);
+	return __ib_process_cq(cq, budget, wcs);
 }
 EXPORT_SYMBOL(ib_process_cq_direct);
 
@@ -85,7 +88,7 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
 	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
 	int completed;
 
-	completed = __ib_process_cq(cq, budget);
+	completed = __ib_process_cq(cq, budget, NULL);
 	if (completed < budget) {
 		irq_poll_complete(&cq->iop);
 		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
@@ -105,7 +108,7 @@ static void ib_cq_poll_work(struct work_struct *work)
 	struct ib_cq *cq = container_of(work, struct ib_cq, work);
 	int completed;
 
-	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
+	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, NULL);
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
 		queue_work(ib_comp_wq, &cq->work);
@@ -117,20 +120,22 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 }
 
 /**
- * ib_alloc_cq - allocate a completion queue
+ * __ib_alloc_cq - allocate a completion queue
  * @dev:		device to allocate the CQ for
  * @private:		driver private data, accessible from cq->cq_context
  * @nr_cqe:		number of CQEs to allocate
  * @comp_vector:	HCA completion vectors for this CQ
  * @poll_ctx:		context to poll the CQ from.
+ * @caller:		module owner name.
 *
 * This is the proper interface to allocate a CQ for in-kernel users. A
 * CQ allocated with this interface will automatically be polled from the
 * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
 * to use this CQ abstraction.
 */
-struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
-		int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx)
+struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
+			    int nr_cqe, int comp_vector,
+			    enum ib_poll_context poll_ctx, const char *caller)
 {
 	struct ib_cq_init_attr cq_attr = {
 		.cqe		= nr_cqe,
@@ -154,6 +159,10 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
 	if (!cq->wc)
 		goto out_destroy_cq;
 
+	cq->res.type = RDMA_RESTRACK_CQ;
+	cq->res.kern_name = caller;
+	rdma_restrack_add(&cq->res);
+
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
 		cq->comp_handler = ib_cq_completion_direct;
@@ -178,11 +187,12 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
 
 out_free_wc:
 	kfree(cq->wc);
+	rdma_restrack_del(&cq->res);
out_destroy_cq:
 	cq->device->destroy_cq(cq);
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL(ib_alloc_cq);
+EXPORT_SYMBOL(__ib_alloc_cq);
 
 /**
  * ib_free_cq - free a completion queue
@@ -209,6 +219,7 @@ void ib_free_cq(struct ib_cq *cq)
 	}
 
 	kfree(cq->wc);
+	rdma_restrack_del(&cq->res);
 	ret = cq->device->destroy_cq(cq);
 	WARN_ON_ONCE(ret);
 }

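With the on-stack work-completion array, ib_process_cq_direct() no longer shares cq->wc with the interrupt or workqueue paths; as the updated comment notes, direct processing on a non-IB_POLL_DIRECT CQ may still run concurrently with those paths. A hedged usage sketch, where ulp_drain_cq() is a hypothetical caller and not part of the patch:

static void ulp_drain_cq(struct ib_cq *cq)
{
	/* poll up to 16 completions per call; repeat until none remain.
	 * A budget of -1 is allowed only when the number of outstanding
	 * completions is known to be small. */
	while (ib_process_cq_direct(cq, 16) > 0)
		;
}
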
+ 22 - 20
drivers/infiniband/core/device.c

@@ -263,6 +263,8 @@ struct ib_device *ib_alloc_device(size_t size)
 	if (!device)
 		return NULL;
 
+	rdma_restrack_init(&device->res);
+
 	device->dev.class = &ib_class;
 	device_initialize(&device->dev);
 
@@ -288,7 +290,7 @@ void ib_dealloc_device(struct ib_device *device)
 {
 	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
 		device->reg_state != IB_DEV_UNINITIALIZED);
-	kobject_put(&device->dev.kobj);
+	put_device(&device->dev);
 }
 EXPORT_SYMBOL(ib_dealloc_device);
 
@@ -462,7 +464,6 @@ int ib_register_device(struct ib_device *device,
 	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
 	struct device *parent = device->dev.parent;
 
-	WARN_ON_ONCE(!parent);
 	WARN_ON_ONCE(device->dma_device);
 	if (device->dev.dma_ops) {
 		/*
@@ -471,16 +472,25 @@ int ib_register_device(struct ib_device *device,
 		 * into device->dev.
 		 */
 		device->dma_device = &device->dev;
-		if (!device->dev.dma_mask)
-			device->dev.dma_mask = parent->dma_mask;
-		if (!device->dev.coherent_dma_mask)
-			device->dev.coherent_dma_mask =
-				parent->coherent_dma_mask;
+		if (!device->dev.dma_mask) {
+			if (parent)
+				device->dev.dma_mask = parent->dma_mask;
+			else
+				WARN_ON_ONCE(true);
+		}
+		if (!device->dev.coherent_dma_mask) {
+			if (parent)
+				device->dev.coherent_dma_mask =
+					parent->coherent_dma_mask;
+			else
+				WARN_ON_ONCE(true);
+		}
 	} else {
 		/*
 		 * The caller did not provide custom DMA operations. Use the
 		 * DMA mapping operations of the parent device.
 		 */
+		WARN_ON_ONCE(!parent);
 		device->dma_device = parent;
 	}
 
@@ -588,6 +598,8 @@ void ib_unregister_device(struct ib_device *device)
 	}
 	up_read(&lists_rwsem);
 
+	rdma_restrack_clean(&device->res);
+
 	ib_device_unregister_rdmacg(device);
 	ib_device_unregister_sysfs(device);
 
@@ -1033,32 +1045,22 @@ EXPORT_SYMBOL(ib_modify_port);
 
 /**
  * ib_find_gid - Returns the port number and GID table index where
- *   a specified GID value occurs.
+ *   a specified GID value occurs. It searches only the IB link layer.
 * @device: The device to query.
 * @gid: The GID value to search for.
- * @gid_type: Type of GID.
 * @ndev: The ndev related to the GID to search for.
 * @port_num: The port number of the device where the GID value was found.
 * @index: The index into the GID table where the GID was found.  This
 *   parameter may be NULL.
 */
 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
-		enum ib_gid_type gid_type, struct net_device *ndev,
-		u8 *port_num, u16 *index)
+		struct net_device *ndev, u8 *port_num, u16 *index)
 {
 	union ib_gid tmp_gid;
 	int ret, port, i;
 
 	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
-		if (rdma_cap_roce_gid_table(device, port)) {
-			if (!ib_find_cached_gid_by_port(device, gid, gid_type, port,
-							ndev, index)) {
-				*port_num = port;
-				return 0;
-			}
-		}
-
-		if (gid_type != IB_GID_TYPE_IB)
+		if (rdma_cap_roce_gid_table(device, port))
 			continue;
 
 		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {

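Callers of ib_find_gid() drop the gid_type argument, since the function now walks only IB link-layer ports (the deleted branch had routed RoCE lookups through the cached-GID search). A hedged sketch of an updated call site, with hypothetical variable names:

	u8 port_num;
	u16 index;
	int ret;

	/* new signature: no gid_type; RoCE-capable ports are skipped internally */
	ret = ib_find_gid(device, &gid, ndev, &port_num, &index);
	if (!ret)
		pr_debug("GID found on port %u, table index %u\n",
			 port_num, index);
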
+ 5 - 7
drivers/infiniband/core/fmr_pool.c

@@ -388,13 +388,11 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
 EXPORT_SYMBOL(ib_flush_fmr_pool);
 
 /**
- * ib_fmr_pool_map_phys -
- * @pool:FMR pool to allocate FMR from
- * @page_list:List of pages to map
- * @list_len:Number of pages in @page_list
- * @io_virtual_address:I/O virtual address for new FMR
- *
- * Map an FMR from an FMR pool.
+ * ib_fmr_pool_map_phys - Map an FMR from an FMR pool.
+ * @pool_handle: FMR pool to allocate FMR from
+ * @page_list: List of pages to map
+ * @list_len: Number of pages in @page_list
+ * @io_virtual_address: I/O virtual address for new FMR
 */
 struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
					 u64                *page_list,

+ 1 - 0
drivers/infiniband/core/iwpm_util.c

@@ -654,6 +654,7 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
 	}
 	skb_num++;
 	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	ret = -EINVAL;
 	for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
 		hlist_for_each_entry(map_info, &iwpm_hash_bucket[i],
 				     hlist_node) {

+ 0 - 1
drivers/infiniband/core/mad.c

@@ -49,7 +49,6 @@
 #include "smi.h"
 #include "smi.h"
 #include "opa_smi.h"
 #include "opa_smi.h"
 #include "agent.h"
 #include "agent.h"
-#include "core_priv.h"
 
 
 static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
 static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
 static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
 static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;

+ 3 - 7
drivers/infiniband/core/netlink.c

@@ -41,8 +41,6 @@
 #include <linux/module.h>
 #include "core_priv.h"
 
-#include "core_priv.h"
-
 static DEFINE_MUTEX(rdma_nl_mutex);
 static struct sock *nls;
 static struct {
@@ -83,15 +81,13 @@ static bool is_nl_valid(unsigned int type, unsigned int op)
 	if (!is_nl_msg_valid(type, op))
 		return false;
 
-	cb_table = rdma_nl_types[type].cb_table;
-#ifdef CONFIG_MODULES
-	if (!cb_table) {
+	if (!rdma_nl_types[type].cb_table) {
 		mutex_unlock(&rdma_nl_mutex);
 		request_module("rdma-netlink-subsys-%d", type);
 		mutex_lock(&rdma_nl_mutex);
-		cb_table = rdma_nl_types[type].cb_table;
 	}
-#endif
+
+	cb_table = rdma_nl_types[type].cb_table;
 
 	if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit))
 		return false;

+ 388 - 6
drivers/infiniband/core/nldev.c

@@ -31,6 +31,8 @@
  */
 
 #include <linux/module.h>
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
 #include <net/netlink.h>
 #include <rdma/rdma_netlink.h>
 
@@ -52,16 +54,42 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_PORT_STATE]	= { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 },
 	[RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_SUMMARY]	= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY]	= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING,
+					     .len = 16 },
+	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_RES_QP]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_QP_ENTRY]		= { .type = NLA_NESTED },
+	[RDMA_NLDEV_ATTR_RES_LQPN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_RQPN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_RQ_PSN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_SQ_PSN]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_TYPE]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_STATE]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_PID]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_RES_KERN_NAME]		= { .type = NLA_NUL_STRING,
+						    .len = TASK_COMM_LEN },
 };
 
-static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
+static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device)
 {
-	char fw[IB_FW_VERSION_NAME_MAX];
-
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
 		return -EMSGSIZE;
 	if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name))
 		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
+{
+	char fw[IB_FW_VERSION_NAME_MAX];
+
+	if (fill_nldev_handle(msg, device))
+		return -EMSGSIZE;
+
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device)))
 		return -EMSGSIZE;
 
@@ -92,10 +120,9 @@ static int fill_port_info(struct sk_buff *msg,
 	struct ib_port_attr attr;
 	int ret;
 
-	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
-		return -EMSGSIZE;
-	if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name))
+	if (fill_nldev_handle(msg, device))
 		return -EMSGSIZE;
+
 	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port))
 		return -EMSGSIZE;
 
@@ -126,6 +153,137 @@ static int fill_port_info(struct sk_buff *msg,
 	return 0;
 }
 
+			       const char *name, u64 curr)
+{
+	struct nlattr *entry_attr;
+
+	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY);
+	if (!entry_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name))
+		goto err;
+	if (nla_put_u64_64bit(msg,
+			      RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, 0))
+		goto err;
+
+	nla_nest_end(msg, entry_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, entry_attr);
+	return -EMSGSIZE;
+}
+
+static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
+{
+	static const char * const names[RDMA_RESTRACK_MAX] = {
+		[RDMA_RESTRACK_PD] = "pd",
+		[RDMA_RESTRACK_CQ] = "cq",
+		[RDMA_RESTRACK_QP] = "qp",
+	};
+
+	struct rdma_restrack_root *res = &device->res;
+	struct nlattr *table_attr;
+	int ret, i, curr;
+
+	if (fill_nldev_handle(msg, device))
+		return -EMSGSIZE;
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY);
+	if (!table_attr)
+		return -EMSGSIZE;
+
+	for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
+		if (!names[i])
+			continue;
+		curr = rdma_restrack_count(res, i, task_active_pid_ns(current));
+		ret = fill_res_info_entry(msg, names[i], curr);
+		if (ret)
+			goto err;
+	}
+
+	nla_nest_end(msg, table_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, table_attr);
+	return ret;
+}
+
+static int fill_res_qp_entry(struct sk_buff *msg,
+			     struct ib_qp *qp, uint32_t port)
+{
+	struct rdma_restrack_entry *res = &qp->res;
+	struct ib_qp_init_attr qp_init_attr;
+	struct nlattr *entry_attr;
+	struct ib_qp_attr qp_attr;
+	int ret;
+
+	ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr);
+	if (ret)
+		return ret;
+
+	if (port && port != qp_attr.port_num)
+		return 0;
+
+	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+	if (!entry_attr)
+		goto out;
+
+	/* In create_qp() port is not set yet */
+	if (qp_attr.port_num &&
+	    nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num))
+		goto err;
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num))
+		goto err;
+	if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) {
+		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN,
+				qp_attr.dest_qp_num))
+			goto err;
+		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN,
+				qp_attr.rq_psn))
+			goto err;
+	}
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn))
+		goto err;
+
+	if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC ||
+	    qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) {
+		if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE,
+			       qp_attr.path_mig_state))
+			goto err;
+	}
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type))
+		goto err;
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
+		goto err;
+
+	/*
+	 * Existence of a task means that this is a user QP; the netlink
+	 * user is invited to read /proc/PID/comm to get the task name,
+	 * and res->kern_name is NULL in that case.
+	 */
+	if (rdma_is_kernel_res(res)) {
+		if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, res->kern_name))
+			goto err;
+	} else {
+		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, task_pid_vnr(res->task)))
+			goto err;
+	}
+
+	nla_nest_end(msg, entry_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, entry_attr);
+out:
+	return -EMSGSIZE;
+}
+
 static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
			  struct netlink_ext_ack *extack)
 {
@@ -321,6 +479,213 @@ out:
 	return skb->len;
 	return skb->len;
 }
 }
 
 
+static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			      struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	struct sk_buff *msg;
+	u32 index;
+	int ret;
+
+	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		goto err;
+
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
+			0, 0);
+
+	ret = fill_res_info(msg, device);
+	if (ret)
+		goto err_free;
+
+	nlmsg_end(msg, nlh);
+	put_device(&device->dev);
+	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_free:
+	nlmsg_free(msg);
+err:
+	put_device(&device->dev);
+	return ret;
+}
+
+static int _nldev_res_get_dumpit(struct ib_device *device,
+				 struct sk_buff *skb,
+				 struct netlink_callback *cb,
+				 unsigned int idx)
+{
+	int start = cb->args[0];
+	struct nlmsghdr *nlh;
+
+	if (idx < start)
+		return 0;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
+			0, NLM_F_MULTI);
+
+	if (fill_res_info(skb, device)) {
+		nlmsg_cancel(skb, nlh);
+		goto out;
+	}
+
+	nlmsg_end(skb, nlh);
+
+	idx++;
+
+out:
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+static int nldev_res_get_dumpit(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb);
+}
+
+static int nldev_res_get_qp_dumpit(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct rdma_restrack_entry *res;
+	int err, ret = 0, idx = 0;
+	struct nlattr *table_attr;
+	struct ib_device *device;
+	int start = cb->args[0];
+	struct ib_qp *qp = NULL;
+	struct nlmsghdr *nlh;
+	u32 index, port = 0;
+
+	err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, NULL);
+	/*
+	 * Right now, we are expecting the device index to get QP information,
+	 * but it is possible to extend this code to return all devices in
+	 * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX.
+	 * if it doesn't exist, we will iterate over all devices.
+	 *
+	 * But it is not needed for now.
+	 */
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	/*
+	 * If no PORT_INDEX is supplied, we will return all QPs from that device
+	 */
+	if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+		port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+		if (!rdma_is_port_valid(device, port)) {
+			ret = -EINVAL;
+			goto err_index;
+		}
+	}
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_QP_GET),
+			0, NLM_F_MULTI);
+
+	if (fill_nldev_handle(skb, device)) {
+		ret = -EMSGSIZE;
+		goto err;
+	}
+
+	table_attr = nla_nest_start(skb, RDMA_NLDEV_ATTR_RES_QP);
+	if (!table_attr) {
+		ret = -EMSGSIZE;
+		goto err;
+	}
+
+	down_read(&device->res.rwsem);
+	hash_for_each_possible(device->res.hash, res, node, RDMA_RESTRACK_QP) {
+		if (idx < start)
+			goto next;
+
+		if ((rdma_is_kernel_res(res) &&
+		     task_active_pid_ns(current) != &init_pid_ns) ||
+		    (!rdma_is_kernel_res(res) &&
+		     task_active_pid_ns(current) != task_active_pid_ns(res->task)))
+			/*
+			 * 1. Kernel QPs should be visible in init namespace only
+			 * 2. Present only QPs visible in the current namespace
+			 */
+			goto next;
+
+		if (!rdma_restrack_get(res))
+			/*
+			 * The resource is being released now, but we are
+			 * not releasing the lock, so it will be released on
+			 * our next pass, once we get the ->next pointer.
+			 */
+			goto next;
+
+		qp = container_of(res, struct ib_qp, res);
+
+		up_read(&device->res.rwsem);
+		ret = fill_res_qp_entry(skb, qp, port);
+		down_read(&device->res.rwsem);
+		/*
+		 * Return resource back, but it won't be released till
+		 * the &device->res.rwsem will be released for write.
+		 */
+		rdma_restrack_put(res);
+
+		if (ret == -EMSGSIZE)
+			/*
+			 * There is a chance to optimize here.
+			 * It can be done by using list_prepare_entry
+			 * and list_for_each_entry_continue afterwards.
+			 */
+			break;
+		if (ret)
+			goto res_err;
+next:		idx++;
+	}
+	up_read(&device->res.rwsem);
+
+	nla_nest_end(skb, table_attr);
+	nlmsg_end(skb, nlh);
+	cb->args[0] = idx;
+
+	/*
+	 * No more QPs to fill, cancel the message and
+	 * return 0 to mark end of dumpit.
+	 */
+	if (!qp)
+		goto err;
+
+	put_device(&device->dev);
+	return skb->len;
+
+res_err:
+	nla_nest_cancel(skb, table_attr);
+	up_read(&device->res.rwsem);
+
+err:
+	nlmsg_cancel(skb, nlh);
+
+err_index:
+	put_device(&device->dev);
+	return ret;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 	[RDMA_NLDEV_CMD_GET] = {
 		.doit = nldev_get_doit,
@@ -330,6 +695,23 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_port_get_doit,
 		.dump = nldev_port_get_dumpit,
 	},
+	[RDMA_NLDEV_CMD_RES_GET] = {
+		.doit = nldev_res_get_doit,
+		.dump = nldev_res_get_dumpit,
+	},
+	[RDMA_NLDEV_CMD_RES_QP_GET] = {
+		.dump = nldev_res_get_qp_dumpit,
+		/*
+		 * .doit is not implemented yet for two reasons:
+		 * 1. It is not needed yet.
+		 * 2. There is a need to provide an identifier; while that is
+		 * easy for QPs (device index + port index + LQPN), it is not
+		 * the case for the rest of the resources (PD and CQ). Since
+		 * it is better to provide a similar interface for all
+		 * resources, let's wait until the other resources are
+		 * implemented too.
+		 */
+	},
 };
 
 void __init nldev_init(void)

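As the comment in nldev_cb_table notes, per-resource .doit handlers are deferred until PD and CQ gain stable identifiers. A purely hypothetical sketch of how a later CQ dump command could slot into the same table; the names below are illustrative and not part of this patch:

	[RDMA_NLDEV_CMD_RES_CQ_GET] = {		/* hypothetical future entry */
		.dump = nldev_res_get_cq_dumpit,
	},
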
+ 164 - 0
drivers/infiniband/core/restrack.c

@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */
+
+#include <rdma/ib_verbs.h>
+#include <rdma/restrack.h>
+#include <linux/mutex.h>
+#include <linux/sched/task.h>
+#include <linux/uaccess.h>
+#include <linux/pid_namespace.h>
+
+void rdma_restrack_init(struct rdma_restrack_root *res)
+{
+	init_rwsem(&res->rwsem);
+}
+
+void rdma_restrack_clean(struct rdma_restrack_root *res)
+{
+	WARN_ON_ONCE(!hash_empty(res->hash));
+}
+
+int rdma_restrack_count(struct rdma_restrack_root *res,
+			enum rdma_restrack_type type,
+			struct pid_namespace *ns)
+{
+	struct rdma_restrack_entry *e;
+	u32 cnt = 0;
+
+	down_read(&res->rwsem);
+	hash_for_each_possible(res->hash, e, node, type) {
+		if (ns == &init_pid_ns ||
+		    (!rdma_is_kernel_res(e) &&
+		     ns == task_active_pid_ns(e->task)))
+			cnt++;
+	}
+	up_read(&res->rwsem);
+	return cnt;
+}
+EXPORT_SYMBOL(rdma_restrack_count);
+
+static void set_kern_name(struct rdma_restrack_entry *res)
+{
+	enum rdma_restrack_type type = res->type;
+	struct ib_qp *qp;
+
+	if (type != RDMA_RESTRACK_QP)
+		/* PD and CQ types already have this name embedded in them */
+		return;
+
+	qp = container_of(res, struct ib_qp, res);
+	if (!qp->pd) {
+		WARN_ONCE(true, "XRC QPs are not supported\n");
+		/* Survive, despite the programmer's error */
+		res->kern_name = " ";
+		return;
+	}
+
+	res->kern_name = qp->pd->res.kern_name;
+}
+
+static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
+{
+	enum rdma_restrack_type type = res->type;
+	struct ib_device *dev;
+	struct ib_xrcd *xrcd;
+	struct ib_pd *pd;
+	struct ib_cq *cq;
+	struct ib_qp *qp;
+
+	switch (type) {
+	case RDMA_RESTRACK_PD:
+		pd = container_of(res, struct ib_pd, res);
+		dev = pd->device;
+		break;
+	case RDMA_RESTRACK_CQ:
+		cq = container_of(res, struct ib_cq, res);
+		dev = cq->device;
+		break;
+	case RDMA_RESTRACK_QP:
+		qp = container_of(res, struct ib_qp, res);
+		dev = qp->device;
+		break;
+	case RDMA_RESTRACK_XRCD:
+		xrcd = container_of(res, struct ib_xrcd, res);
+		dev = xrcd->device;
+		break;
+	default:
+		WARN_ONCE(true, "Wrong resource tracking type %u\n", type);
+		return NULL;
+	}
+
+	return dev;
+}
+
+void rdma_restrack_add(struct rdma_restrack_entry *res)
+{
+	struct ib_device *dev = res_to_dev(res);
+
+	if (!dev)
+		return;
+
+	if (!uaccess_kernel()) {
+		get_task_struct(current);
+		res->task = current;
+		res->kern_name = NULL;
+	} else {
+		set_kern_name(res);
+		res->task = NULL;
+	}
+
+	kref_init(&res->kref);
+	init_completion(&res->comp);
+	res->valid = true;
+
+	down_write(&dev->res.rwsem);
+	hash_add(dev->res.hash, &res->node, res->type);
+	up_write(&dev->res.rwsem);
+}
+EXPORT_SYMBOL(rdma_restrack_add);
+
+int __must_check rdma_restrack_get(struct rdma_restrack_entry *res)
+{
+	return kref_get_unless_zero(&res->kref);
+}
+EXPORT_SYMBOL(rdma_restrack_get);
+
+static void restrack_release(struct kref *kref)
+{
+	struct rdma_restrack_entry *res;
+
+	res = container_of(kref, struct rdma_restrack_entry, kref);
+	complete(&res->comp);
+}
+
+int rdma_restrack_put(struct rdma_restrack_entry *res)
+{
+	return kref_put(&res->kref, restrack_release);
+}
+EXPORT_SYMBOL(rdma_restrack_put);
+
+void rdma_restrack_del(struct rdma_restrack_entry *res)
+{
+	struct ib_device *dev;
+
+	if (!res->valid)
+		return;
+
+	dev = res_to_dev(res);
+	if (!dev)
+		return;
+
+	rdma_restrack_put(res);
+
+	wait_for_completion(&res->comp);
+
+	down_write(&dev->res.rwsem);
+	hash_del(&res->node);
+	res->valid = false;
+	if (res->task)
+		put_task_struct(res->task);
+	up_write(&dev->res.rwsem);
+}
+EXPORT_SYMBOL(rdma_restrack_del);

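Taken together, the new API defines a simple lifecycle: register on create, take temporary references while dumping, and block in delete until readers are done. A hedged sketch of the flow for a kernel-created CQ, mirroring the cq.c hunks above (illustrative, not a literal excerpt):

	/* creation: register the resource for nldev visibility */
	cq->res.type = RDMA_RESTRACK_CQ;
	cq->res.kern_name = caller;		/* e.g. the ULP module name */
	rdma_restrack_add(&cq->res);

	/* a netlink dump takes a temporary reference while filling */
	if (rdma_restrack_get(&cq->res)) {
		/* ... fill the netlink message ... */
		rdma_restrack_put(&cq->res);
	}

	/* destruction: waits until all temporary references are dropped */
	rdma_restrack_del(&cq->res);
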
+ 8 - 5
drivers/infiniband/core/roce_gid_mgmt.c

@@ -410,15 +410,18 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
 	rtnl_unlock();
 }
 
-/* This function will rescan all of the network devices in the system
- * and add their gids, as needed, to the relevant RoCE devices. */
-int roce_rescan_device(struct ib_device *ib_dev)
+/**
+ * rdma_roce_rescan_device - Rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices.
+ *
+ * @device:         the rdma device
+ */
+void rdma_roce_rescan_device(struct ib_device *ib_dev)
 {
 	ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
 			    enum_all_gids_of_dev_cb, NULL);
-
-	return 0;
 }
+EXPORT_SYMBOL(rdma_roce_rescan_device);
 
 static void callback_for_addr_gid_device_scan(struct ib_device *device,
 					      u8 port,

+ 14 - 4
drivers/infiniband/core/sa_query.c

@@ -1227,9 +1227,9 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
 	return src_path_mask;
 }
 
-int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
-			 struct sa_path_rec *rec,
-			 struct rdma_ah_attr *ah_attr)
+int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num,
+			      struct sa_path_rec *rec,
+			      struct rdma_ah_attr *ah_attr)
 {
 	int ret;
 	u16 gid_index;
@@ -1341,10 +1341,11 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 
 	return 0;
 }
-EXPORT_SYMBOL(ib_init_ah_from_path);
+EXPORT_SYMBOL(ib_init_ah_attr_from_path);
 
 static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
 {
+	struct rdma_ah_attr ah_attr;
 	unsigned long flags;
 
 	spin_lock_irqsave(&query->port->ah_lock, flags);
@@ -1356,6 +1357,15 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
 	query->sm_ah = query->port->sm_ah;
 	spin_unlock_irqrestore(&query->port->ah_lock, flags);
 
+	/*
+	 * Always check that sm_ah has a valid dlid assigned
+	 * before querying for class port info.
+	 */
+	if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) ||
+	    !rdma_is_valid_unicast_lid(&ah_attr)) {
+		kref_put(&query->sm_ah->ref, free_sm_ah);
+		return -EAGAIN;
+	}
 	query->mad_buf = ib_create_send_mad(query->port->agent, 1,
 					    query->sm_ah->pkey_index,
 					    0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,

+ 4 - 6
drivers/infiniband/core/security.c

@@ -653,12 +653,11 @@ int ib_security_modify_qp(struct ib_qp *qp,
 	}
 	return ret;
 }
-EXPORT_SYMBOL(ib_security_modify_qp);
 
-int ib_security_pkey_access(struct ib_device *dev,
-			    u8 port_num,
-			    u16 pkey_index,
-			    void *sec)
+static int ib_security_pkey_access(struct ib_device *dev,
+				   u8 port_num,
+				   u16 pkey_index,
+				   void *sec)
 {
 	u64 subnet_prefix;
 	u16 pkey;
@@ -678,7 +677,6 @@ int ib_security_pkey_access(struct ib_device *dev,
 
 	return security_ib_pkey_access(sec, subnet_prefix, pkey);
 }
-EXPORT_SYMBOL(ib_security_pkey_access);
 
 static int ib_mad_agent_security_change(struct notifier_block *nb,
 					unsigned long event,

+ 0 - 1
drivers/infiniband/core/sysfs.c

@@ -1276,7 +1276,6 @@ int ib_device_register_sysfs(struct ib_device *device,
 	int ret;
 	int i;
 
-	WARN_ON_ONCE(!device->dev.parent);
 	ret = dev_set_name(class_dev, "%s", device->name);
 	if (ret)
 		return ret;

+ 28 - 45
drivers/infiniband/core/ucm.c

@@ -53,6 +53,8 @@
 #include <rdma/ib_user_cm.h>
 #include <rdma/ib_marshall.h>
 
+#include "core_priv.h"
+
 MODULE_AUTHOR("Libor Michalek");
 MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
 MODULE_LICENSE("Dual BSD/GPL");
@@ -104,10 +106,13 @@ struct ib_ucm_event {
 enum {
 	IB_UCM_MAJOR = 231,
 	IB_UCM_BASE_MINOR = 224,
-	IB_UCM_MAX_DEVICES = 32
+	IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS,
+	IB_UCM_NUM_FIXED_MINOR = 32,
+	IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR,
 };
 
 #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
+static dev_t dynamic_ucm_dev;
 
 static void ib_ucm_add_one(struct ib_device *device);
 static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
@@ -1199,7 +1204,6 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
 static void ib_ucm_release_dev(struct device *dev)
 {
 	struct ib_ucm_device *ucm_dev;
@@ -1210,10 +1214,7 @@ static void ib_ucm_release_dev(struct device *dev)
 
 static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev)
 {
-	if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
-		clear_bit(ucm_dev->devnum, dev_map);
-	else
-		clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map);
+	clear_bit(ucm_dev->devnum, dev_map);
 }
 
 static const struct file_operations ucm_fops = {
@@ -1235,27 +1236,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
 
-static dev_t overflow_maj;
-static int find_overflow_devnum(void)
-{
-	int ret;
-
-	if (!overflow_maj) {
-		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES,
-					  "infiniband_cm");
-		if (ret) {
-			pr_err("ucm: couldn't register dynamic device number\n");
-			return ret;
-		}
-	}
-
-	ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES);
-	if (ret >= IB_UCM_MAX_DEVICES)
-		return -1;
-
-	return ret;
-}
-
 static void ib_ucm_add_one(struct ib_device *device)
 {
 	int devnum;
@@ -1274,19 +1254,14 @@ static void ib_ucm_add_one(struct ib_device *device)
 	ucm_dev->dev.release = ib_ucm_release_dev;
 
 	devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
-	if (devnum >= IB_UCM_MAX_DEVICES) {
-		devnum = find_overflow_devnum();
-		if (devnum < 0)
-			goto err;
-
-		ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES;
-		base = devnum + overflow_maj;
-		set_bit(devnum, overflow_map);
-	} else {
-		ucm_dev->devnum = devnum;
-		base = devnum + IB_UCM_BASE_DEV;
-		set_bit(devnum, dev_map);
-	}
+	if (devnum >= IB_UCM_MAX_DEVICES)
+		goto err;
+	ucm_dev->devnum = devnum;
+	set_bit(devnum, dev_map);
+	if (devnum >= IB_UCM_NUM_FIXED_MINOR)
+		base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR;
+	else
+		base = IB_UCM_BASE_DEV + devnum;
 
 	cdev_init(&ucm_dev->cdev, &ucm_fops);
 	ucm_dev->cdev.owner = THIS_MODULE;
@@ -1334,13 +1309,20 @@ static int __init ib_ucm_init(void)
 {
 	int ret;
 
-	ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES,
+	ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR,
				     "infiniband_cm");
 	if (ret) {
 		pr_err("ucm: couldn't register device number\n");
 		goto error1;
 	}
 
+	ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR,
+				  "infiniband_cm");
+	if (ret) {
+		pr_err("ucm: couldn't register dynamic device number\n");
+		goto err_alloc;
+	}
+
 	ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
 	if (ret) {
 		pr_err("ucm: couldn't create abi_version attribute\n");
@@ -1357,7 +1339,9 @@ static int __init ib_ucm_init(void)
 error3:
 	class_remove_file(&cm_class, &class_attr_abi_version.attr);
 error2:
-	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+	unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
+err_alloc:
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
error1:
 	return ret;
 }
@@ -1366,9 +1350,8 @@ static void __exit ib_ucm_cleanup(void)
 {
 	ib_unregister_client(&ucm_client);
 	class_remove_file(&cm_class, &class_attr_abi_version.attr);
-	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
-	if (overflow_maj)
-		unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES);
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
+	unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
 	idr_destroy(&ctx_id_table);
 }
 

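The fixed/dynamic split keeps the 32 historical minors stable for existing udev setups while letting additional devices draw from a region allocated at module init (which may live under a different major). A hedged restatement of the mapping; ucm_devnum_to_dev() is illustrative and not in the patch:

static dev_t ucm_devnum_to_dev(int devnum)
{
	if (devnum < IB_UCM_NUM_FIXED_MINOR)
		return IB_UCM_BASE_DEV + devnum;	/* legacy fixed minors */

	/* devnums 32 and up map into the dynamically allocated region */
	return dynamic_ucm_dev + (devnum - IB_UCM_NUM_FIXED_MINOR);
}
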
+ 10 - 9
drivers/infiniband/core/ucma.c

@@ -904,13 +904,14 @@ static ssize_t ucma_query_path(struct ucma_context *ctx,
 
 		resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY |
 					   IB_PATH_BIDIRECTIONAL;
-		if (rec->rec_type == SA_PATH_REC_TYPE_IB) {
-			ib_sa_pack_path(rec, &resp->path_data[i].path_rec);
-		} else {
+		if (rec->rec_type == SA_PATH_REC_TYPE_OPA) {
 			struct sa_path_rec ib;
 
 			sa_convert_path_opa_to_ib(&ib, rec);
 			ib_sa_pack_path(&ib, &resp->path_data[i].path_rec);
+
+		} else {
+			ib_sa_pack_path(rec, &resp->path_data[i].path_rec);
 		}
 	}
 
@@ -943,8 +944,8 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx,
 	} else {
 		addr->sib_family = AF_IB;
 		addr->sib_pkey = (__force __be16) resp.pkey;
-		rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr,
-				   (union ib_gid *) &addr->sib_addr);
+		rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr,
+			       NULL);
 		addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
						    &ctx->cm_id->route.addr.src_addr);
 	}
@@ -956,8 +957,8 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx,
 	} else {
 		addr->sib_family = AF_IB;
 		addr->sib_pkey = (__force __be16) resp.pkey;
-		rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr,
-				   (union ib_gid *) &addr->sib_addr);
+		rdma_read_gids(ctx->cm_id, NULL,
+			       (union ib_gid *)&addr->sib_addr);
 		addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
						    &ctx->cm_id->route.addr.dst_addr);
 	}
@@ -1231,9 +1232,9 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
 		struct sa_path_rec opa;
 
 		sa_convert_path_ib_to_opa(&opa, &sa_path);
-		ret = rdma_set_ib_paths(ctx->cm_id, &opa, 1);
+		ret = rdma_set_ib_path(ctx->cm_id, &opa);
 	} else {
-		ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+		ret = rdma_set_ib_path(ctx->cm_id, &sa_path);
 	}
 	if (ret)
 		return ret;

+ 1 - 1
drivers/infiniband/core/umem.c

@@ -352,7 +352,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		return -EINVAL;
 	}
 
-	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
+	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
 				 offset + ib_umem_offset(umem));
 
 	if (ret < 0)

+ 57 - 66
drivers/infiniband/core/user_mad.c

@@ -55,16 +55,21 @@
 #include <rdma/ib_mad.h>
 #include <rdma/ib_mad.h>
 #include <rdma/ib_user_mad.h>
 #include <rdma/ib_user_mad.h>
 
 
+#include "core_priv.h"
+
 MODULE_AUTHOR("Roland Dreier");
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
 MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
 
 
 enum {
 enum {
-	IB_UMAD_MAX_PORTS  = 64,
+	IB_UMAD_MAX_PORTS  = RDMA_MAX_PORTS,
 	IB_UMAD_MAX_AGENTS = 32,
 	IB_UMAD_MAX_AGENTS = 32,
 
 
 	IB_UMAD_MAJOR      = 231,
 	IB_UMAD_MAJOR      = 231,
-	IB_UMAD_MINOR_BASE = 0
+	IB_UMAD_MINOR_BASE = 0,
+	IB_UMAD_NUM_FIXED_MINOR = 64,
+	IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR,
+	IB_ISSM_MINOR_BASE        = IB_UMAD_NUM_FIXED_MINOR,
 };
 };
 
 
 /*
 /*
@@ -127,9 +132,12 @@ struct ib_umad_packet {
 
 
 static struct class *umad_class;
 static struct class *umad_class;
 
 
-static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
+				   IB_UMAD_NUM_FIXED_MINOR;
+static dev_t dynamic_umad_dev;
+static dev_t dynamic_issm_dev;
 
 
-static DEFINE_SPINLOCK(port_lock);
 static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
 static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
 
 
 static void ib_umad_add_one(struct ib_device *device);
 static void ib_umad_add_one(struct ib_device *device);
@@ -233,8 +241,7 @@ static void recv_handler(struct ib_mad_agent *agent,
 	 * On OPA devices it is okay to lose the upper 16 bits of LID as this
 	 * On OPA devices it is okay to lose the upper 16 bits of LID as this
 	 * information is obtained elsewhere. Mask off the upper 16 bits.
 	 * information is obtained elsewhere. Mask off the upper 16 bits.
 	 */
 	 */
-	if (agent->device->port_immutable[agent->port_num].core_cap_flags &
-	    RDMA_CORE_PORT_INTEL_OPA)
+	if (rdma_cap_opa_mad(agent->device, agent->port_num))
 		packet->mad.hdr.lid = ib_lid_be16(0xFFFF &
 		packet->mad.hdr.lid = ib_lid_be16(0xFFFF &
 						  mad_recv_wc->wc->slid);
 						  mad_recv_wc->wc->slid);
 	else
 	else
@@ -246,10 +253,14 @@ static void recv_handler(struct ib_mad_agent *agent,
 	if (packet->mad.hdr.grh_present) {
 	if (packet->mad.hdr.grh_present) {
 		struct rdma_ah_attr ah_attr;
 		struct rdma_ah_attr ah_attr;
 		const struct ib_global_route *grh;
 		const struct ib_global_route *grh;
+		int ret;
 
 
-		ib_init_ah_from_wc(agent->device, agent->port_num,
-				   mad_recv_wc->wc, mad_recv_wc->recv_buf.grh,
-				   &ah_attr);
+		ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num,
+					      mad_recv_wc->wc,
+					      mad_recv_wc->recv_buf.grh,
+					      &ah_attr);
+		if (ret)
+			goto err2;
 
 
 		grh = rdma_ah_read_grh(&ah_attr);
 		grh = rdma_ah_read_grh(&ah_attr);
 		packet->mad.hdr.gid_index = grh->sgid_index;
 		packet->mad.hdr.gid_index = grh->sgid_index;
@@ -500,7 +511,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 	}
 	}
 
 
 	memset(&ah_attr, 0, sizeof ah_attr);
 	memset(&ah_attr, 0, sizeof ah_attr);
-	ah_attr.type = rdma_ah_find_type(file->port->ib_dev,
+	ah_attr.type = rdma_ah_find_type(agent->device,
 					 file->port->port_num);
 					 file->port->port_num);
 	rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid));
 	rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid));
 	rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl);
 	rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl);
@@ -1139,54 +1150,26 @@ static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
 static CLASS_ATTR_STRING(abi_version, S_IRUGO,
 static CLASS_ATTR_STRING(abi_version, S_IRUGO,
 			 __stringify(IB_USER_MAD_ABI_VERSION));
 			 __stringify(IB_USER_MAD_ABI_VERSION));
 
 
-static dev_t overflow_maj;
-static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
-static int find_overflow_devnum(struct ib_device *device)
-{
-	int ret;
-
-	if (!overflow_maj) {
-		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
-					  "infiniband_mad");
-		if (ret) {
-			dev_err(&device->dev,
-				"couldn't register dynamic device number\n");
-			return ret;
-		}
-	}
-
-	ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS);
-	if (ret >= IB_UMAD_MAX_PORTS)
-		return -1;
-
-	return ret;
-}
-
 static int ib_umad_init_port(struct ib_device *device, int port_num,
 static int ib_umad_init_port(struct ib_device *device, int port_num,
 			     struct ib_umad_device *umad_dev,
 			     struct ib_umad_device *umad_dev,
 			     struct ib_umad_port *port)
 			     struct ib_umad_port *port)
 {
 {
 	int devnum;
 	int devnum;
-	dev_t base;
+	dev_t base_umad;
+	dev_t base_issm;

-	spin_lock(&port_lock);
 	devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
-	if (devnum >= IB_UMAD_MAX_PORTS) {
-		spin_unlock(&port_lock);
-		devnum = find_overflow_devnum(device);
-		if (devnum < 0)
-			return -1;
-
-		spin_lock(&port_lock);
-		port->dev_num = devnum + IB_UMAD_MAX_PORTS;
-		base = devnum + overflow_maj;
-		set_bit(devnum, overflow_map);
+	if (devnum >= IB_UMAD_MAX_PORTS)
+		return -1;
+	port->dev_num = devnum;
+	set_bit(devnum, dev_map);
+	if (devnum >= IB_UMAD_NUM_FIXED_MINOR) {
+		base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
+		base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
 	} else {
-		port->dev_num = devnum;
-		base = devnum + base_dev;
-		set_bit(devnum, dev_map);
+		base_umad = devnum + base_umad_dev;
+		base_issm = devnum + base_issm_dev;
 	}
-	spin_unlock(&port_lock);

 	port->ib_dev   = device;
 	port->port_num = port_num;
@@ -1198,7 +1181,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
 	port->cdev.owner = THIS_MODULE;
 	cdev_set_parent(&port->cdev, &umad_dev->kobj);
 	kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
-	if (cdev_add(&port->cdev, base, 1))
+	if (cdev_add(&port->cdev, base_umad, 1))
 		goto err_cdev;

 	port->dev = device_create(umad_class, device->dev.parent,
@@ -1212,12 +1195,11 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
 	if (device_create_file(port->dev, &dev_attr_port))
 		goto err_dev;

-	base += IB_UMAD_MAX_PORTS;
 	cdev_init(&port->sm_cdev, &umad_sm_fops);
 	port->sm_cdev.owner = THIS_MODULE;
 	cdev_set_parent(&port->sm_cdev, &umad_dev->kobj);
 	kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
-	if (cdev_add(&port->sm_cdev, base, 1))
+	if (cdev_add(&port->sm_cdev, base_issm, 1))
 		goto err_sm_cdev;

 	port->sm_dev = device_create(umad_class, device->dev.parent,
@@ -1244,10 +1226,7 @@ err_dev:

 err_cdev:
 	cdev_del(&port->cdev);
-	if (port->dev_num < IB_UMAD_MAX_PORTS)
-		clear_bit(devnum, dev_map);
-	else
-		clear_bit(devnum, overflow_map);
+	clear_bit(devnum, dev_map);

 	return -1;
 }
@@ -1281,11 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
 	}

 	mutex_unlock(&port->file_mutex);
-
-	if (port->dev_num < IB_UMAD_MAX_PORTS)
-		clear_bit(port->dev_num, dev_map);
-	else
-		clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map);
+	clear_bit(port->dev_num, dev_map);
 }

 static void ib_umad_add_one(struct ib_device *device)
@@ -1361,13 +1336,23 @@ static int __init ib_umad_init(void)
 {
 	int ret;

-	ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+	ret = register_chrdev_region(base_umad_dev,
+				     IB_UMAD_NUM_FIXED_MINOR * 2,
 				     "infiniband_mad");
 	if (ret) {
 		pr_err("couldn't register device number\n");
 		goto out;
 	}

+	ret = alloc_chrdev_region(&dynamic_umad_dev, 0,
+				  IB_UMAD_NUM_DYNAMIC_MINOR * 2,
+				  "infiniband_mad");
+	if (ret) {
+		pr_err("couldn't register dynamic device number\n");
+		goto out_alloc;
+	}
+	dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR;
+
 	umad_class = class_create(THIS_MODULE, "infiniband_mad");
 	if (IS_ERR(umad_class)) {
 		ret = PTR_ERR(umad_class);
@@ -1395,7 +1380,12 @@ out_class:
 	class_destroy(umad_class);

 out_chrdev:
-	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+	unregister_chrdev_region(dynamic_umad_dev,
+				 IB_UMAD_NUM_DYNAMIC_MINOR * 2);
+
+out_alloc:
+	unregister_chrdev_region(base_umad_dev,
+				 IB_UMAD_NUM_FIXED_MINOR * 2);

 out:
 	return ret;
@@ -1405,9 +1395,10 @@ static void __exit ib_umad_cleanup(void)
 {
 	ib_unregister_client(&umad_client);
 	class_destroy(umad_class);
-	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
-	if (overflow_maj)
-		unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2);
+	unregister_chrdev_region(base_umad_dev,
+				 IB_UMAD_NUM_FIXED_MINOR * 2);
+	unregister_chrdev_region(dynamic_umad_dev,
+				 IB_UMAD_NUM_DYNAMIC_MINOR * 2);
 }

 module_init(ib_umad_init);

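The hunks above replace ib_umad's old "overflow major" scheme with one fixed chrdev region for the first block of minors plus one dynamically allocated region for everything past it, indexed by a single bitmap; the same layout is applied to uverbs further down. A minimal userspace sketch of that devnum-to-region mapping follows — the constants and the pretend dev_t bases are illustrative stand-ins, not the kernel's values:

/*
 * Sketch only: NUM_FIXED_MINOR, MAX_PORTS and the base values are
 * assumptions for illustration, not the kernel's definitions.
 */
#include <stdio.h>

#define NUM_FIXED_MINOR   32      /* assumed size of the fixed, ABI-stable region */
#define MAX_PORTS         256     /* assumed total ports supported */
#define FIXED_BASE        0x0100  /* pretend dev_t of the fixed region */
#define DYNAMIC_BASE      0x4000  /* pretend dev_t from alloc_chrdev_region() */

static int devnum_to_dev(int devnum)
{
	/* Low device numbers land in the fixed region... */
	if (devnum < NUM_FIXED_MINOR)
		return FIXED_BASE + devnum;
	/* ...everything else lands in the dynamically allocated region. */
	return DYNAMIC_BASE + (devnum - NUM_FIXED_MINOR);
}

int main(void)
{
	int samples[] = { 0, 31, 32, MAX_PORTS - 1 };
	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("devnum %3d -> dev 0x%04x\n", samples[i],
		       devnum_to_dev(samples[i]));
	return 0;
}

The point of the split is that the first minors keep their historical numbers while the dynamic region lifts the old hard cap, which is why the single dev_map bitmap can now cover both.
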
+ 7 - 7
drivers/infiniband/core/uverbs_cmd.c

@@ -340,6 +340,8 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
 	uobj->object = pd;
 	memset(&resp, 0, sizeof resp);
 	resp.pd_handle = uobj->id;
+	pd->res.type = RDMA_RESTRACK_PD;
+	rdma_restrack_add(&pd->res);

 	if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) {
 		ret = -EFAULT;
@@ -1033,6 +1035,8 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
 		goto err_cb;

 	uobj_alloc_commit(&obj->uobject);
+	cq->res.type = RDMA_RESTRACK_CQ;
+	rdma_restrack_add(&cq->res);

 	return obj;

@@ -1145,10 +1149,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
 			min(ucore->inlen, sizeof(cmd)),
 			ib_uverbs_ex_create_cq_cb, NULL);

-	if (IS_ERR(obj))
-		return PTR_ERR(obj);
-
-	return 0;
+	return PTR_ERR_OR_ZERO(obj);
 }

 ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
@@ -1199,7 +1200,7 @@ static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest,
 	tmp.opcode		= wc->opcode;
 	tmp.vendor_err		= wc->vendor_err;
 	tmp.byte_len		= wc->byte_len;
-	tmp.ex.imm_data		= (__u32 __force) wc->ex.imm_data;
+	tmp.ex.imm_data		= wc->ex.imm_data;
 	tmp.qp_num		= wc->qp->qp_num;
 	tmp.src_qp		= wc->src_qp;
 	tmp.wc_flags		= wc->wc_flags;
@@ -1517,7 +1518,7 @@ static int create_qp(struct ib_uverbs_file *file,
 	if (cmd->qp_type == IB_QPT_XRC_TGT)
 		qp = ib_create_qp(pd, &attr);
 	else
-		qp = device->create_qp(pd, &attr, uhw);
+		qp = _ib_create_qp(device, pd, &attr, uhw);

 	if (IS_ERR(qp)) {
 		ret = PTR_ERR(qp);
@@ -1530,7 +1531,6 @@ static int create_qp(struct ib_uverbs_file *file,
 			goto err_cb;

 		qp->real_qp	  = qp;
-		qp->device	  = device;
 		qp->pd		  = pd;
 		qp->send_cq	  = attr.send_cq;
 		qp->recv_cq	  = attr.recv_cq;

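The ib_uverbs_ex_create_cq hunk above collapses an open-coded IS_ERR/PTR_ERR pair into PTR_ERR_OR_ZERO(). A self-contained sketch of the equivalence, using simplified re-implementations of the <linux/err.h> helpers rather than the kernel's actual definitions:

/* Simplified stand-ins for the kernel's pointer-error encoding. */
#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline long IS_ERR(const void *ptr)  { return IS_ERR_VALUE((unsigned long)ptr); }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }

static inline long PTR_ERR_OR_ZERO(const void *ptr)
{
	/* Exactly the open-coded pattern the hunk removes. */
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
	return 0;
}

int main(void)
{
	void *ok  = &ok;                    /* a valid pointer */
	void *bad = (void *)(long)-ENOMEM;  /* an encoded error */

	printf("ok  -> %ld\n", PTR_ERR_OR_ZERO(ok));   /* prints 0 */
	printf("bad -> %ld\n", PTR_ERR_OR_ZERO(bad));  /* prints -12 */
	return 0;
}
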
+ 13 - 6
drivers/infiniband/core/uverbs_ioctl.c

@@ -243,16 +243,13 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev,
 	size_t ctx_size;
 	uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)];

-	if (hdr->reserved)
-		return -EINVAL;
-
 	object_spec = uverbs_get_object(ib_dev, hdr->object_id);
 	if (!object_spec)
-		return -EOPNOTSUPP;
+		return -EPROTONOSUPPORT;

 	method_spec = uverbs_get_method(object_spec, hdr->method_id);
 	if (!method_spec)
-		return -EOPNOTSUPP;
+		return -EPROTONOSUPPORT;

 	if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext)
 		return -EINVAL;
@@ -305,6 +302,16 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev,

 	err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev,
 				   file, method_spec, ctx->uverbs_attr_bundle);
+
+	/*
+	 * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can
+	 * not invoke the method because the request is not supported.  No
+	 * other cases should return this code.
+	*/
+	if (unlikely(err == -EPROTONOSUPPORT)) {
+		WARN_ON_ONCE(err == -EPROTONOSUPPORT);
+		err = -EINVAL;
+	}
 out:
 	if (ctx != (void *)data)
 		kfree(ctx);
@@ -341,7 +348,7 @@ long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}

 		if (hdr.reserved) {
-			err = -EOPNOTSUPP;
+			err = -EPROTONOSUPPORT;
 			goto out;
 		}


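The hunks above reserve -EPROTONOSUPPORT as the ioctl framework's private "cannot dispatch this request" code; a handler that leaks it is warned about and remapped to -EINVAL before reaching userspace. A sketch of that error-code laundering pattern — dispatch() and the buggy handler here are hypothetical, not uverbs symbols:

#include <stdio.h>
#include <errno.h>

static int handler_leaks(void) { return -EPROTONOSUPPORT; /* buggy handler */ }

static int dispatch(int (*handler)(void))
{
	int err = handler();

	/* Mirror the WARN-and-remap step in ib_uverbs_cmd_verbs(). */
	if (err == -EPROTONOSUPPORT) {
		fprintf(stderr, "handler leaked EPROTONOSUPPORT\n");
		err = -EINVAL;
	}
	return err;
}

int main(void)
{
	printf("dispatch -> %d (expected %d)\n",
	       dispatch(handler_leaks), -EINVAL);
	return 0;
}
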
+ 34 - 61
drivers/infiniband/core/uverbs_main.c

@@ -62,14 +62,16 @@ MODULE_LICENSE("Dual BSD/GPL");
 enum {
 	IB_UVERBS_MAJOR       = 231,
 	IB_UVERBS_BASE_MINOR  = 192,
-	IB_UVERBS_MAX_DEVICES = 32
+	IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS,
+	IB_UVERBS_NUM_FIXED_MINOR = 32,
+	IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR,
 };

 #define IB_UVERBS_BASE_DEV	MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)

+static dev_t dynamic_uverbs_dev;
 static struct class *uverbs_class;

-static DEFINE_SPINLOCK(map_lock);
 static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);

 static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
@@ -1005,34 +1007,6 @@ static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
 static CLASS_ATTR_STRING(abi_version, S_IRUGO,
 			 __stringify(IB_USER_VERBS_ABI_VERSION));

-static dev_t overflow_maj;
-static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
-
-/*
- * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by
- * requesting a new major number and doubling the number of max devices we
- * support. It's stupid, but simple.
- */
-static int find_overflow_devnum(void)
-{
-	int ret;
-
-	if (!overflow_maj) {
-		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES,
-					  "infiniband_verbs");
-		if (ret) {
-			pr_err("user_verbs: couldn't register dynamic device number\n");
-			return ret;
-		}
-	}
-
-	ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES);
-	if (ret >= IB_UVERBS_MAX_DEVICES)
-		return -1;
-
-	return ret;
-}
-
 static void ib_uverbs_add_one(struct ib_device *device)
 {
 	int devnum;
@@ -1062,24 +1036,15 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
 	INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);

-	spin_lock(&map_lock);
 	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
-	if (devnum >= IB_UVERBS_MAX_DEVICES) {
-		spin_unlock(&map_lock);
-		devnum = find_overflow_devnum();
-		if (devnum < 0)
-			goto err;
-
-		spin_lock(&map_lock);
-		uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
-		base = devnum + overflow_maj;
-		set_bit(devnum, overflow_map);
-	} else {
-		uverbs_dev->devnum = devnum;
-		base = devnum + IB_UVERBS_BASE_DEV;
-		set_bit(devnum, dev_map);
-	}
-	spin_unlock(&map_lock);
+	if (devnum >= IB_UVERBS_MAX_DEVICES)
+		goto err;
+	uverbs_dev->devnum = devnum;
+	set_bit(devnum, dev_map);
+	if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
+		base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
+	else
+		base = IB_UVERBS_BASE_DEV + devnum;

 	rcu_assign_pointer(uverbs_dev->ib_dev, device);
 	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
@@ -1124,10 +1089,7 @@ err_class:

 err_cdev:
 	cdev_del(&uverbs_dev->cdev);
-	if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
-		clear_bit(devnum, dev_map);
-	else
-		clear_bit(devnum, overflow_map);
+	clear_bit(devnum, dev_map);

 err:
 	if (atomic_dec_and_test(&uverbs_dev->refcount))
@@ -1219,11 +1181,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
 	dev_set_drvdata(uverbs_dev->dev, NULL);
 	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
 	cdev_del(&uverbs_dev->cdev);
-
-	if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
-		clear_bit(uverbs_dev->devnum, dev_map);
-	else
-		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
+	clear_bit(uverbs_dev->devnum, dev_map);

 	if (device->disassociate_ucontext) {
 		/* We disassociate HW resources and immediately return.
@@ -1265,13 +1223,22 @@ static int __init ib_uverbs_init(void)
 {
 	int ret;

-	ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+	ret = register_chrdev_region(IB_UVERBS_BASE_DEV,
+				     IB_UVERBS_NUM_FIXED_MINOR,
 				     "infiniband_verbs");
 	if (ret) {
 		pr_err("user_verbs: couldn't register device number\n");
 		goto out;
 	}

+	ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0,
+				  IB_UVERBS_NUM_DYNAMIC_MINOR,
+				  "infiniband_verbs");
+	if (ret) {
+		pr_err("couldn't register dynamic device number\n");
+		goto out_alloc;
+	}
+
 	uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
 	if (IS_ERR(uverbs_class)) {
 		ret = PTR_ERR(uverbs_class);
@@ -1299,7 +1266,12 @@ out_class:
 	class_destroy(uverbs_class);

 out_chrdev:
-	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+	unregister_chrdev_region(dynamic_uverbs_dev,
+				 IB_UVERBS_NUM_DYNAMIC_MINOR);
+
+out_alloc:
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV,
+				 IB_UVERBS_NUM_FIXED_MINOR);

 out:
 	return ret;
@@ -1309,9 +1281,10 @@ static void __exit ib_uverbs_cleanup(void)
 {
 	ib_unregister_client(&uverbs_client);
 	class_destroy(uverbs_class);
-	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
-	if (overflow_maj)
-		unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES);
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV,
+				 IB_UVERBS_NUM_FIXED_MINOR);
+	unregister_chrdev_region(dynamic_uverbs_dev,
+				 IB_UVERBS_NUM_DYNAMIC_MINOR);
 }

 module_init(ib_uverbs_init);

+ 3 - 0
drivers/infiniband/core/uverbs_std_types.c

@@ -35,6 +35,7 @@
 #include <rdma/ib_verbs.h>
 #include <linux/bug.h>
 #include <linux/file.h>
+#include <rdma/restrack.h>
 #include "rdma_core.h"
 #include "uverbs.h"

@@ -319,6 +320,8 @@ static int uverbs_create_cq_handler(struct ib_device *ib_dev,
 	obj->uobject.object = cq;
 	obj->uobject.user_handle = user_handle;
 	atomic_set(&cq->usecnt, 0);
+	cq->res.type = RDMA_RESTRACK_CQ;
+	rdma_restrack_add(&cq->res);

 	ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe);
 	if (ret)

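The restrack calls added here and in the uverbs_cmd.c and verbs.c hunks follow a strict pairing: rdma_restrack_add() once the object is live, rdma_restrack_del() before the driver's destroy callback runs. A toy model of that invariant — the counter and the type value are illustrative only, not the restrack implementation:

#include <assert.h>
#include <stdio.h>

struct res { int type; };

static int tracked;

static void restrack_add(struct res *r) { (void)r; tracked++; }
static void restrack_del(struct res *r) { (void)r; tracked--; }

int main(void)
{
	struct res cq = { .type = 1 /* stand-in for RDMA_RESTRACK_CQ */ };

	restrack_add(&cq);  /* create path, after the object is usable */
	assert(tracked == 1);
	restrack_del(&cq);  /* destroy path, before the device callback */
	assert(tracked == 0);
	puts("add/del balanced");
	return 0;
}
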
+ 177 - 135
drivers/infiniband/core/verbs.c

@@ -124,16 +124,24 @@ EXPORT_SYMBOL(ib_wc_status_msg);
 __attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
 {
 	switch (rate) {
-	case IB_RATE_2_5_GBPS: return  1;
-	case IB_RATE_5_GBPS:   return  2;
-	case IB_RATE_10_GBPS:  return  4;
-	case IB_RATE_20_GBPS:  return  8;
-	case IB_RATE_30_GBPS:  return 12;
-	case IB_RATE_40_GBPS:  return 16;
-	case IB_RATE_60_GBPS:  return 24;
-	case IB_RATE_80_GBPS:  return 32;
-	case IB_RATE_120_GBPS: return 48;
-	default:	       return -1;
+	case IB_RATE_2_5_GBPS: return   1;
+	case IB_RATE_5_GBPS:   return   2;
+	case IB_RATE_10_GBPS:  return   4;
+	case IB_RATE_20_GBPS:  return   8;
+	case IB_RATE_30_GBPS:  return  12;
+	case IB_RATE_40_GBPS:  return  16;
+	case IB_RATE_60_GBPS:  return  24;
+	case IB_RATE_80_GBPS:  return  32;
+	case IB_RATE_120_GBPS: return  48;
+	case IB_RATE_14_GBPS:  return   6;
+	case IB_RATE_56_GBPS:  return  22;
+	case IB_RATE_112_GBPS: return  45;
+	case IB_RATE_168_GBPS: return  67;
+	case IB_RATE_25_GBPS:  return  10;
+	case IB_RATE_100_GBPS: return  40;
+	case IB_RATE_200_GBPS: return  80;
+	case IB_RATE_300_GBPS: return 120;
+	default:	       return  -1;
 	}
 }
 EXPORT_SYMBOL(ib_rate_to_mult);
@@ -141,16 +149,24 @@ EXPORT_SYMBOL(ib_rate_to_mult);
 __attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
 {
 	switch (mult) {
-	case 1:  return IB_RATE_2_5_GBPS;
-	case 2:  return IB_RATE_5_GBPS;
-	case 4:  return IB_RATE_10_GBPS;
-	case 8:  return IB_RATE_20_GBPS;
-	case 12: return IB_RATE_30_GBPS;
-	case 16: return IB_RATE_40_GBPS;
-	case 24: return IB_RATE_60_GBPS;
-	case 32: return IB_RATE_80_GBPS;
-	case 48: return IB_RATE_120_GBPS;
-	default: return IB_RATE_PORT_CURRENT;
+	case 1:   return IB_RATE_2_5_GBPS;
+	case 2:   return IB_RATE_5_GBPS;
+	case 4:   return IB_RATE_10_GBPS;
+	case 8:   return IB_RATE_20_GBPS;
+	case 12:  return IB_RATE_30_GBPS;
+	case 16:  return IB_RATE_40_GBPS;
+	case 24:  return IB_RATE_60_GBPS;
+	case 32:  return IB_RATE_80_GBPS;
+	case 48:  return IB_RATE_120_GBPS;
+	case 6:   return IB_RATE_14_GBPS;
+	case 22:  return IB_RATE_56_GBPS;
+	case 45:  return IB_RATE_112_GBPS;
+	case 67:  return IB_RATE_168_GBPS;
+	case 10:  return IB_RATE_25_GBPS;
+	case 40:  return IB_RATE_100_GBPS;
+	case 80:  return IB_RATE_200_GBPS;
+	case 120: return IB_RATE_300_GBPS;
+	default:  return IB_RATE_PORT_CURRENT;
 	}
 }
 EXPORT_SYMBOL(mult_to_ib_rate);
@@ -247,6 +263,10 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
 		mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
 	}

+	pd->res.type = RDMA_RESTRACK_PD;
+	pd->res.kern_name = caller;
+	rdma_restrack_add(&pd->res);
+
 	if (mr_access_flags) {
 		struct ib_mr *mr;

@@ -296,6 +316,7 @@ void ib_dealloc_pd(struct ib_pd *pd)
 	   requires the caller to guarantee we can't race here. */
 	WARN_ON(atomic_read(&pd->usecnt));

+	rdma_restrack_del(&pd->res);
 	/* Making delalloc_pd a void return is a WIP, no driver should return
 	   an error here. */
 	ret = pd->device->dealloc_pd(pd);
@@ -421,8 +442,7 @@ static bool find_gid_index(const union ib_gid *gid,
 			   const struct ib_gid_attr *gid_attr,
 			   void *context)
 {
-	struct find_gid_index_context *ctx =
-		(struct find_gid_index_context *)context;
+	struct find_gid_index_context *ctx = context;

 	if (ctx->gid_type != gid_attr->gid_type)
 		return false;
@@ -481,8 +501,53 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
 }
 EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);

+/* Resolve destination mac address and hop limit for unicast destination
+ * GID entry, considering the source GID entry as well.
+ * ah_attribute must have valid port_num, sgid_index.
+ */
+static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
+				       struct rdma_ah_attr *ah_attr)
+{
+	struct ib_gid_attr sgid_attr;
+	struct ib_global_route *grh;
+	int hop_limit = 0xff;
+	union ib_gid sgid;
+	int ret;
+
+	grh = rdma_ah_retrieve_grh(ah_attr);
+
+	ret = ib_query_gid(device,
+			   rdma_ah_get_port_num(ah_attr),
+			   grh->sgid_index,
+			   &sgid, &sgid_attr);
+	if (ret || !sgid_attr.ndev) {
+		if (!ret)
+			ret = -ENXIO;
+		return ret;
+	}
+
+	/* If destination is link local and source GID is RoCEv1,
+	 * IP stack is not used.
+	 */
+	if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) &&
+	    sgid_attr.gid_type == IB_GID_TYPE_ROCE) {
+		rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
+				ah_attr->roce.dmac);
+		goto done;
+	}
+
+	ret = rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid,
+					   ah_attr->roce.dmac,
+					   sgid_attr.ndev, &hop_limit);
+done:
+	dev_put(sgid_attr.ndev);
+
+	grh->hop_limit = hop_limit;
+	return ret;
+}
+
 /*
- * This function creates ah from the incoming packet.
+ * This function initializes address handle attributes from the incoming packet.
  * Incoming packet has dgid of the receiver node on which this code is
  * getting executed and, sgid contains the GID of the sender.
  *
@@ -490,13 +555,10 @@ EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);
  * as sgid and, sgid is used as dgid because sgid contains destinations
  * GID whom to respond to.
  *
- * This is why when calling rdma_addr_find_l2_eth_by_grh() function, the
- * position of arguments dgid and sgid do not match the order of the
- * parameters.
  */
-int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
-		       const struct ib_wc *wc, const struct ib_grh *grh,
-		       struct rdma_ah_attr *ah_attr)
+int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num,
+			    const struct ib_wc *wc, const struct ib_grh *grh,
+			    struct rdma_ah_attr *ah_attr)
 {
 	u32 flow_class;
 	u16 gid_index;
@@ -523,57 +585,33 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 	if (ret)
 		return ret;

+	rdma_ah_set_sl(ah_attr, wc->sl);
+	rdma_ah_set_port_num(ah_attr, port_num);
+
 	if (rdma_protocol_roce(device, port_num)) {
-		int if_index = 0;
 		u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
 				wc->vlan_id : 0xffff;
-		struct net_device *idev;
-		struct net_device *resolved_dev;

 		if (!(wc->wc_flags & IB_WC_GRH))
 			return -EPROTOTYPE;

-		if (!device->get_netdev)
-			return -EOPNOTSUPP;
-
-		idev = device->get_netdev(device, port_num);
-		if (!idev)
-			return -ENODEV;
-
-		ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid,
-						   ah_attr->roce.dmac,
-						   wc->wc_flags & IB_WC_WITH_VLAN ?
-						   NULL : &vlan_id,
-						   &if_index, &hoplimit);
-		if (ret) {
-			dev_put(idev);
-			return ret;
-		}
-
-		resolved_dev = dev_get_by_index(&init_net, if_index);
-		rcu_read_lock();
-		if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev,
-								   resolved_dev))
-			ret = -EHOSTUNREACH;
-		rcu_read_unlock();
-		dev_put(idev);
-		dev_put(resolved_dev);
+		ret = get_sgid_index_from_eth(device, port_num,
+					      vlan_id, &dgid,
+					      gid_type, &gid_index);
 		if (ret)
 			return ret;

-		ret = get_sgid_index_from_eth(device, port_num, vlan_id,
-					      &dgid, gid_type, &gid_index);
-		if (ret)
-			return ret;
-	}
-
-	rdma_ah_set_dlid(ah_attr, wc->slid);
-	rdma_ah_set_sl(ah_attr, wc->sl);
-	rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits);
-	rdma_ah_set_port_num(ah_attr, port_num);
+		flow_class = be32_to_cpu(grh->version_tclass_flow);
+		rdma_ah_set_grh(ah_attr, &sgid,
+				flow_class & 0xFFFFF,
+				(u8)gid_index, hoplimit,
+				(flow_class >> 20) & 0xFF);
+		return ib_resolve_unicast_gid_dmac(device, ah_attr);
+	} else {
+		rdma_ah_set_dlid(ah_attr, wc->slid);
+		rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits);

-	if (wc->wc_flags & IB_WC_GRH) {
-		if (!rdma_cap_eth_ah(device, port_num)) {
+		if (wc->wc_flags & IB_WC_GRH) {
 			if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) {
 				ret = ib_find_cached_gid_by_port(device, &dgid,
 								 IB_GID_TYPE_IB,
@@ -584,18 +622,17 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 			} else {
 				gid_index = 0;
 			}
-		}
-
-		flow_class = be32_to_cpu(grh->version_tclass_flow);
-		rdma_ah_set_grh(ah_attr, &sgid,
-				flow_class & 0xFFFFF,
-				(u8)gid_index, hoplimit,
-				(flow_class >> 20) & 0xFF);

+			flow_class = be32_to_cpu(grh->version_tclass_flow);
+			rdma_ah_set_grh(ah_attr, &sgid,
+					flow_class & 0xFFFFF,
+					(u8)gid_index, hoplimit,
+					(flow_class >> 20) & 0xFF);
+		}
+		return 0;
 	}
-	return 0;
 }
-EXPORT_SYMBOL(ib_init_ah_from_wc);
+EXPORT_SYMBOL(ib_init_ah_attr_from_wc);

 struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
 				   const struct ib_grh *grh, u8 port_num)
@@ -603,7 +640,7 @@ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
 	struct rdma_ah_attr ah_attr;
 	int ret;

-	ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+	ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr);
 	if (ret)
 		return ERR_PTR(ret);

@@ -850,7 +887,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 	if (qp_init_attr->cap.max_rdma_ctxs)
 		rdma_rw_init_qp(device, qp_init_attr);

-	qp = device->create_qp(pd, qp_init_attr, NULL);
+	qp = _ib_create_qp(device, pd, qp_init_attr, NULL);
 	if (IS_ERR(qp))
 		return qp;

@@ -860,7 +897,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 		return ERR_PTR(ret);
 	}

-	qp->device     = device;
 	qp->real_qp    = qp;
 	qp->uobject    = NULL;
 	qp->qp_type    = qp_init_attr->qp_type;
@@ -890,7 +926,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 			atomic_inc(&qp_init_attr->srq->usecnt);
 	}

-	qp->pd	    = pd;
 	qp->send_cq = qp_init_attr->send_cq;
 	qp->xrcd    = NULL;

@@ -1269,16 +1304,8 @@ static int ib_resolve_eth_dmac(struct ib_device *device,
 	if (!rdma_is_port_valid(device, rdma_ah_get_port_num(ah_attr)))
 		return -EINVAL;

-	if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE)
-		return 0;
-
 	grh = rdma_ah_retrieve_grh(ah_attr);

-	if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) {
-		rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
-				ah_attr->roce.dmac);
-		return 0;
-	}
 	if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) {
 		if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) {
 			__be32 addr = 0;
@@ -1290,40 +1317,52 @@ static int ib_resolve_eth_dmac(struct ib_device *device,
 					(char *)ah_attr->roce.dmac);
 		}
 	} else {
-		union ib_gid		sgid;
-		struct ib_gid_attr	sgid_attr;
-		int			ifindex;
-		int			hop_limit;
-
-		ret = ib_query_gid(device,
-				   rdma_ah_get_port_num(ah_attr),
-				   grh->sgid_index,
-				   &sgid, &sgid_attr);
-
-		if (ret || !sgid_attr.ndev) {
-			if (!ret)
-				ret = -ENXIO;
-			goto out;
-		}
-
-		ifindex = sgid_attr.ndev->ifindex;
+		ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
+	}
+	return ret;
+}

-		ret =
-		rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid,
-					     ah_attr->roce.dmac,
-					     NULL, &ifindex, &hop_limit);
+/**
+ * IB core internal function to perform QP attributes modification.
+ */
+static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
+			 int attr_mask, struct ib_udata *udata)
+{
+	u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+	int ret;

-		dev_put(sgid_attr.ndev);
+	if (rdma_ib_or_roce(qp->device, port)) {
+		if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) {
+			pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n",
+				__func__, qp->device->name);
+			attr->rq_psn &= 0xffffff;
+		}

-		grh->hop_limit = hop_limit;
+		if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) {
+			pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n",
+				__func__, qp->device->name);
+			attr->sq_psn &= 0xffffff;
+		}
 	}
-out:
+
+	ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
+	if (!ret && (attr_mask & IB_QP_PORT))
+		qp->port = attr->port_num;
+
 	return ret;
 }

+static bool is_qp_type_connected(const struct ib_qp *qp)
+{
+	return (qp->qp_type == IB_QPT_UC ||
+		qp->qp_type == IB_QPT_RC ||
+		qp->qp_type == IB_QPT_XRC_INI ||
+		qp->qp_type == IB_QPT_XRC_TGT);
+}
+
 /**
  * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
- * @qp: The QP to modify.
+ * @ib_qp: The QP to modify.
  * @attr: On input, specifies the QP attributes to modify.  On output,
  *   the current values of selected QP attributes are returned.
  * @attr_mask: A bit-mask used to specify which attributes of the QP
@@ -1332,21 +1371,20 @@ out:
  *   are being modified.
  * It returns 0 on success and returns appropriate error code on error.
  */
-int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr,
+int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr,
 			    int attr_mask, struct ib_udata *udata)
 {
+	struct ib_qp *qp = ib_qp->real_qp;
 	int ret;

-	if (attr_mask & IB_QP_AV) {
+	if (attr_mask & IB_QP_AV &&
+	    attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE &&
+	    is_qp_type_connected(qp)) {
 		ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr);
 		if (ret)
 			return ret;
 	}
-	ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
-	if (!ret && (attr_mask & IB_QP_PORT))
-		qp->port = attr->port_num;
-
-	return ret;
+	return _ib_modify_qp(qp, attr, attr_mask, udata);
 }
 EXPORT_SYMBOL(ib_modify_qp_with_udata);

@@ -1409,7 +1447,7 @@ int ib_modify_qp(struct ib_qp *qp,
 		 struct ib_qp_attr *qp_attr,
 		 int qp_attr_mask)
 {
-	return ib_modify_qp_with_udata(qp, qp_attr, qp_attr_mask, NULL);
+	return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
 }
 EXPORT_SYMBOL(ib_modify_qp);

@@ -1503,6 +1541,7 @@ int ib_destroy_qp(struct ib_qp *qp)
 	if (!qp->uobject)
 		rdma_rw_cleanup_mrs(qp);

+	rdma_restrack_del(&qp->res);
 	ret = qp->device->destroy_qp(qp);
 	if (!ret) {
 		if (pd)
@@ -1545,6 +1584,8 @@ struct ib_cq *ib_create_cq(struct ib_device *device,
 		cq->event_handler = event_handler;
 		cq->cq_context    = cq_context;
 		atomic_set(&cq->usecnt, 0);
+		cq->res.type = RDMA_RESTRACK_CQ;
+		rdma_restrack_add(&cq->res);
 	}

 	return cq;
@@ -1563,6 +1604,7 @@ int ib_destroy_cq(struct ib_cq *cq)
 	if (atomic_read(&cq->usecnt))
 		return -EBUSY;

+	rdma_restrack_del(&cq->res);
 	return cq->device->destroy_cq(cq);
 }
 EXPORT_SYMBOL(ib_destroy_cq);
@@ -1747,7 +1789,7 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
 }
 EXPORT_SYMBOL(ib_detach_mcast);

-struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)
 {
 	struct ib_xrcd *xrcd;

@@ -1765,7 +1807,7 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)

 	return xrcd;
 }
-EXPORT_SYMBOL(ib_alloc_xrcd);
+EXPORT_SYMBOL(__ib_alloc_xrcd);

 int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
 {
@@ -1790,11 +1832,11 @@ EXPORT_SYMBOL(ib_dealloc_xrcd);
  * ib_create_wq - Creates a WQ associated with the specified protection
  * domain.
  * @pd: The protection domain associated with the WQ.
- * @wq_init_attr: A list of initial attributes required to create the
+ * @wq_attr: A list of initial attributes required to create the
  * WQ. If WQ creation succeeds, then the attributes are updated to
  * the actual capabilities of the created WQ.
  *
- * wq_init_attr->max_wr and wq_init_attr->max_sge determine
+ * wq_attr->max_wr and wq_attr->max_sge determine
  * the requested size of the WQ, and set to the actual values allocated
  * on return.
  * If ib_create_wq() succeeds, then max_wr and max_sge will always be
@@ -2156,16 +2198,16 @@ static void __ib_drain_sq(struct ib_qp *qp)
 	struct ib_send_wr swr = {}, *bad_swr;
 	int ret;

-	swr.wr_cqe = &sdrain.cqe;
-	sdrain.cqe.done = ib_drain_qp_done;
-	init_completion(&sdrain.done);
-
 	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
 	if (ret) {
 		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
 		return;
 	}

+	swr.wr_cqe = &sdrain.cqe;
+	sdrain.cqe.done = ib_drain_qp_done;
+	init_completion(&sdrain.done);
+
 	ret = ib_post_send(qp, &swr, &bad_swr);
 	if (ret) {
 		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
@@ -2190,16 +2232,16 @@ static void __ib_drain_rq(struct ib_qp *qp)
 	struct ib_recv_wr rwr = {}, *bad_rwr;
 	int ret;

-	rwr.wr_cqe = &rdrain.cqe;
-	rdrain.cqe.done = ib_drain_qp_done;
-	init_completion(&rdrain.done);
-
 	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
 	if (ret) {
 		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
 		return;
 	}

+	rwr.wr_cqe = &rdrain.cqe;
+	rdrain.cqe.done = ib_drain_qp_done;
+	init_completion(&rdrain.done);
+
 	ret = ib_post_recv(qp, &rwr, &bad_rwr);
 	if (ret) {
 		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);

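The extended switch tables near the top of this file map each IB static rate to its multiple of the 2.5 Gbps base link speed, rounded to the nearest integer — which is why 14 Gbps maps to 6 rather than 5, 56 Gbps to 22, and so on. A standalone program that prints the pairs visible in the hunks next to the rate/2.5 approximation, assuming that rounding rule:

#include <stdio.h>

struct pair { int rate_gbps10; int mult; };  /* rate in tenths of Gbps */

/* The pairs below simply mirror the two switch tables in the diff. */
static const struct pair pairs[] = {
	{ 25, 1 },    { 50, 2 },    { 100, 4 },   { 200, 8 },
	{ 300, 12 },  { 400, 16 },  { 600, 24 },  { 800, 32 },
	{ 1200, 48 }, { 140, 6 },   { 560, 22 },  { 1120, 45 },
	{ 1680, 67 }, { 250, 10 },  { 1000, 40 }, { 2000, 80 },
	{ 3000, 120 },
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(pairs) / sizeof(pairs[0]); i++)
		printf("%6.1f Gbps -> mult %3d (rate/2.5 = %5.1f)\n",
		       pairs[i].rate_gbps10 / 10.0, pairs[i].mult,
		       pairs[i].rate_gbps10 / 25.0);
	return 0;
}

Because the tables are mutually inverse over the listed values, mult_to_ib_rate(ib_rate_to_mult(r)) round-trips every rate that appears in both switches.
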
+ 34 - 9
drivers/infiniband/hw/bnxt_re/bnxt_re.h

@@ -43,20 +43,41 @@
 #define ROCE_DRV_MODULE_VERSION		"1.0.0"

 #define BNXT_RE_DESC	"Broadcom NetXtreme-C/E RoCE Driver"
-
-#define BNXT_RE_PAGE_SIZE_4K		BIT(12)
-#define BNXT_RE_PAGE_SIZE_8K		BIT(13)
-#define BNXT_RE_PAGE_SIZE_64K		BIT(16)
-#define BNXT_RE_PAGE_SIZE_2M		BIT(21)
-#define BNXT_RE_PAGE_SIZE_8M		BIT(23)
-#define BNXT_RE_PAGE_SIZE_1G		BIT(30)
-
-#define BNXT_RE_MAX_MR_SIZE		BIT(30)
+#define BNXT_RE_PAGE_SHIFT_4K		(12)
+#define BNXT_RE_PAGE_SHIFT_8K		(13)
+#define BNXT_RE_PAGE_SHIFT_64K		(16)
+#define BNXT_RE_PAGE_SHIFT_2M		(21)
+#define BNXT_RE_PAGE_SHIFT_8M		(23)
+#define BNXT_RE_PAGE_SHIFT_1G		(30)
+
+#define BNXT_RE_PAGE_SIZE_4K		BIT(BNXT_RE_PAGE_SHIFT_4K)
+#define BNXT_RE_PAGE_SIZE_8K		BIT(BNXT_RE_PAGE_SHIFT_8K)
+#define BNXT_RE_PAGE_SIZE_64K		BIT(BNXT_RE_PAGE_SHIFT_64K)
+#define BNXT_RE_PAGE_SIZE_2M		BIT(BNXT_RE_PAGE_SHIFT_2M)
+#define BNXT_RE_PAGE_SIZE_8M		BIT(BNXT_RE_PAGE_SHIFT_8M)
+#define BNXT_RE_PAGE_SIZE_1G		BIT(BNXT_RE_PAGE_SHIFT_1G)
+
+#define BNXT_RE_MAX_MR_SIZE_LOW		BIT(BNXT_RE_PAGE_SHIFT_1G)
+#define BNXT_RE_MAX_MR_SIZE_HIGH	BIT(39)
+#define BNXT_RE_MAX_MR_SIZE		BNXT_RE_MAX_MR_SIZE_HIGH

 #define BNXT_RE_MAX_QPC_COUNT		(64 * 1024)
 #define BNXT_RE_MAX_MRW_COUNT		(64 * 1024)
 #define BNXT_RE_MAX_SRQC_COUNT		(64 * 1024)
 #define BNXT_RE_MAX_CQ_COUNT		(64 * 1024)
+#define BNXT_RE_MAX_MRW_COUNT_64K	(64 * 1024)
+#define BNXT_RE_MAX_MRW_COUNT_256K	(256 * 1024)
+
+/* Number of MRs to reserve for PF, leaving remainder for VFs */
+#define BNXT_RE_RESVD_MR_FOR_PF         (32 * 1024)
+#define BNXT_RE_MAX_GID_PER_VF          128
+
+/*
+ * Percentage of resources of each type reserved for PF.
+ * Remaining resources are divided equally among VFs.
+ * [0, 100]
+ */
+#define BNXT_RE_PCT_RSVD_FOR_PF         50

 #define BNXT_RE_UD_QP_HW_STALL		0x400000

@@ -100,6 +121,7 @@ struct bnxt_re_dev {
 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN		4
 #define BNXT_RE_FLAG_QOS_WORK_REG		5
 #define BNXT_RE_FLAG_TASK_IN_PROG		6
+#define BNXT_RE_FLAG_ISSUE_ROCE_STATS          29
 	struct net_device		*netdev;
 	unsigned int			version, major, minor;
 	struct bnxt_en_dev		*en_dev;
@@ -145,6 +167,9 @@ struct bnxt_re_dev {
 	struct bnxt_re_ah		*sqp_ah;
 	struct bnxt_re_sqp_entries sqp_tbl[1024];
 	atomic_t nq_alloc_cnt;
+	u32 is_virtfn;
+	u32 num_vfs;
+	struct bnxt_qplib_roce_stats	stats;
 };

 #define to_bnxt_re_dev(ptr, member)	\

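BNXT_RE_PCT_RSVD_FOR_PF above reserves a percentage of each resource pool for the PF, with the remainder divided equally among the VFs, and BNXT_RE_RESVD_MR_FOR_PF holds back a fixed MR count. The driver code that actually applies these macros is not part of this hunk, so the arithmetic below is an assumed sketch, not the driver's implementation:

#include <stdio.h>

#define PCT_RSVD_FOR_PF  50          /* mirrors BNXT_RE_PCT_RSVD_FOR_PF */
#define MAX_QPC_COUNT    (64 * 1024) /* mirrors BNXT_RE_MAX_QPC_COUNT */

static unsigned int per_vf_share(unsigned int total, unsigned int num_vfs)
{
	/* PF keeps its percentage of the pool... */
	unsigned int pf_part = total * PCT_RSVD_FOR_PF / 100;

	if (!num_vfs)
		return 0;
	/* ...and the remainder is split evenly across the VFs. */
	return (total - pf_part) / num_vfs;
}

int main(void)
{
	for (unsigned int vfs = 1; vfs <= 8; vfs *= 2)
		printf("%u VFs -> %u QPs each (PF keeps %u)\n",
		       vfs, per_vf_share(MAX_QPC_COUNT, vfs),
		       MAX_QPC_COUNT * PCT_RSVD_FOR_PF / 100);
	return 0;
}
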
+ 135 - 10
drivers/infiniband/hw/bnxt_re/hw_counters.c

@@ -58,16 +58,55 @@
 #include "hw_counters.h"

 static const char * const bnxt_re_stat_name[] = {
-	[BNXT_RE_ACTIVE_QP]           =  "active_qps",
-	[BNXT_RE_ACTIVE_SRQ]          =  "active_srqs",
-	[BNXT_RE_ACTIVE_CQ]           =  "active_cqs",
-	[BNXT_RE_ACTIVE_MR]           =  "active_mrs",
-	[BNXT_RE_ACTIVE_MW]           =  "active_mws",
-	[BNXT_RE_RX_PKTS]             =  "rx_pkts",
-	[BNXT_RE_RX_BYTES]            =  "rx_bytes",
-	[BNXT_RE_TX_PKTS]             =  "tx_pkts",
-	[BNXT_RE_TX_BYTES]            =  "tx_bytes",
-	[BNXT_RE_RECOVERABLE_ERRORS]  =  "recoverable_errors"
+	[BNXT_RE_ACTIVE_QP]		=  "active_qps",
+	[BNXT_RE_ACTIVE_SRQ]		=  "active_srqs",
+	[BNXT_RE_ACTIVE_CQ]		=  "active_cqs",
+	[BNXT_RE_ACTIVE_MR]		=  "active_mrs",
+	[BNXT_RE_ACTIVE_MW]		=  "active_mws",
+	[BNXT_RE_RX_PKTS]		=  "rx_pkts",
+	[BNXT_RE_RX_BYTES]		=  "rx_bytes",
+	[BNXT_RE_TX_PKTS]		=  "tx_pkts",
+	[BNXT_RE_TX_BYTES]		=  "tx_bytes",
+	[BNXT_RE_RECOVERABLE_ERRORS]	=  "recoverable_errors",
+	[BNXT_RE_TO_RETRANSMITS]        = "to_retransmits",
+	[BNXT_RE_SEQ_ERR_NAKS_RCVD]     = "seq_err_naks_rcvd",
+	[BNXT_RE_MAX_RETRY_EXCEEDED]    = "max_retry_exceeded",
+	[BNXT_RE_RNR_NAKS_RCVD]         = "rnr_naks_rcvd",
+	[BNXT_RE_MISSING_RESP]          = "missin_resp",
+	[BNXT_RE_UNRECOVERABLE_ERR]     = "unrecoverable_err",
+	[BNXT_RE_BAD_RESP_ERR]          = "bad_resp_err",
+	[BNXT_RE_LOCAL_QP_OP_ERR]       = "local_qp_op_err",
+	[BNXT_RE_LOCAL_PROTECTION_ERR]  = "local_protection_err",
+	[BNXT_RE_MEM_MGMT_OP_ERR]       = "mem_mgmt_op_err",
+	[BNXT_RE_REMOTE_INVALID_REQ_ERR] = "remote_invalid_req_err",
+	[BNXT_RE_REMOTE_ACCESS_ERR]     = "remote_access_err",
+	[BNXT_RE_REMOTE_OP_ERR]         = "remote_op_err",
+	[BNXT_RE_DUP_REQ]               = "dup_req",
+	[BNXT_RE_RES_EXCEED_MAX]        = "res_exceed_max",
+	[BNXT_RE_RES_LENGTH_MISMATCH]   = "res_length_mismatch",
+	[BNXT_RE_RES_EXCEEDS_WQE]       = "res_exceeds_wqe",
+	[BNXT_RE_RES_OPCODE_ERR]        = "res_opcode_err",
+	[BNXT_RE_RES_RX_INVALID_RKEY]   = "res_rx_invalid_rkey",
+	[BNXT_RE_RES_RX_DOMAIN_ERR]     = "res_rx_domain_err",
+	[BNXT_RE_RES_RX_NO_PERM]        = "res_rx_no_perm",
+	[BNXT_RE_RES_RX_RANGE_ERR]      = "res_rx_range_err",
+	[BNXT_RE_RES_TX_INVALID_RKEY]   = "res_tx_invalid_rkey",
+	[BNXT_RE_RES_TX_DOMAIN_ERR]     = "res_tx_domain_err",
+	[BNXT_RE_RES_TX_NO_PERM]        = "res_tx_no_perm",
+	[BNXT_RE_RES_TX_RANGE_ERR]      = "res_tx_range_err",
+	[BNXT_RE_RES_IRRQ_OFLOW]        = "res_irrq_oflow",
+	[BNXT_RE_RES_UNSUP_OPCODE]      = "res_unsup_opcode",
+	[BNXT_RE_RES_UNALIGNED_ATOMIC]  = "res_unaligned_atomic",
+	[BNXT_RE_RES_REM_INV_ERR]       = "res_rem_inv_err",
+	[BNXT_RE_RES_MEM_ERROR]         = "res_mem_err",
+	[BNXT_RE_RES_SRQ_ERR]           = "res_srq_err",
+	[BNXT_RE_RES_CMP_ERR]           = "res_cmp_err",
+	[BNXT_RE_RES_INVALID_DUP_RKEY]  = "res_invalid_dup_rkey",
+	[BNXT_RE_RES_WQE_FORMAT_ERR]    = "res_wqe_format_err",
+	[BNXT_RE_RES_CQ_LOAD_ERR]       = "res_cq_load_err",
+	[BNXT_RE_RES_SRQ_LOAD_ERR]      = "res_srq_load_err",
+	[BNXT_RE_RES_TX_PCI_ERR]        = "res_tx_pci_err",
+	[BNXT_RE_RES_RX_PCI_ERR]        = "res_rx_pci_err"
 };

 int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
@@ -76,6 +115,7 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
 {
 	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
 	struct ctx_hw_stats *bnxt_re_stats = rdev->qplib_ctx.stats.dma;
+	int rc  = 0;

 	if (!port || !stats)
 		return -EINVAL;
@@ -97,6 +137,91 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
 		stats->value[BNXT_RE_TX_BYTES] =
 			le64_to_cpu(bnxt_re_stats->tx_ucast_bytes);
 	}
+	if (test_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags)) {
+		rc = bnxt_qplib_get_roce_stats(&rdev->rcfw, &rdev->stats);
+		if (rc)
+			clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS,
+				  &rdev->flags);
+		stats->value[BNXT_RE_TO_RETRANSMITS] =
+					rdev->stats.to_retransmits;
+		stats->value[BNXT_RE_SEQ_ERR_NAKS_RCVD] =
+					rdev->stats.seq_err_naks_rcvd;
+		stats->value[BNXT_RE_MAX_RETRY_EXCEEDED] =
+					rdev->stats.max_retry_exceeded;
+		stats->value[BNXT_RE_RNR_NAKS_RCVD] =
+					rdev->stats.rnr_naks_rcvd;
+		stats->value[BNXT_RE_MISSING_RESP] =
+					rdev->stats.missing_resp;
+		stats->value[BNXT_RE_UNRECOVERABLE_ERR] =
+					rdev->stats.unrecoverable_err;
+		stats->value[BNXT_RE_BAD_RESP_ERR] =
+					rdev->stats.bad_resp_err;
+		stats->value[BNXT_RE_LOCAL_QP_OP_ERR]	=
+				rdev->stats.local_qp_op_err;
+		stats->value[BNXT_RE_LOCAL_PROTECTION_ERR] =
+				rdev->stats.local_protection_err;
+		stats->value[BNXT_RE_MEM_MGMT_OP_ERR] =
+				rdev->stats.mem_mgmt_op_err;
+		stats->value[BNXT_RE_REMOTE_INVALID_REQ_ERR] =
+				rdev->stats.remote_invalid_req_err;
+		stats->value[BNXT_RE_REMOTE_ACCESS_ERR] =
+				rdev->stats.remote_access_err;
+		stats->value[BNXT_RE_REMOTE_OP_ERR] =
+				rdev->stats.remote_op_err;
+		stats->value[BNXT_RE_DUP_REQ] =
+				rdev->stats.dup_req;
+		stats->value[BNXT_RE_RES_EXCEED_MAX] =
+				rdev->stats.res_exceed_max;
+		stats->value[BNXT_RE_RES_LENGTH_MISMATCH] =
+				rdev->stats.res_length_mismatch;
+		stats->value[BNXT_RE_RES_EXCEEDS_WQE] =
+				rdev->stats.res_exceeds_wqe;
+		stats->value[BNXT_RE_RES_OPCODE_ERR] =
+				rdev->stats.res_opcode_err;
+		stats->value[BNXT_RE_RES_RX_INVALID_RKEY] =
+				rdev->stats.res_rx_invalid_rkey;
+		stats->value[BNXT_RE_RES_RX_DOMAIN_ERR] =
+				rdev->stats.res_rx_domain_err;
+		stats->value[BNXT_RE_RES_RX_NO_PERM] =
+				rdev->stats.res_rx_no_perm;
+		stats->value[BNXT_RE_RES_RX_RANGE_ERR]  =
+				rdev->stats.res_rx_range_err;
+		stats->value[BNXT_RE_RES_TX_INVALID_RKEY] =
+				rdev->stats.res_tx_invalid_rkey;
+		stats->value[BNXT_RE_RES_TX_DOMAIN_ERR] =
+				rdev->stats.res_tx_domain_err;
+		stats->value[BNXT_RE_RES_TX_NO_PERM] =
+				rdev->stats.res_tx_no_perm;
+		stats->value[BNXT_RE_RES_TX_RANGE_ERR]  =
+				rdev->stats.res_tx_range_err;
+		stats->value[BNXT_RE_RES_IRRQ_OFLOW] =
+				rdev->stats.res_irrq_oflow;
+		stats->value[BNXT_RE_RES_UNSUP_OPCODE]  =
+				rdev->stats.res_unsup_opcode;
+		stats->value[BNXT_RE_RES_UNALIGNED_ATOMIC] =
+				rdev->stats.res_unaligned_atomic;
+		stats->value[BNXT_RE_RES_REM_INV_ERR]   =
+				rdev->stats.res_rem_inv_err;
+		stats->value[BNXT_RE_RES_MEM_ERROR] =
+				rdev->stats.res_mem_error;
+		stats->value[BNXT_RE_RES_SRQ_ERR] =
+				rdev->stats.res_srq_err;
+		stats->value[BNXT_RE_RES_CMP_ERR] =
+				rdev->stats.res_cmp_err;
+		stats->value[BNXT_RE_RES_INVALID_DUP_RKEY] =
+				rdev->stats.res_invalid_dup_rkey;
+		stats->value[BNXT_RE_RES_WQE_FORMAT_ERR] =
+				rdev->stats.res_wqe_format_err;
+		stats->value[BNXT_RE_RES_CQ_LOAD_ERR]   =
+				rdev->stats.res_cq_load_err;
+		stats->value[BNXT_RE_RES_SRQ_LOAD_ERR]  =
+				rdev->stats.res_srq_load_err;
+		stats->value[BNXT_RE_RES_TX_PCI_ERR]    =
+				rdev->stats.res_tx_pci_err;
+		stats->value[BNXT_RE_RES_RX_PCI_ERR]    =
+				rdev->stats.res_rx_pci_err;
+	}
+
 	return ARRAY_SIZE(bnxt_re_stat_name);
 }


+ 39 - 0
drivers/infiniband/hw/bnxt_re/hw_counters.h

@@ -51,6 +51,45 @@ enum bnxt_re_hw_stats {
 	BNXT_RE_TX_PKTS,
 	BNXT_RE_TX_BYTES,
 	BNXT_RE_RECOVERABLE_ERRORS,
+	BNXT_RE_TO_RETRANSMITS,
+	BNXT_RE_SEQ_ERR_NAKS_RCVD,
+	BNXT_RE_MAX_RETRY_EXCEEDED,
+	BNXT_RE_RNR_NAKS_RCVD,
+	BNXT_RE_MISSING_RESP,
+	BNXT_RE_UNRECOVERABLE_ERR,
+	BNXT_RE_BAD_RESP_ERR,
+	BNXT_RE_LOCAL_QP_OP_ERR,
+	BNXT_RE_LOCAL_PROTECTION_ERR,
+	BNXT_RE_MEM_MGMT_OP_ERR,
+	BNXT_RE_REMOTE_INVALID_REQ_ERR,
+	BNXT_RE_REMOTE_ACCESS_ERR,
+	BNXT_RE_REMOTE_OP_ERR,
+	BNXT_RE_DUP_REQ,
+	BNXT_RE_RES_EXCEED_MAX,
+	BNXT_RE_RES_LENGTH_MISMATCH,
+	BNXT_RE_RES_EXCEEDS_WQE,
+	BNXT_RE_RES_OPCODE_ERR,
+	BNXT_RE_RES_RX_INVALID_RKEY,
+	BNXT_RE_RES_RX_DOMAIN_ERR,
+	BNXT_RE_RES_RX_NO_PERM,
+	BNXT_RE_RES_RX_RANGE_ERR,
+	BNXT_RE_RES_TX_INVALID_RKEY,
+	BNXT_RE_RES_TX_DOMAIN_ERR,
+	BNXT_RE_RES_TX_NO_PERM,
+	BNXT_RE_RES_TX_RANGE_ERR,
+	BNXT_RE_RES_IRRQ_OFLOW,
+	BNXT_RE_RES_UNSUP_OPCODE,
+	BNXT_RE_RES_UNALIGNED_ATOMIC,
+	BNXT_RE_RES_REM_INV_ERR,
+	BNXT_RE_RES_MEM_ERROR,
+	BNXT_RE_RES_SRQ_ERR,
+	BNXT_RE_RES_CMP_ERR,
+	BNXT_RE_RES_INVALID_DUP_RKEY,
+	BNXT_RE_RES_WQE_FORMAT_ERR,
+	BNXT_RE_RES_CQ_LOAD_ERR,
+	BNXT_RE_RES_SRQ_LOAD_ERR,
+	BNXT_RE_RES_TX_PCI_ERR,
+	BNXT_RE_RES_RX_PCI_ERR,
 	BNXT_RE_NUM_COUNTERS
 };


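The enum additions above must stay in lockstep with the bnxt_re_stat_name[] table in hw_counters.c, which is what the designated initializers there enforce positionally. A sketch of the same pattern with a compile-time size check — the two-entry enum is illustrative, not the driver's:

#include <stdio.h>

enum stat { STAT_RX_PKTS, STAT_TX_PKTS, NUM_STATS };

static const char * const stat_name[] = {
	[STAT_RX_PKTS] = "rx_pkts",
	[STAT_TX_PKTS] = "tx_pkts",
};

/* Designated initializers let the compiler size-check the pairing. */
_Static_assert(sizeof(stat_name) / sizeof(stat_name[0]) == NUM_STATS,
	       "stat_name and enum stat out of sync");

int main(void)
{
	for (int i = 0; i < NUM_STATS; i++)
		printf("%d -> %s\n", i, stat_name[i]);
	return 0;
}
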
+ 348 - 56
drivers/infiniband/hw/bnxt_re/ib_verbs.c

@@ -141,12 +141,13 @@ int bnxt_re_query_device(struct ib_device *ibdev,
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;

 	memset(ib_attr, 0, sizeof(*ib_attr));
-
-	ib_attr->fw_ver = (u64)(unsigned long)(dev_attr->fw_ver);
+	memcpy(&ib_attr->fw_ver, dev_attr->fw_ver,
+	       min(sizeof(dev_attr->fw_ver),
+		   sizeof(ib_attr->fw_ver)));
 	bnxt_qplib_get_guid(rdev->netdev->dev_addr,
 			    (u8 *)&ib_attr->sys_image_guid);
 	ib_attr->max_mr_size = BNXT_RE_MAX_MR_SIZE;
-	ib_attr->page_size_cap = BNXT_RE_PAGE_SIZE_4K;
+	ib_attr->page_size_cap = BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_2M;

 	ib_attr->vendor_id = rdev->en_dev->pdev->vendor;
 	ib_attr->vendor_part_id = rdev->en_dev->pdev->device;
@@ -247,8 +248,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 				    IB_PORT_VENDOR_CLASS_SUP |
 				    IB_PORT_IP_BASED_GIDS;

-	/* Max MSG size set to 2G for now */
-	port_attr->max_msg_sz = 0x80000000;
+	port_attr->max_msg_sz = (u32)BNXT_RE_MAX_MR_SIZE_LOW;
 	port_attr->bad_pkey_cntr = 0;
 	port_attr->qkey_viol_cntr = 0;
 	port_attr->pkey_tbl_len = dev_attr->max_pkey;
@@ -281,6 +281,15 @@ int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
 	return 0;
 }

+void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str)
+{
+	struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
+
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d",
+		 rdev->dev_attr.fw_ver[0], rdev->dev_attr.fw_ver[1],
+		 rdev->dev_attr.fw_ver[2], rdev->dev_attr.fw_ver[3]);
+}
+
 int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,
 		       u16 index, u16 *pkey)
 {
@@ -532,7 +541,7 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
 	mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES;
 	pbl_tbl = dma_addr;
 	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl_tbl,
-			       BNXT_RE_FENCE_PBL_SIZE, false);
+			       BNXT_RE_FENCE_PBL_SIZE, false, PAGE_SIZE);
 	if (rc) {
 		dev_err(rdev_to_dev(rdev), "Failed to register fence-MR\n");
 		goto fail;
@@ -1018,6 +1027,7 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
 	struct bnxt_re_qp *qp;
 	struct bnxt_re_cq *cq;
+	struct bnxt_re_srq *srq;
 	int rc, entries;

 	if ((qp_init_attr->cap.max_send_wr > dev_attr->max_qp_wqes) ||
@@ -1073,9 +1083,15 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 	}

 	if (qp_init_attr->srq) {
-		dev_err(rdev_to_dev(rdev), "SRQ not supported");
-		rc = -ENOTSUPP;
-		goto fail;
+		srq = container_of(qp_init_attr->srq, struct bnxt_re_srq,
+				   ib_srq);
+		if (!srq) {
+			dev_err(rdev_to_dev(rdev), "SRQ not found");
+			rc = -EINVAL;
+			goto fail;
+		}
+		qp->qplib_qp.srq = &srq->qplib_srq;
+		qp->qplib_qp.rq.max_wqe = 0;
 	} else {
 		/* Allocate 1 more than what's provided so posting max doesn't
 		 * mean empty
@@ -1280,6 +1296,237 @@ static enum ib_mtu __to_ib_mtu(u32 mtu)
 	}
 }

+/* Shared Receive Queues */
+int bnxt_re_destroy_srq(struct ib_srq *ib_srq)
+{
+	struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq,
+					       ib_srq);
+	struct bnxt_re_dev *rdev = srq->rdev;
+	struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq;
+	struct bnxt_qplib_nq *nq = NULL;
+	int rc;
+
+	if (qplib_srq->cq)
+		nq = qplib_srq->cq->nq;
+	rc = bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq);
+	if (rc) {
+		dev_err(rdev_to_dev(rdev), "Destroy HW SRQ failed!");
+		return rc;
+	}
+
+	if (srq->umem && !IS_ERR(srq->umem))
+		ib_umem_release(srq->umem);
+	kfree(srq);
+	atomic_dec(&rdev->srq_count);
+	if (nq)
+		nq->budget--;
+	return 0;
+}
+
+static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
+				 struct bnxt_re_pd *pd,
+				 struct bnxt_re_srq *srq,
+				 struct ib_udata *udata)
+{
+	struct bnxt_re_srq_req ureq;
+	struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq;
+	struct ib_umem *umem;
+	int bytes = 0;
+	struct ib_ucontext *context = pd->ib_pd.uobject->context;
+	struct bnxt_re_ucontext *cntx = container_of(context,
+						     struct bnxt_re_ucontext,
+						     ib_uctx);
+	if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
+		return -EFAULT;
+
+	bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
+	bytes = PAGE_ALIGN(bytes);
+	umem = ib_umem_get(context, ureq.srqva, bytes,
+			   IB_ACCESS_LOCAL_WRITE, 1);
+	if (IS_ERR(umem))
+		return PTR_ERR(umem);
+
+	srq->umem = umem;
+	qplib_srq->nmap = umem->nmap;
+	qplib_srq->sglist = umem->sg_head.sgl;
+	qplib_srq->srq_handle = ureq.srq_handle;
+	qplib_srq->dpi = &cntx->dpi;
+
+	return 0;
+}
+
+struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd,
+				  struct ib_srq_init_attr *srq_init_attr,
+				  struct ib_udata *udata)
+{
+	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+	struct bnxt_re_dev *rdev = pd->rdev;
+	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
+	struct bnxt_re_srq *srq;
+	struct bnxt_qplib_nq *nq = NULL;
+	int rc, entries;
+
+	if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) {
+		dev_err(rdev_to_dev(rdev), "Create CQ failed - max exceeded");
+		rc = -EINVAL;
+		goto exit;
+	}
+
+	if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+		rc = -ENOTSUPP;
+		goto exit;
+	}
+
+	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq) {
+		rc = -ENOMEM;
+		goto exit;
+	}
+	srq->rdev = rdev;
+	srq->qplib_srq.pd = &pd->qplib_pd;
+	srq->qplib_srq.dpi = &rdev->dpi_privileged;
+	/* Allocate 1 more than what's provided so posting max doesn't
+	 * mean empty
+	 */
+	entries = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1);
+	if (entries > dev_attr->max_srq_wqes + 1)
+		entries = dev_attr->max_srq_wqes + 1;
+
+	srq->qplib_srq.max_wqe = entries;
+	srq->qplib_srq.max_sge = srq_init_attr->attr.max_sge;
+	srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit;
+	srq->srq_limit = srq_init_attr->attr.srq_limit;
+	srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id;
+	nq = &rdev->nq[0];
+
+	if (udata) {
+		rc = bnxt_re_init_user_srq(rdev, pd, srq, udata);
+		if (rc)
+			goto fail;
+	}
+
+	rc = bnxt_qplib_create_srq(&rdev->qplib_res, &srq->qplib_srq);
+	if (rc) {
+		dev_err(rdev_to_dev(rdev), "Create HW SRQ failed!");
+		goto fail;
+	}
+
+	if (udata) {
+		struct bnxt_re_srq_resp resp;
+
+		resp.srqid = srq->qplib_srq.id;
+		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
+		if (rc) {
+			dev_err(rdev_to_dev(rdev), "SRQ copy to udata failed!");
+			bnxt_qplib_destroy_srq(&rdev->qplib_res,
+					       &srq->qplib_srq);
+			goto exit;
+		}
+	}
+	if (nq)
+		nq->budget++;
+	atomic_inc(&rdev->srq_count);
+
+	return &srq->ib_srq;
+
+fail:
+	if (udata && srq->umem && !IS_ERR(srq->umem)) {
+		ib_umem_release(srq->umem);
+		srq->umem = NULL;
+	}
+
+	kfree(srq);
+exit:
+	return ERR_PTR(rc);
+}
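
For context on how this entry point gets exercised: a kernel ULP reaches bnxt_re_create_srq() through the core verbs API. The following is an illustrative sketch only, not part of this patch; the pd and the queue sizes are assumptions.

	#include <rdma/ib_verbs.h>

	static struct ib_srq *example_create_srq(struct ib_pd *pd)
	{
		struct ib_srq_init_attr init_attr = {
			.attr = {
				.max_wr	   = 1024,	/* capped by dev_attr->max_srq_wqes */
				.max_sge   = 2,
				.srq_limit = 0,		/* no limit event armed at create */
			},
			.srq_type = IB_SRQT_BASIC,	/* the only type this driver accepts */
		};

		/* ib_create_srq() dispatches to bnxt_re_create_srq() on bnxt_re */
		return ib_create_srq(pd, &init_attr);
	}
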
+
+int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr,
+		       enum ib_srq_attr_mask srq_attr_mask,
+		       struct ib_udata *udata)
+{
+	struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq,
+					       ib_srq);
+	struct bnxt_re_dev *rdev = srq->rdev;
+	int rc;
+
+	switch (srq_attr_mask) {
+	case IB_SRQ_MAX_WR:
+		/* SRQ resize is not supported */
+		break;
+	case IB_SRQ_LIMIT:
+		/* Change the SRQ threshold */
+		if (srq_attr->srq_limit > srq->qplib_srq.max_wqe)
+			return -EINVAL;
+
+		srq->qplib_srq.threshold = srq_attr->srq_limit;
+		rc = bnxt_qplib_modify_srq(&rdev->qplib_res, &srq->qplib_srq);
+		if (rc) {
+			dev_err(rdev_to_dev(rdev), "Modify HW SRQ failed!");
+			return rc;
+		}
+		/* On success, update the shadow */
+		srq->srq_limit = srq_attr->srq_limit;
+		/* No need to Build and send response back to udata */
+		break;
+	default:
+		dev_err(rdev_to_dev(rdev),
+			"Unsupported srq_attr_mask 0x%x", srq_attr_mask);
+		return -EINVAL;
+	}
+	return 0;
+}
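
The only attribute that can actually change here is the limit (resize is a no-op). A hypothetical caller arms the threshold as sketched below; the value 16 is illustrative.

	/* Illustrative only: arm a limit so the consumer receives
	 * IB_EVENT_SRQ_LIMIT_REACHED once SRQ occupancy drops below it.
	 */
	static int example_arm_srq_limit(struct ib_srq *srq)
	{
		struct ib_srq_attr attr = { .srq_limit = 16 };

		return ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);
	}
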
+
+int bnxt_re_query_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr)
+{
+	struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq,
+					       ib_srq);
+	struct bnxt_re_srq tsrq;
+	struct bnxt_re_dev *rdev = srq->rdev;
+	int rc;
+
+	/* Get live SRQ attr */
+	tsrq.qplib_srq.id = srq->qplib_srq.id;
+	rc = bnxt_qplib_query_srq(&rdev->qplib_res, &tsrq.qplib_srq);
+	if (rc) {
+		dev_err(rdev_to_dev(rdev), "Query HW SRQ failed!");
+		return rc;
+	}
+	srq_attr->max_wr = srq->qplib_srq.max_wqe;
+	srq_attr->max_sge = srq->qplib_srq.max_sge;
+	srq_attr->srq_limit = tsrq.qplib_srq.threshold;
+
+	return 0;
+}
+
+int bnxt_re_post_srq_recv(struct ib_srq *ib_srq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr)
+{
+	struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq,
+					       ib_srq);
+	struct bnxt_qplib_swqe wqe;
+	unsigned long flags;
+	int rc = 0, payload_sz = 0;
+
+	spin_lock_irqsave(&srq->lock, flags);
+	while (wr) {
+		/* Transcribe each ib_recv_wr to qplib_swqe */
+		wqe.num_sge = wr->num_sge;
+		payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list,
+					       wr->num_sge);
+		wqe.wr_id = wr->wr_id;
+		wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV;
+
+		rc = bnxt_qplib_post_srq_recv(&srq->qplib_srq, &wqe);
+		if (rc) {
+			*bad_wr = wr;
+			break;
+		}
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return rc;
+}
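
And the consumer-side counterpart of the post path above, again a minimal sketch under the assumption that addr/len/lkey come from a previously registered MR:

	/* Illustrative only: replenish one receive buffer to the SRQ. */
	static int example_post_srq_recv(struct ib_srq *srq, u64 addr,
					 u32 len, u32 lkey)
	{
		struct ib_sge sge = { .addr = addr, .length = len, .lkey = lkey };
		struct ib_recv_wr wr = { .wr_id = addr, .sg_list = &sge,
					 .num_sge = 1 };
		struct ib_recv_wr *bad_wr;

		return ib_post_srq_recv(srq, &wr, &bad_wr);
	}
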
 static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev,
 				    struct bnxt_re_qp *qp1_qp,
 				    int qp_attr_mask)
@@ -2295,10 +2542,14 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr,
 /* Completion Queues */
 int bnxt_re_destroy_cq(struct ib_cq *ib_cq)
 {
-	struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq);
-	struct bnxt_re_dev *rdev = cq->rdev;
 	int rc;
-	struct bnxt_qplib_nq *nq = cq->qplib_cq.nq;
+	struct bnxt_re_cq *cq;
+	struct bnxt_qplib_nq *nq;
+	struct bnxt_re_dev *rdev;
+
+	cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq);
+	rdev = cq->rdev;
+	nq = cq->qplib_cq.nq;
 
 	rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
 	if (rc) {
@@ -2308,12 +2559,11 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq)
 	if (!IS_ERR_OR_NULL(cq->umem))
 		ib_umem_release(cq->umem);
 
-	if (cq) {
-		kfree(cq->cql);
-		kfree(cq);
-	}
 	atomic_dec(&rdev->cq_count);
 	nq->budget--;
+	kfree(cq->cql);
+	kfree(cq);
+
 	return 0;
 }
 
@@ -3078,7 +3328,8 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags)
 
 	mr->qplib_mr.hwq.level = PBL_LVL_MAX;
 	mr->qplib_mr.total_size = -1; /* Infinite length */
-	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false);
+	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false,
+			       PAGE_SIZE);
 	if (rc)
 		goto fail_mr;
 
@@ -3104,10 +3355,8 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr)
 	int rc;
 
 	rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
-	if (rc) {
+	if (rc)
 		dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc);
-		return rc;
-	}
 
 	if (mr->pages) {
 		rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res,
@@ -3170,7 +3419,7 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
 
 	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
 	if (rc)
-		goto fail;
+		goto bail;
 
 	mr->ib_mr.lkey = mr->qplib_mr.lkey;
 	mr->ib_mr.rkey = mr->ib_mr.lkey;
@@ -3192,9 +3441,10 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
 	return &mr->ib_mr;
 
 fail_mr:
-	bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
-fail:
 	kfree(mr->pages);
+fail:
+	bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
+bail:
 	kfree(mr);
 	return ERR_PTR(rc);
 }
@@ -3248,6 +3498,46 @@ int bnxt_re_dealloc_mw(struct ib_mw *ib_mw)
 	return rc;
 }
 
+static int bnxt_re_page_size_ok(int page_shift)
+{
+	switch (page_shift) {
+	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K:
+	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K:
+	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K:
+	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M:
+	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K:
+	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M:
+	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M:
+	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig,
+			     int page_shift)
+{
+	u64 *pbl_tbl = pbl_tbl_orig;
+	u64 paddr;
+	u64 page_mask = (1ULL << page_shift) - 1;
+	int i, pages;
+	struct scatterlist *sg;
+	int entry;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+		pages = sg_dma_len(sg) >> PAGE_SHIFT;
+		for (i = 0; i < pages; i++) {
+			paddr = sg_dma_address(sg) + (i << PAGE_SHIFT);
+			if (pbl_tbl == pbl_tbl_orig)
+				*pbl_tbl++ = paddr & ~page_mask;
+			else if ((paddr & page_mask) == 0)
+				*pbl_tbl++ = paddr;
+		}
+	}
+	return pbl_tbl - pbl_tbl_orig;
+}
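
The filter above is subtle: the umem SGL advances in PAGE_SIZE steps, but for a larger page_shift only the first entry (aligned down) and subsequent addresses aligned to that page size are recorded. A standalone sketch of the same logic, with a hypothetical array of 4K-stepped DMA addresses:

	/* Illustrative only: fill_umem_pbl_tbl()'s aligned-address filter in
	 * plain C. For page_shift = 21 (2M), 0x80000000 is kept, 0x80001000
	 * is skipped, 0x80200000 is kept, and so on.
	 */
	static int example_fill_pbl(const unsigned long long *dma, int n,
				    unsigned long long *pbl, int page_shift)
	{
		unsigned long long mask = (1ULL << page_shift) - 1;
		int i, out = 0;

		for (i = 0; i < n; i++) {
			if (out == 0)
				pbl[out++] = dma[i] & ~mask; /* first entry, aligned down */
			else if ((dma[i] & mask) == 0)
				pbl[out++] = dma[i];	     /* subsequent aligned pages */
		}
		return out;	/* number of PBL entries produced */
	}
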
+
 /* uverbs */
 struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 				  u64 virt_addr, int mr_access_flags,
@@ -3257,10 +3547,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 	struct bnxt_re_dev *rdev = pd->rdev;
 	struct bnxt_re_mr *mr;
 	struct ib_umem *umem;
-	u64 *pbl_tbl, *pbl_tbl_orig;
-	int i, umem_pgs, pages, rc;
-	struct scatterlist *sg;
-	int entry;
+	u64 *pbl_tbl = NULL;
+	int umem_pgs, page_shift, rc;
 
 	if (length > BNXT_RE_MAX_MR_SIZE) {
 		dev_err(rdev_to_dev(rdev), "MR Size: %lld > Max supported:%ld\n",
@@ -3277,64 +3565,68 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 	mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags);
 	mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR;
 
+	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
+	if (rc) {
+		dev_err(rdev_to_dev(rdev), "Failed to allocate MR");
+		goto free_mr;
+	}
+	/* The fixed portion of the rkey is the same as the lkey */
+	mr->ib_mr.rkey = mr->qplib_mr.rkey;
+
 	umem = ib_umem_get(ib_pd->uobject->context, start, length,
 			   mr_access_flags, 0);
 	if (IS_ERR(umem)) {
 		dev_err(rdev_to_dev(rdev), "Failed to get umem");
 		rc = -EFAULT;
-		goto free_mr;
+		goto free_mrw;
 	}
 	mr->ib_umem = umem;
 
-	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
-	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to allocate MR");
-		goto release_umem;
-	}
-	/* The fixed portion of the rkey is the same as the lkey */
-	mr->ib_mr.rkey = mr->qplib_mr.rkey;
-
 	mr->qplib_mr.va = virt_addr;
 	umem_pgs = ib_umem_page_count(umem);
 	if (!umem_pgs) {
 		dev_err(rdev_to_dev(rdev), "umem is invalid!");
 		rc = -EINVAL;
-		goto free_mrw;
+		goto free_umem;
 	}
 	mr->qplib_mr.total_size = length;
 
 	pbl_tbl = kcalloc(umem_pgs, sizeof(u64 *), GFP_KERNEL);
 	if (!pbl_tbl) {
-		rc = -EINVAL;
-		goto free_mrw;
+		rc = -ENOMEM;
+		goto free_umem;
 	}
-	pbl_tbl_orig = pbl_tbl;
 
-	if (umem->hugetlb) {
-		dev_err(rdev_to_dev(rdev), "umem hugetlb not supported!");
+	page_shift = umem->page_shift;
+
+	if (!bnxt_re_page_size_ok(page_shift)) {
+		dev_err(rdev_to_dev(rdev), "umem page size unsupported!");
 		rc = -EFAULT;
 		goto fail;
 	}
 
-	if (umem->page_shift != PAGE_SHIFT) {
-		dev_err(rdev_to_dev(rdev), "umem page shift unsupported!");
-		rc = -EFAULT;
+	if (!umem->hugetlb && length > BNXT_RE_MAX_MR_SIZE_LOW) {
+		dev_err(rdev_to_dev(rdev), "Requested MR Sz:%llu Max sup:%llu",
+			length,	(u64)BNXT_RE_MAX_MR_SIZE_LOW);
+		rc = -EINVAL;
 		goto fail;
 	}
-	/* Map umem buf ptrs to the PBL */
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		pages = sg_dma_len(sg) >> umem->page_shift;
-		for (i = 0; i < pages; i++, pbl_tbl++)
-			*pbl_tbl = sg_dma_address(sg) + (i << umem->page_shift);
+	if (umem->hugetlb && length > BNXT_RE_PAGE_SIZE_2M) {
+		page_shift = BNXT_RE_PAGE_SHIFT_2M;
+		dev_warn(rdev_to_dev(rdev), "umem hugetlb set page_size %x",
+			 1 << page_shift);
 	}
-	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl_orig,
-			       umem_pgs, false);
+
+	/* Map umem buf ptrs to the PBL */
+	umem_pgs = fill_umem_pbl_tbl(umem, pbl_tbl, page_shift);
+	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl,
+			       umem_pgs, false, 1 << page_shift);
 	if (rc) {
 		dev_err(rdev_to_dev(rdev), "Failed to register user MR");
 		goto fail;
 	}
 
-	kfree(pbl_tbl_orig);
+	kfree(pbl_tbl);
 
 	mr->ib_mr.lkey = mr->qplib_mr.lkey;
 	mr->ib_mr.rkey = mr->qplib_mr.lkey;
@@ -3342,11 +3634,11 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 
 	return &mr->ib_mr;
 fail:
-	kfree(pbl_tbl_orig);
+	kfree(pbl_tbl);
+free_umem:
+	ib_umem_release(umem);
 free_mrw:
 	bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
-release_umem:
-	ib_umem_release(umem);
 free_mr:
 	kfree(mr);
 	return ERR_PTR(rc);

+ 20 - 0
drivers/infiniband/hw/bnxt_re/ib_verbs.h

@@ -68,6 +68,15 @@ struct bnxt_re_ah {
 	struct bnxt_qplib_ah	qplib_ah;
 };
 
+struct bnxt_re_srq {
+	struct bnxt_re_dev	*rdev;
+	u32			srq_limit;
+	struct ib_srq		ib_srq;
+	struct bnxt_qplib_srq	qplib_srq;
+	struct ib_umem		*umem;
+	spinlock_t		lock;		/* protect srq */
+};
+
 struct bnxt_re_qp {
 	struct list_head	list;
 	struct bnxt_re_dev	*rdev;
@@ -143,6 +152,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 		       struct ib_port_attr *port_attr);
 int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
 			       struct ib_port_immutable *immutable);
+void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str);
 int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,
 		       u16 index, u16 *pkey);
 int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num,
@@ -164,6 +174,16 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd,
 int bnxt_re_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 int bnxt_re_destroy_ah(struct ib_ah *ah);
+struct ib_srq *bnxt_re_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *srq_init_attr,
+				  struct ib_udata *udata);
+int bnxt_re_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr,
+		       enum ib_srq_attr_mask srq_attr_mask,
+		       struct ib_udata *udata);
+int bnxt_re_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int bnxt_re_destroy_srq(struct ib_srq *srq);
+int bnxt_re_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr,
+			  struct ib_recv_wr **bad_recv_wr);
 struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd,
 				struct ib_qp_init_attr *qp_init_attr,
 				struct ib_udata *udata);

+ 209 - 42
drivers/infiniband/hw/bnxt_re/main.c

@@ -80,6 +80,79 @@ static DEFINE_MUTEX(bnxt_re_dev_lock);
 static struct workqueue_struct *bnxt_re_wq;
 static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait);
 
+/* SR-IOV helper functions */
+
+static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev)
+{
+	struct bnxt *bp;
+
+	bp = netdev_priv(rdev->en_dev->net);
+	if (BNXT_VF(bp))
+		rdev->is_virtfn = 1;
+}
+
+/* Set the maximum number of each resource that the driver actually wants
+ * to allocate. This may be up to the maximum number the firmware has
+ * reserved for the function. The driver may choose to allocate fewer
+ * resources than the firmware maximum.
+ */
+static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev)
+{
+	u32 vf_qps = 0, vf_srqs = 0, vf_cqs = 0, vf_mrws = 0, vf_gids = 0;
+	u32 i;
+	u32 vf_pct;
+	u32 num_vfs;
+	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
+
+	rdev->qplib_ctx.qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT,
+					  dev_attr->max_qp);
+
+	rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT_256K;
+	/* Use max_mr from fw since max_mrw does not get set */
+	rdev->qplib_ctx.mrw_count = min_t(u32, rdev->qplib_ctx.mrw_count,
+					  dev_attr->max_mr);
+	rdev->qplib_ctx.srqc_count = min_t(u32, BNXT_RE_MAX_SRQC_COUNT,
+					   dev_attr->max_srq);
+	rdev->qplib_ctx.cq_count = min_t(u32, BNXT_RE_MAX_CQ_COUNT,
+					 dev_attr->max_cq);
+
+	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++)
+		rdev->qplib_ctx.tqm_count[i] =
+		rdev->dev_attr.tqm_alloc_reqs[i];
+
+	if (rdev->num_vfs) {
+		/*
+		 * Reserve a set of resources for the PF. Divide the remaining
+		 * resources among the VFs
+		 */
+		vf_pct = 100 - BNXT_RE_PCT_RSVD_FOR_PF;
+		num_vfs = 100 * rdev->num_vfs;
+		vf_qps = (rdev->qplib_ctx.qpc_count * vf_pct) / num_vfs;
+		vf_srqs = (rdev->qplib_ctx.srqc_count * vf_pct) / num_vfs;
+		vf_cqs = (rdev->qplib_ctx.cq_count * vf_pct) / num_vfs;
+		/*
+		 * The driver allows many more MRs than other resources. If the
+		 * firmware does also, then reserve a fixed amount for the PF
+		 * and divide the rest among VFs. VFs may use many MRs for NFS
+		 * mounts, ISER, NVME applications, etc. If the firmware
+		 * severely restricts the number of MRs, then let PF have
+		 * half and divide the rest among VFs, as for the other
+		 * resource types.
+		 */
+		if (rdev->qplib_ctx.mrw_count < BNXT_RE_MAX_MRW_COUNT_64K)
+			vf_mrws = rdev->qplib_ctx.mrw_count * vf_pct / num_vfs;
+		else
+			vf_mrws = (rdev->qplib_ctx.mrw_count -
+				   BNXT_RE_RESVD_MR_FOR_PF) / rdev->num_vfs;
+		vf_gids = BNXT_RE_MAX_GID_PER_VF;
+	}
+	rdev->qplib_ctx.vf_res.max_mrw_per_vf = vf_mrws;
+	rdev->qplib_ctx.vf_res.max_gid_per_vf = vf_gids;
+	rdev->qplib_ctx.vf_res.max_qp_per_vf = vf_qps;
+	rdev->qplib_ctx.vf_res.max_srq_per_vf = vf_srqs;
+	rdev->qplib_ctx.vf_res.max_cq_per_vf = vf_cqs;
+}
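
To make the split above concrete, here is the vf_qps formula as a standalone helper with hypothetical inputs. The exact value of BNXT_RE_PCT_RSVD_FOR_PF lives in bnxt_re.h and is not shown in this diff, so the 10% below is an assumption for illustration.

	/* Illustrative only: per-VF share after reserving a PF percentage.
	 * e.g. total = 65536 QPCs, pct_rsvd_for_pf = 10, num_vfs = 4
	 * -> (65536 * 90) / 400 = 14745 QPs per VF; the rest stays with the PF.
	 */
	static unsigned int example_vf_share(unsigned int total,
					     unsigned int pct_rsvd_for_pf,
					     unsigned int num_vfs)
	{
		return (total * (100 - pct_rsvd_for_pf)) / (100 * num_vfs);
	}
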
+
 /* for handling bnxt_en callbacks later */
 static void bnxt_re_stop(void *p)
 {
@@ -91,6 +164,15 @@ static void bnxt_re_start(void *p)
 
 static void bnxt_re_sriov_config(void *p, int num_vfs)
 {
+	struct bnxt_re_dev *rdev = p;
+
+	if (!rdev)
+		return;
+
+	rdev->num_vfs = num_vfs;
+	bnxt_re_set_resource_limits(rdev);
+	bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw,
+				      &rdev->qplib_ctx);
 }
 }
 
 static void bnxt_re_shutdown(void *p)
@@ -417,7 +499,7 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev)
 		return ERR_PTR(-EINVAL);
 
 	if (!(en_dev->flags & BNXT_EN_FLAG_ROCE_CAP)) {
-		dev_dbg(&pdev->dev,
+		dev_info(&pdev->dev,
 			"%s: probe error: RoCE is not supported on this device",
 			ROCE_DRV_MODULE_NAME);
 		return ERR_PTR(-ENODEV);
@@ -490,6 +572,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 
 	ibdev->query_port		= bnxt_re_query_port;
 	ibdev->get_port_immutable	= bnxt_re_get_port_immutable;
+	ibdev->get_dev_fw_str           = bnxt_re_query_fw_str;
 	ibdev->query_pkey		= bnxt_re_query_pkey;
 	ibdev->query_gid		= bnxt_re_query_gid;
 	ibdev->get_netdev		= bnxt_re_get_netdev;
@@ -505,6 +588,12 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 	ibdev->query_ah			= bnxt_re_query_ah;
 	ibdev->destroy_ah		= bnxt_re_destroy_ah;
 
+	ibdev->create_srq		= bnxt_re_create_srq;
+	ibdev->modify_srq		= bnxt_re_modify_srq;
+	ibdev->query_srq		= bnxt_re_query_srq;
+	ibdev->destroy_srq		= bnxt_re_destroy_srq;
+	ibdev->post_srq_recv		= bnxt_re_post_srq_recv;
+
 	ibdev->create_qp		= bnxt_re_create_qp;
 	ibdev->modify_qp		= bnxt_re_modify_qp;
 	ibdev->query_qp			= bnxt_re_query_qp;
@@ -541,14 +630,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr,
 	return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor);
 }
 
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
-			   char *buf)
-{
-	struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->dev_attr.fw_ver);
-}
-
 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
 			char *buf)
 {
@@ -558,12 +639,10 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr,
 }
 
 static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL);
-static DEVICE_ATTR(fw_rev, 0444, show_fw_ver, NULL);
 static DEVICE_ATTR(hca_type, 0444, show_hca, NULL);
 
 static struct device_attribute *bnxt_re_attributes[] = {
 	&dev_attr_hw_rev,
-	&dev_attr_fw_rev,
 	&dev_attr_hca_type
 };
 
@@ -616,10 +695,10 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
 	return rdev;
 }
 
-static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw,
-			       struct creq_func_event *aeqe)
+static int bnxt_re_handle_unaffi_async_event(struct creq_func_event
+					     *unaffi_async)
 {
-	switch (aeqe->event) {
+	switch (unaffi_async->event) {
 	case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR:
 		break;
 	case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR:
@@ -648,6 +727,93 @@ static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw,
 	return 0;
 }
 
+static int bnxt_re_handle_qp_async_event(struct creq_qp_event *qp_event,
+					 struct bnxt_re_qp *qp)
+{
+	struct ib_event event;
+
+	memset(&event, 0, sizeof(event));
+	if (qp->qplib_qp.srq) {
+		event.device = &qp->rdev->ibdev;
+		event.element.qp = &qp->ib_qp;
+		event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+	}
+
+	if (event.device && qp->ib_qp.event_handler)
+		qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
+
+	return 0;
+}
+
+static int bnxt_re_handle_affi_async_event(struct creq_qp_event *affi_async,
+					   void *obj)
+{
+	int rc = 0;
+	u8 event;
+
+	if (!obj)
+		return rc; /* QP was already dead, still return success */
+
+	event = affi_async->event;
+	if (event == CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION) {
+		struct bnxt_qplib_qp *lib_qp = obj;
+		struct bnxt_re_qp *qp = container_of(lib_qp, struct bnxt_re_qp,
+						     qplib_qp);
+		rc = bnxt_re_handle_qp_async_event(affi_async, qp);
+	}
+	return rc;
+}
+
+static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw,
+			       void *aeqe, void *obj)
+{
+	struct creq_qp_event *affi_async;
+	struct creq_func_event *unaffi_async;
+	u8 type;
+	int rc;
+
+	type = ((struct creq_base *)aeqe)->type;
+	if (type == CREQ_BASE_TYPE_FUNC_EVENT) {
+		unaffi_async = aeqe;
+		rc = bnxt_re_handle_unaffi_async_event(unaffi_async);
+	} else {
+		affi_async = aeqe;
+		rc = bnxt_re_handle_affi_async_event(affi_async, obj);
+	}
+
+	return rc;
+}
+
+static int bnxt_re_srqn_handler(struct bnxt_qplib_nq *nq,
+				struct bnxt_qplib_srq *handle, u8 event)
+{
+	struct bnxt_re_srq *srq = container_of(handle, struct bnxt_re_srq,
+					       qplib_srq);
+	struct ib_event ib_event;
+	int rc = 0;
+
+	if (!srq) {
+		dev_err(NULL, "%s: SRQ is NULL, SRQN not handled",
+			ROCE_DRV_MODULE_NAME);
+		rc = -EINVAL;
+		goto done;
+	}
+	ib_event.device = &srq->rdev->ibdev;
+	ib_event.element.srq = &srq->ib_srq;
+	if (event == NQ_SRQ_EVENT_EVENT_SRQ_THRESHOLD_EVENT)
+		ib_event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+	else
+		ib_event.event = IB_EVENT_SRQ_ERR;
+
+	if (srq->ib_srq.event_handler) {
+		/* Lock event_handler? */
+		(*srq->ib_srq.event_handler)(&ib_event,
+					     srq->ib_srq.srq_context);
+	}
+done:
+	return rc;
+}
+
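
The dispatch above hands SRQ events to whatever handler the consumer registered. For reference, a hypothetical ULP-side counterpart (illustrative only; a real consumer would pass this via ib_srq_init_attr.event_handler):

	static void example_srq_event_handler(struct ib_event *event, void *ctx)
	{
		switch (event->event) {
		case IB_EVENT_SRQ_LIMIT_REACHED:
			/* occupancy fell below the armed threshold:
			 * replenish buffers and re-arm via ib_modify_srq()
			 */
			break;
		case IB_EVENT_SRQ_ERR:
			/* SRQ moved to error; tear down dependent QPs */
			break;
		default:
			break;
		}
	}
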
 static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq,
 			       struct bnxt_qplib_cq *handle)
 {
@@ -690,7 +856,8 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
 		rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1],
 					  i - 1, rdev->msix_entries[i].vector,
 					  rdev->msix_entries[i].db_offset,
-					  &bnxt_re_cqn_handler, NULL);
+					  &bnxt_re_cqn_handler,
+					  &bnxt_re_srqn_handler);
 
 		if (rc) {
 			dev_err(rdev_to_dev(rdev),
@@ -734,7 +901,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 
 	/* Configure and allocate resources for qplib */
 	rdev->qplib_res.rcfw = &rdev->rcfw;
-	rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr);
+	rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr,
+				     rdev->is_virtfn);
 	if (rc)
 		goto fail;
 
@@ -1035,19 +1203,6 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait)
 	}
 }
 
-static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev)
-{
-	u32 i;
-
-	rdev->qplib_ctx.qpc_count = BNXT_RE_MAX_QPC_COUNT;
-	rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT;
-	rdev->qplib_ctx.srqc_count = BNXT_RE_MAX_SRQC_COUNT;
-	rdev->qplib_ctx.cq_count = BNXT_RE_MAX_CQ_COUNT;
-	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++)
-		rdev->qplib_ctx.tqm_count[i] =
-		rdev->dev_attr.tqm_alloc_reqs[i];
-}
-
 /* worker thread for polling periodic events. Now used for QoS programming*/
 static void bnxt_re_worker(struct work_struct *work)
 {
@@ -1070,6 +1225,9 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	}
 	set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
 
+	/* Check whether VF or PF */
+	bnxt_re_get_sriov_func_type(rdev);
+
 	rc = bnxt_re_request_msix(rdev);
 	if (rc) {
 		pr_err("Failed to get MSI-X vectors: %#x\n", rc);
@@ -1101,16 +1259,18 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 				(rdev->en_dev->pdev, &rdev->rcfw,
 				 rdev->msix_entries[BNXT_RE_AEQ_IDX].vector,
 				 rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset,
-				 0, &bnxt_re_aeq_handler);
+				 rdev->is_virtfn, &bnxt_re_aeq_handler);
 	if (rc) {
 		pr_err("Failed to enable RCFW channel: %#x\n", rc);
 		goto free_ring;
 	}
 
-	rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr);
+	rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr,
+				     rdev->is_virtfn);
 	if (rc)
 		goto disable_rcfw;
-	bnxt_re_set_resource_limits(rdev);
+	if (!rdev->is_virtfn)
+		bnxt_re_set_resource_limits(rdev);
 
 	rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0);
 	if (rc) {
@@ -1125,7 +1285,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 		goto free_ctx;
 	}
 
-	rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx, 0);
+	rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx,
+				  rdev->is_virtfn);
 	if (rc) {
 		pr_err("Failed to initialize RCFW: %#x\n", rc);
 		goto free_sctx;
@@ -1144,13 +1305,15 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 		goto fail;
 	}
 
-	rc = bnxt_re_setup_qos(rdev);
-	if (rc)
-		pr_info("RoCE priority not yet configured\n");
+	if (!rdev->is_virtfn) {
+		rc = bnxt_re_setup_qos(rdev);
+		if (rc)
+			pr_info("RoCE priority not yet configured\n");
 
-	INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker);
-	set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags);
-	schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
+		INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker);
+		set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags);
+		schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
+	}
 
 	/* Register ib dev */
 	rc = bnxt_re_register_ib(rdev);
@@ -1176,6 +1339,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
 	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
 			 &rdev->active_width);
+	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);
 	bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE);
 	bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_GID_CHANGE);
 
@@ -1400,7 +1564,7 @@ err_netdev:
 
 static void __exit bnxt_re_mod_exit(void)
 {
-	struct bnxt_re_dev *rdev;
+	struct bnxt_re_dev *rdev, *next;
 	LIST_HEAD(to_be_deleted);
 
 	mutex_lock(&bnxt_re_dev_lock);
@@ -1408,8 +1572,11 @@ static void __exit bnxt_re_mod_exit(void)
 	if (!list_empty(&bnxt_re_dev_list))
 		list_splice_init(&bnxt_re_dev_list, &to_be_deleted);
 	mutex_unlock(&bnxt_re_dev_lock);
-
-	list_for_each_entry(rdev, &to_be_deleted, list) {
+	/*
+	 * Cleanup the devices in reverse order so that the VF device
+	 * cleanup is done before PF cleanup
+	 */
+	list_for_each_entry_safe_reverse(rdev, next, &to_be_deleted, list) {
 		dev_info(rdev_to_dev(rdev), "Unregistering Device");
 		bnxt_re_dev_stop(rdev);
 		bnxt_re_ib_unreg(rdev, true);

+ 400 - 63
drivers/infiniband/hw/bnxt_re/qplib_fp.c

@@ -52,6 +52,7 @@
 
 static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq);
 static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp);
+static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type);
 
 static void bnxt_qplib_cancel_phantom_processing(struct bnxt_qplib_qp *qp)
 {
@@ -278,6 +279,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 	struct nq_base *nqe, **nq_ptr;
 	struct bnxt_qplib_cq *cq;
 	int num_cqne_processed = 0;
+	int num_srqne_processed = 0;
 	u32 sw_cons, raw_cons;
 	u16 type;
 	int budget = nq->budget;
@@ -320,6 +322,26 @@ static void bnxt_qplib_service_nq(unsigned long data)
 			spin_unlock_bh(&cq->compl_lock);
 			break;
 		}
+		case NQ_BASE_TYPE_SRQ_EVENT:
+		{
+			struct nq_srq_event *nqsrqe =
+						(struct nq_srq_event *)nqe;
+
+			q_handle = le32_to_cpu(nqsrqe->srq_handle_low);
+			q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high)
+				     << 32;
+			bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle,
+					   DBR_DBR_TYPE_SRQ_ARMENA);
+			if (!nq->srqn_handler(nq,
+					      (struct bnxt_qplib_srq *)q_handle,
+					      nqsrqe->event))
+				num_srqne_processed++;
+			else
+				dev_warn(&nq->pdev->dev,
+					 "QPLIB: SRQ event 0x%x not handled",
+					 nqsrqe->event);
+			break;
+		}
 		case NQ_BASE_TYPE_DBQ_EVENT:
 			break;
 		default:
@@ -384,17 +406,19 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 			 int (*cqn_handler)(struct bnxt_qplib_nq *nq,
 					    struct bnxt_qplib_cq *),
 			 int (*srqn_handler)(struct bnxt_qplib_nq *nq,
-					     void *, u8 event))
+					     struct bnxt_qplib_srq *,
+					     u8 event))
 {
 	resource_size_t nq_base;
 	int rc = -1;
 
 	nq->pdev = pdev;
 	nq->vector = msix_vector;
+	if (cqn_handler)
+		nq->cqn_handler = cqn_handler;
 
-	nq->cqn_handler = cqn_handler;
-
-	nq->srqn_handler = srqn_handler;
+	if (srqn_handler)
+		nq->srqn_handler = srqn_handler;
 
 	tasklet_init(&nq->worker, bnxt_qplib_service_nq, (unsigned long)nq);
 
@@ -410,7 +434,6 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 	if (rc) {
 		dev_err(&nq->pdev->dev,
 			"Failed to request IRQ for NQ: %#x", rc);
-		bnxt_qplib_disable_nq(nq);
 		goto fail;
 	}
 
@@ -469,6 +492,238 @@ int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
 	return 0;
 }
 
+/* SRQ */
+static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type)
+{
+	struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
+	struct dbr_dbr db_msg = { 0 };
+	void __iomem *db;
+	u32 sw_prod = 0;
+
+	/* Ring DB */
+	sw_prod = (arm_type == DBR_DBR_TYPE_SRQ_ARM) ? srq->threshold :
+		   HWQ_CMP(srq_hwq->prod, srq_hwq);
+	db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) &
+				   DBR_DBR_INDEX_MASK);
+	db_msg.type_xid = cpu_to_le32(((srq->id << DBR_DBR_XID_SFT) &
+					DBR_DBR_XID_MASK) | arm_type);
+	db = (arm_type == DBR_DBR_TYPE_SRQ_ARMENA) ?
+		srq->dbr_base : srq->dpi->dbr;
+	wmb(); /* barrier before db ring */
+	__iowrite64_copy(db, &db_msg, sizeof(db_msg) / sizeof(u64));
+}
+
+int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
+			   struct bnxt_qplib_srq *srq)
+{
+	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+	struct cmdq_destroy_srq req;
+	struct creq_destroy_srq_resp resp;
+	u16 cmd_flags = 0;
+	int rc;
+
+	RCFW_CMD_PREP(req, DESTROY_SRQ, cmd_flags);
+
+	/* Configure the request */
+	req.srq_cid = cpu_to_le32(srq->id);
+
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+					  (void *)&resp, NULL, 0);
+	if (rc)
+		return rc;
+
+	bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
+	kfree(srq->swq);
+	return 0;
+}
+
+int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
+			  struct bnxt_qplib_srq *srq)
+{
+	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+	struct cmdq_create_srq req;
+	struct creq_create_srq_resp resp;
+	struct bnxt_qplib_pbl *pbl;
+	u16 cmd_flags = 0;
+	int rc, idx;
+
+	srq->hwq.max_elements = srq->max_wqe;
+	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, srq->sglist,
+				       srq->nmap, &srq->hwq.max_elements,
+				       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
+				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+	if (rc)
+		goto exit;
+
+	srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq),
+			   GFP_KERNEL);
+	if (!srq->swq)
+		goto fail;
+
+	RCFW_CMD_PREP(req, CREATE_SRQ, cmd_flags);
+
+	/* Configure the request */
+	req.dpi = cpu_to_le32(srq->dpi->dpi);
+	req.srq_handle = cpu_to_le64(srq);
+
+	req.srq_size = cpu_to_le16((u16)srq->hwq.max_elements);
+	pbl = &srq->hwq.pbl[PBL_LVL_0];
+	req.pg_size_lvl = cpu_to_le16((((u16)srq->hwq.level &
+				      CMDQ_CREATE_SRQ_LVL_MASK) <<
+				      CMDQ_CREATE_SRQ_LVL_SFT) |
+				      (pbl->pg_size == ROCE_PG_SIZE_4K ?
+				       CMDQ_CREATE_SRQ_PG_SIZE_PG_4K :
+				       pbl->pg_size == ROCE_PG_SIZE_8K ?
+				       CMDQ_CREATE_SRQ_PG_SIZE_PG_8K :
+				       pbl->pg_size == ROCE_PG_SIZE_64K ?
+				       CMDQ_CREATE_SRQ_PG_SIZE_PG_64K :
+				       pbl->pg_size == ROCE_PG_SIZE_2M ?
+				       CMDQ_CREATE_SRQ_PG_SIZE_PG_2M :
+				       pbl->pg_size == ROCE_PG_SIZE_8M ?
+				       CMDQ_CREATE_SRQ_PG_SIZE_PG_8M :
+				       pbl->pg_size == ROCE_PG_SIZE_1G ?
+				       CMDQ_CREATE_SRQ_PG_SIZE_PG_1G :
+				       CMDQ_CREATE_SRQ_PG_SIZE_PG_4K));
+	req.pbl = cpu_to_le64(pbl->pg_map_arr[0]);
+	req.pd_id = cpu_to_le32(srq->pd->id);
+	req.eventq_id = cpu_to_le16(srq->eventq_hw_ring_id);
+
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+					  (void *)&resp, NULL, 0);
+	if (rc)
+		goto fail;
+
+	spin_lock_init(&srq->lock);
+	srq->start_idx = 0;
+	srq->last_idx = srq->hwq.max_elements - 1;
+	for (idx = 0; idx < srq->hwq.max_elements; idx++)
+		srq->swq[idx].next_idx = idx + 1;
+	srq->swq[srq->last_idx].next_idx = -1;
+
+	srq->id = le32_to_cpu(resp.xid);
+	srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
+	if (srq->threshold)
+		bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARMENA);
+	srq->arm_req = false;
+
+	return 0;
+fail:
+	bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
+	kfree(srq->swq);
+exit:
+	return rc;
+}
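
The swq array initialized above forms a singly linked free list: start_idx/last_idx bracket the chain, post pops from the head, and completion release appends at the tail. A minimal model of that discipline (illustrative only, names are hypothetical):

	struct slot { int next_idx; };

	/* mirrors the pop in bnxt_qplib_post_srq_recv(); "full" means only
	 * the sentinel slot at last_idx remains
	 */
	static int alloc_slot(struct slot *swq, int *start_idx, int last_idx)
	{
		int next = *start_idx;

		if (next == last_idx)
			return -1;
		*start_idx = swq[next].next_idx;
		return next;
	}

	/* mirrors bnxt_qplib_release_srqe(): append the completed tag */
	static void release_slot(struct slot *swq, int *last_idx, int tag)
	{
		swq[*last_idx].next_idx = tag;
		*last_idx = tag;
		swq[tag].next_idx = -1;
	}
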
+
+int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
+			  struct bnxt_qplib_srq *srq)
+{
+	struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
+	u32 sw_prod, sw_cons, count = 0;
+
+	sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq);
+	sw_cons = HWQ_CMP(srq_hwq->cons, srq_hwq);
+
+	count = sw_prod > sw_cons ? sw_prod - sw_cons :
+				    srq_hwq->max_elements - sw_cons + sw_prod;
+	if (count > srq->threshold) {
+		srq->arm_req = false;
+		bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM);
+	} else {
+		/* Deferred arming */
+		srq->arm_req = true;
+	}
+
+	return 0;
+}
+
+int bnxt_qplib_query_srq(struct bnxt_qplib_res *res,
+			 struct bnxt_qplib_srq *srq)
+{
+	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+	struct cmdq_query_srq req;
+	struct creq_query_srq_resp resp;
+	struct bnxt_qplib_rcfw_sbuf *sbuf;
+	struct creq_query_srq_resp_sb *sb;
+	u16 cmd_flags = 0;
+	int rc = 0;
+
+	RCFW_CMD_PREP(req, QUERY_SRQ, cmd_flags);
+	req.srq_cid = cpu_to_le32(srq->id);
+
+	/* Configure the request */
+	sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
+	if (!sbuf)
+		return -ENOMEM;
+	sb = sbuf->sb;
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+					  (void *)sbuf, 0);
+	srq->threshold = le16_to_cpu(sb->srq_limit);
+	bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf);
+
+	return rc;
+}
+
+int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
+			     struct bnxt_qplib_swqe *wqe)
+{
+	struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
+	struct rq_wqe *srqe, **srqe_ptr;
+	struct sq_sge *hw_sge;
+	u32 sw_prod, sw_cons, count = 0;
+	int i, rc = 0, next;
+
+	spin_lock(&srq_hwq->lock);
+	if (srq->start_idx == srq->last_idx) {
+		dev_err(&srq_hwq->pdev->dev, "QPLIB: FP: SRQ (0x%x) is full!",
+			srq->id);
+		rc = -EINVAL;
+		spin_unlock(&srq_hwq->lock);
+		goto done;
+	}
+	next = srq->start_idx;
+	srq->start_idx = srq->swq[next].next_idx;
+	spin_unlock(&srq_hwq->lock);
+
+	sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq);
+	srqe_ptr = (struct rq_wqe **)srq_hwq->pbl_ptr;
+	srqe = &srqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)];
+	memset(srqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
+	/* Calculate wqe_size16 and data_len */
+	for (i = 0, hw_sge = (struct sq_sge *)srqe->data;
+	     i < wqe->num_sge; i++, hw_sge++) {
+		hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr);
+		hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey);
+		hw_sge->size = cpu_to_le32(wqe->sg_list[i].size);
+	}
+	srqe->wqe_type = wqe->type;
+	srqe->flags = wqe->flags;
+	srqe->wqe_size = wqe->num_sge +
+			((offsetof(typeof(*srqe), data) + 15) >> 4);
+	srqe->wr_id[0] = cpu_to_le32((u32)next);
+	srq->swq[next].wr_id = wqe->wr_id;
+
+	srq_hwq->prod++;
+
+	spin_lock(&srq_hwq->lock);
+	sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq);
+	/* Retain srq_hwq->cons for this logic;
+	 * the lock is actually only required to
+	 * read srq_hwq->cons.
+	 */
+	sw_cons = HWQ_CMP(srq_hwq->cons, srq_hwq);
+	count = sw_prod > sw_cons ? sw_prod - sw_cons :
+				    srq_hwq->max_elements - sw_cons + sw_prod;
+	spin_unlock(&srq_hwq->lock);
+	/* Ring DB */
+	bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ);
+	if (srq->arm_req == true && count > srq->threshold) {
+		srq->arm_req = false;
+		bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM);
+	}
+done:
+	return rc;
+}
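
The occupancy test above measures producer/consumer distance on the circular HWQ and issues the deferred SRQ_ARM doorbell once the count climbs back over the armed threshold. The distance computation, pulled out as a standalone helper for clarity (illustrative only; it mirrors the exact expression used above):

	static unsigned int hwq_occupancy(unsigned int sw_prod,
					  unsigned int sw_cons,
					  unsigned int max_elements)
	{
		return sw_prod > sw_cons ? sw_prod - sw_cons
					 : max_elements - sw_cons + sw_prod;
	}
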
+
 /* QP */
 int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 {
@@ -737,6 +992,12 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 				 pbl->pg_size == ROCE_PG_SIZE_1G ?
 					CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G :
 				 CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K);
+	} else {
+		/* SRQ */
+		if (qp->srq) {
+			qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED;
+			req.srq_cid = cpu_to_le32(qp->srq->id);
+		}
 	}
 	}
 
 	if (qp->rcq)
 	return rc;
 	return rc;
 }
 }
 
 
+static void bnxt_qplib_release_srqe(struct bnxt_qplib_srq *srq, u32 tag)
+{
+	spin_lock(&srq->hwq.lock);
+	srq->swq[srq->last_idx].next_idx = (int)tag;
+	srq->last_idx = (int)tag;
+	srq->swq[srq->last_idx].next_idx = -1;
+	srq->hwq.cons++; /* Support for SRQE counter */
+	spin_unlock(&srq->hwq.lock);
+}
+
 static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 					struct cq_res_rc *hwcqe,
 					struct bnxt_qplib_cqe **pcqe,
@@ -2075,6 +2346,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 {
 	struct bnxt_qplib_qp *qp;
 	struct bnxt_qplib_q *rq;
+	struct bnxt_qplib_srq *srq;
 	struct bnxt_qplib_cqe *cqe;
 	u32 wr_id_idx;
 	int rc = 0;
@@ -2102,27 +2374,46 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 
 	wr_id_idx = le32_to_cpu(hwcqe->srq_or_rq_wr_id) &
 				CQ_RES_RC_SRQ_OR_RQ_WR_ID_MASK;
-	rq = &qp->rq;
-	if (wr_id_idx > rq->hwq.max_elements) {
-		dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process RC ");
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x",
-			wr_id_idx, rq->hwq.max_elements);
-		return -EINVAL;
-	}
-
-	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
-	cqe++;
-	(*budget)--;
-	rq->hwq.cons++;
-	*pcqe = cqe;
+	if (cqe->flags & CQ_RES_RC_FLAGS_SRQ_SRQ) {
+		srq = qp->srq;
+		if (!srq)
+			return -EINVAL;
+		if (wr_id_idx > srq->hwq.max_elements) {
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: FP: CQ Process RC ");
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				wr_id_idx, srq->hwq.max_elements);
+			return -EINVAL;
+		}
+		cqe->wr_id = srq->swq[wr_id_idx].wr_id;
+		bnxt_qplib_release_srqe(srq, wr_id_idx);
+		cqe++;
+		(*budget)--;
+		*pcqe = cqe;
+	} else {
+		rq = &qp->rq;
+		if (wr_id_idx > rq->hwq.max_elements) {
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: FP: CQ Process RC ");
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x",
+				wr_id_idx, rq->hwq.max_elements);
+			return -EINVAL;
+		}
+		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		cqe++;
+		(*budget)--;
+		rq->hwq.cons++;
+		*pcqe = cqe;
 
 
-	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
-		qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
-		 /* Add qp to flush list of the CQ */
-		bnxt_qplib_lock_buddy_cq(qp, cq);
-		__bnxt_qplib_add_flush_qp(qp);
-		bnxt_qplib_unlock_buddy_cq(qp, cq);
+		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
+			qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
+			/* Add qp to flush list of the CQ */
+			bnxt_qplib_lock_buddy_cq(qp, cq);
+			__bnxt_qplib_add_flush_qp(qp);
+			bnxt_qplib_unlock_buddy_cq(qp, cq);
+		}
 	}
 
 done:
@@ -2136,6 +2427,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 {
 	struct bnxt_qplib_qp *qp;
 	struct bnxt_qplib_q *rq;
+	struct bnxt_qplib_srq *srq;
 	struct bnxt_qplib_cqe *cqe;
 	u32 wr_id_idx;
 	int rc = 0;
@@ -2166,27 +2458,48 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 				  hwcqe->src_qp_high_srq_or_rq_wr_id) &
 				 CQ_RES_UD_SRC_QP_HIGH_MASK) >> 8);
 
-	rq = &qp->rq;
-	if (wr_id_idx > rq->hwq.max_elements) {
-		dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process UD ");
-		dev_err(&cq->hwq.pdev->dev,
-			"QPLIB: wr_id idx %#x exceeded RQ max %#x",
-			wr_id_idx, rq->hwq.max_elements);
-		return -EINVAL;
-	}
+	if (cqe->flags & CQ_RES_RC_FLAGS_SRQ_SRQ) {
+		srq = qp->srq;
+		if (!srq)
+			return -EINVAL;
 
 
-	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
-	cqe++;
-	(*budget)--;
-	rq->hwq.cons++;
-	*pcqe = cqe;
+		if (wr_id_idx > srq->hwq.max_elements) {
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: FP: CQ Process UD ");
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				wr_id_idx, srq->hwq.max_elements);
+			return -EINVAL;
+		}
+		cqe->wr_id = srq->swq[wr_id_idx].wr_id;
+		bnxt_qplib_release_srqe(srq, wr_id_idx);
+		cqe++;
+		(*budget)--;
+		*pcqe = cqe;
+	} else {
+		rq = &qp->rq;
+		if (wr_id_idx > rq->hwq.max_elements) {
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: FP: CQ Process UD ");
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x",
+				wr_id_idx, rq->hwq.max_elements);
+			return -EINVAL;
+		}
 
 
-	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
-		qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
-		/* Add qp to flush list of the CQ */
-		bnxt_qplib_lock_buddy_cq(qp, cq);
-		__bnxt_qplib_add_flush_qp(qp);
-		bnxt_qplib_unlock_buddy_cq(qp, cq);
+		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		cqe++;
+		(*budget)--;
+		rq->hwq.cons++;
+		*pcqe = cqe;
+
+		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
+			qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
+			/* Add qp to flush list of the CQ */
+			bnxt_qplib_lock_buddy_cq(qp, cq);
+			__bnxt_qplib_add_flush_qp(qp);
+			bnxt_qplib_unlock_buddy_cq(qp, cq);
+		}
 	}
 done:
 done:
 	return rc;
 	return rc;
@@ -2218,6 +2531,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq,
 {
 {
 	struct bnxt_qplib_qp *qp;
 	struct bnxt_qplib_qp *qp;
 	struct bnxt_qplib_q *rq;
 	struct bnxt_qplib_q *rq;
+	struct bnxt_qplib_srq *srq;
 	struct bnxt_qplib_cqe *cqe;
 	struct bnxt_qplib_cqe *cqe;
 	u32 wr_id_idx;
 	u32 wr_id_idx;
 	int rc = 0;
 	int rc = 0;
@@ -2256,26 +2570,49 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq,
 	cqe->raweth_qp1_flags2 = le32_to_cpu(hwcqe->raweth_qp1_flags2);
 	cqe->raweth_qp1_flags2 = le32_to_cpu(hwcqe->raweth_qp1_flags2);
 	cqe->raweth_qp1_metadata = le32_to_cpu(hwcqe->raweth_qp1_metadata);
 	cqe->raweth_qp1_metadata = le32_to_cpu(hwcqe->raweth_qp1_metadata);
 
 
-	rq = &qp->rq;
-	if (wr_id_idx > rq->hwq.max_elements) {
-		dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id ");
-		dev_err(&cq->hwq.pdev->dev, "QPLIB: ix 0x%x exceeded RQ max 0x%x",
-			wr_id_idx, rq->hwq.max_elements);
-		return -EINVAL;
-	}
-
-	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
-	cqe++;
-	(*budget)--;
-	rq->hwq.cons++;
-	*pcqe = cqe;
+	if (cqe->flags & CQ_RES_RAWETH_QP1_FLAGS_SRQ_SRQ) {
+		srq = qp->srq;
+		if (!srq) {
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: FP: SRQ used but not defined??");
+			return -EINVAL;
+		}
+		if (wr_id_idx > srq->hwq.max_elements) {
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: FP: CQ Process Raw/QP1 ");
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x",
+				wr_id_idx, srq->hwq.max_elements);
+			return -EINVAL;
+		}
+		cqe->wr_id = srq->swq[wr_id_idx].wr_id;
+		bnxt_qplib_release_srqe(srq, wr_id_idx);
+		cqe++;
+		(*budget)--;
+		*pcqe = cqe;
+	} else {
+		rq = &qp->rq;
+		if (wr_id_idx > rq->hwq.max_elements) {
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: FP: CQ Process Raw/QP1 RQ wr_id ");
+			dev_err(&cq->hwq.pdev->dev,
+				"QPLIB: ix 0x%x exceeded RQ max 0x%x",
+				wr_id_idx, rq->hwq.max_elements);
+			return -EINVAL;
+		}
+		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		cqe++;
+		(*budget)--;
+		rq->hwq.cons++;
+		*pcqe = cqe;
 
 
-	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
-		qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
-		/* Add qp to flush list of the CQ */
-		bnxt_qplib_lock_buddy_cq(qp, cq);
-		__bnxt_qplib_add_flush_qp(qp);
-		bnxt_qplib_unlock_buddy_cq(qp, cq);
+		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
+			qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
+			/* Add qp to flush list of the CQ */
+			bnxt_qplib_lock_buddy_cq(qp, cq);
+			__bnxt_qplib_add_flush_qp(qp);
+			bnxt_qplib_unlock_buddy_cq(qp, cq);
+		}
 	}
 	}
 
 
 done:
 done:

+ 54 - 24
drivers/infiniband/hw/bnxt_re/qplib_fp.h

@@ -39,6 +39,27 @@
 #ifndef __BNXT_QPLIB_FP_H__
 #define __BNXT_QPLIB_FP_H__
 
+struct bnxt_qplib_srq {
+	struct bnxt_qplib_pd		*pd;
+	struct bnxt_qplib_dpi		*dpi;
+	void __iomem			*dbr_base;
+	u64				srq_handle;
+	u32				id;
+	u32				max_wqe;
+	u32				max_sge;
+	u32				threshold;
+	bool				arm_req;
+	struct bnxt_qplib_cq		*cq;
+	struct bnxt_qplib_hwq		hwq;
+	struct bnxt_qplib_swq		*swq;
+	struct scatterlist		*sglist;
+	int				start_idx;
+	int				last_idx;
+	u32				nmap;
+	u16				eventq_hw_ring_id;
+	spinlock_t			lock; /* protect SRQE link list */
+};
+
 struct bnxt_qplib_sge {
 	u64				addr;
 	u32				lkey;
 	u32				lkey;
@@ -79,6 +100,7 @@ static inline u32 get_psne_idx(u32 val)
 
 struct bnxt_qplib_swq {
 	u64				wr_id;
+	int				next_idx;
 	u8				type;
 	u8				flags;
 	u32				start_psn;
@@ -404,29 +426,27 @@ struct bnxt_qplib_cq {
 	writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db)
 
 struct bnxt_qplib_nq {
-	struct pci_dev			*pdev;
-
-	int				vector;
-	cpumask_t			mask;
-	int				budget;
-	bool				requested;
-	struct tasklet_struct		worker;
-	struct bnxt_qplib_hwq		hwq;
-
-	u16				bar_reg;
-	u16				bar_reg_off;
-	u16				ring_id;
-	void __iomem			*bar_reg_iomem;
-
-	int				(*cqn_handler)
-						(struct bnxt_qplib_nq *nq,
-						 struct bnxt_qplib_cq *cq);
-	int				(*srqn_handler)
-						(struct bnxt_qplib_nq *nq,
-						 void *srq,
-						 u8 event);
-	struct workqueue_struct         *cqn_wq;
-	char                            name[32];
+	struct pci_dev		*pdev;
+
+	int			vector;
+	cpumask_t		mask;
+	int			budget;
+	bool			requested;
+	struct tasklet_struct	worker;
+	struct bnxt_qplib_hwq	hwq;
+
+	u16			bar_reg;
+	u16			bar_reg_off;
+	u16			ring_id;
+	void __iomem		*bar_reg_iomem;
+
+	int			(*cqn_handler)(struct bnxt_qplib_nq *nq,
+					       struct bnxt_qplib_cq *cq);
+	int			(*srqn_handler)(struct bnxt_qplib_nq *nq,
+						struct bnxt_qplib_srq *srq,
+						u8 event);
+	struct workqueue_struct	*cqn_wq;
+	char			name[32];
 };
 };
 
 struct bnxt_qplib_nq_work {
 struct bnxt_qplib_nq_work {
 			 int (*cqn_handler)(struct bnxt_qplib_nq *nq,
 			 int (*cqn_handler)(struct bnxt_qplib_nq *nq,
 					    struct bnxt_qplib_cq *cq),
 					    struct bnxt_qplib_cq *cq),
 			 int (*srqn_handler)(struct bnxt_qplib_nq *nq,
 			 int (*srqn_handler)(struct bnxt_qplib_nq *nq,
-					     void *srq,
+					     struct bnxt_qplib_srq *srq,
 					     u8 event));
 					     u8 event));
+			  struct bnxt_qplib_srq *srq);
+int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
+			  struct bnxt_qplib_srq *srq);
+int bnxt_qplib_query_srq(struct bnxt_qplib_res *res,
+			 struct bnxt_qplib_srq *srq);
+int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
+			   struct bnxt_qplib_srq *srq);
+int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
+			     struct bnxt_qplib_swqe *wqe);
 int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
 int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
 int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
 int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
+ 3 - 2
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c

@@ -93,7 +93,8 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 	opcode = req->opcode;
 	opcode = req->opcode;
 	if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
 	if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
 	    (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
 	    (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
-	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW)) {
+	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
+	     opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
 		dev_err(&rcfw->pdev->dev,
 		dev_err(&rcfw->pdev->dev,
 			"QPLIB: RCFW not initialized, reject opcode 0x%x",
 			opcode);
 				   int msix_vector,
 				   int msix_vector,
 				   int cp_bar_reg_off, int virt_fn,
 				   int cp_bar_reg_off, int virt_fn,
 				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
 				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
-						      struct creq_func_event *))
+						      void *, void *))
 {
 	resource_size_t res_base;
 	struct cmdq_init init;
 	struct cmdq_init init;

+ 3 - 4
drivers/infiniband/hw/bnxt_re/qplib_rcfw.h

@@ -167,7 +167,7 @@ struct bnxt_qplib_rcfw {
 #define FIRMWARE_TIMED_OUT		3
 	wait_queue_head_t	waitq;
 	int			(*aeq_handler)(struct bnxt_qplib_rcfw *,
-					       struct creq_func_event *);
+					       void *, void *);
 	u32			seq_num;
 
 	/* Bar region info */
@@ -199,9 +199,8 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
 				   struct bnxt_qplib_rcfw *rcfw,
 				   int msix_vector,
 				   int cp_bar_reg_off, int virt_fn,
-				   int (*aeq_handler)
-					(struct bnxt_qplib_rcfw *,
-					 struct creq_func_event *));
+				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
+						      void *aeqe, void *obj));
 
 
 struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf(
 struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf(
 				struct bnxt_qplib_rcfw *rcfw,
 				struct bnxt_qplib_rcfw *rcfw,

+ 4 - 5
drivers/infiniband/hw/bnxt_re/qplib_res.c

@@ -104,13 +104,12 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
 
 	if (!sghead) {
 		for (i = 0; i < pages; i++) {
-			pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev,
-							    pbl->pg_size,
-							    &pbl->pg_map_arr[i],
-							    GFP_KERNEL);
+			pbl->pg_arr[i] = dma_zalloc_coherent(&pdev->dev,
+							     pbl->pg_size,
+							     &pbl->pg_map_arr[i],
+							     GFP_KERNEL);
 			if (!pbl->pg_arr[i])
 				goto fail;
-			memset(pbl->pg_arr[i], 0, pbl->pg_size);
 			pbl->pg_count++;
 		}
 	} else {

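Side note on the qplib_res.c hunk above: dma_zalloc_coherent() folds the removed memset() into the allocation (it is dma_alloc_coherent() with __GFP_ZERO). A minimal sketch of the resulting pattern, assuming kernel context; the helper name is illustrative, not from the driver:

#include <linux/dma-mapping.h>

/* Allocate one DMA-coherent buffer that is already zeroed, replacing
 * the dma_alloc_coherent() + memset() pair dropped in __alloc_pbl().
 */
static void *alloc_zeroed_dma_buf(struct device *dev, size_t size,
				  dma_addr_t *map)
{
	return dma_zalloc_coherent(dev, size, map, GFP_KERNEL);
}
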
+ 136 - 5
drivers/infiniband/hw/bnxt_re/qplib_sp.c

@@ -64,8 +64,28 @@ static bool bnxt_qplib_is_atomic_cap(struct bnxt_qplib_rcfw *rcfw)
 	return !!(pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ);
 }
 
+static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw,
+				     char *fw_ver)
+{
+	struct cmdq_query_version req;
+	struct creq_query_version_resp resp;
+	u16 cmd_flags = 0;
+	int rc = 0;
+
+	RCFW_CMD_PREP(req, QUERY_VERSION, cmd_flags);
+
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+					  (void *)&resp, NULL, 0);
+	if (rc)
+		return;
+	fw_ver[0] = resp.fw_maj;
+	fw_ver[1] = resp.fw_minor;
+	fw_ver[2] = resp.fw_bld;
+	fw_ver[3] = resp.fw_rsvd;
+}
+
 int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
-			    struct bnxt_qplib_dev_attr *attr)
+			    struct bnxt_qplib_dev_attr *attr, bool vf)
 {
 	struct cmdq_query_func req;
 	struct creq_query_func_resp resp;
@@ -95,7 +115,8 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
 	/* Extract the context from the side buffer */
 	attr->max_qp = le32_to_cpu(sb->max_qp);
 	/* max_qp value reported by FW for PF doesn't include the QP1 for PF */
-	attr->max_qp += 1;
+	if (!vf)
+		attr->max_qp += 1;
 	attr->max_qp_rd_atom =
 		sb->max_qp_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ?
 		BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom;
@@ -133,7 +154,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
 	attr->l2_db_size = (sb->l2_db_space_size + 1) * PAGE_SIZE;
 	attr->max_sgid = le32_to_cpu(sb->max_gid);
 
-	strlcpy(attr->fw_ver, "20.6.28.0", sizeof(attr->fw_ver));
+	bnxt_qplib_query_version(rcfw, attr->fw_ver);
 
 	for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) {
 		temp = le32_to_cpu(sb->tqm_alloc_reqs[i]);
@@ -150,6 +171,38 @@ bail:
 	return rc;
 }
 
+int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res,
+				  struct bnxt_qplib_rcfw *rcfw,
+				  struct bnxt_qplib_ctx *ctx)
+{
+	struct cmdq_set_func_resources req;
+	struct creq_set_func_resources_resp resp;
+	u16 cmd_flags = 0;
+	int rc = 0;
+
+	RCFW_CMD_PREP(req, SET_FUNC_RESOURCES, cmd_flags);
+
+	req.number_of_qp = cpu_to_le32(ctx->qpc_count);
+	req.number_of_mrw = cpu_to_le32(ctx->mrw_count);
+	req.number_of_srq =  cpu_to_le32(ctx->srqc_count);
+	req.number_of_cq = cpu_to_le32(ctx->cq_count);
+
+	req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf);
+	req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf);
+	req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf);
+	req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf);
+	req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf);
+
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+					  (void *)&resp,
+					  NULL, 0);
+	if (rc) {
+		dev_err(&res->pdev->dev,
+			"QPLIB: Failed to set function resources");
+	}
+	return rc;
+}
+
 /* SGID */
 int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
 			struct bnxt_qplib_sgid_tbl *sgid_tbl, int index,
@@ -604,7 +657,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
 }
 
 int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
-		      u64 *pbl_tbl, int num_pbls, bool block)
+		      u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
 	struct cmdq_register_mr req;
@@ -615,6 +668,9 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 	u32 pg_size;
 
 	if (num_pbls) {
+		/* Allocate memory for the non-leaf pages to store buf ptrs.
+		 * Non-leaf pages always use the system PAGE_SIZE.
+		 */
 		pg_ptrs = roundup_pow_of_two(num_pbls);
 		pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT;
 		if (!pages)
@@ -632,6 +688,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 			bnxt_qplib_free_hwq(res->pdev, &mr->hwq);
 
 		mr->hwq.max_elements = pages;
+		/* Use system PAGE_SIZE */
 		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL, 0,
 					       &mr->hwq.max_elements,
 					       PAGE_SIZE, 0, PAGE_SIZE,
@@ -652,18 +709,22 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 
 	/* Configure the request */
 	if (mr->hwq.level == PBL_LVL_MAX) {
+		/* No PBL provided, just use system PAGE_SIZE */
 		level = 0;
 		req.pbl = 0;
 		pg_size = PAGE_SIZE;
 	} else {
 		level = mr->hwq.level + 1;
 		req.pbl = cpu_to_le64(mr->hwq.pbl[PBL_LVL_0].pg_map_arr[0]);
-		pg_size = mr->hwq.pbl[PBL_LVL_0].pg_size;
 	}
+	pg_size = buf_pg_size ? buf_pg_size : PAGE_SIZE;
 	req.log2_pg_size_lvl = (level << CMDQ_REGISTER_MR_LVL_SFT) |
 			       ((ilog2(pg_size) <<
 				 CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT) &
 				CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK);
+	req.log2_pbl_pg_size = cpu_to_le16(((ilog2(PAGE_SIZE) <<
+				 CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT) &
+				CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK));
 	req.access = (mr->flags & 0xFFFF);
 	req.va = cpu_to_le64(mr->va);
 	req.key = cpu_to_le32(mr->lkey);
@@ -729,3 +790,73 @@ int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids)
 				     0);
 	return 0;
 }
+
+int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw,
+			      struct bnxt_qplib_roce_stats *stats)
+{
+	struct cmdq_query_roce_stats req;
+	struct creq_query_roce_stats_resp resp;
+	struct bnxt_qplib_rcfw_sbuf *sbuf;
+	struct creq_query_roce_stats_resp_sb *sb;
+	u16 cmd_flags = 0;
+	int rc = 0;
+
+	RCFW_CMD_PREP(req, QUERY_ROCE_STATS, cmd_flags);
+
+	sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
+	if (!sbuf) {
+		dev_err(&rcfw->pdev->dev,
+			"QPLIB: SP: QUERY_ROCE_STATS alloc side buffer failed");
+		return -ENOMEM;
+	}
+
+	sb = sbuf->sb;
+	req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS;
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
+					  (void *)sbuf, 0);
+	if (rc)
+		goto bail;
+	/* Extract the context from the side buffer */
+	stats->to_retransmits = le64_to_cpu(sb->to_retransmits);
+	stats->seq_err_naks_rcvd = le64_to_cpu(sb->seq_err_naks_rcvd);
+	stats->max_retry_exceeded = le64_to_cpu(sb->max_retry_exceeded);
+	stats->rnr_naks_rcvd = le64_to_cpu(sb->rnr_naks_rcvd);
+	stats->missing_resp = le64_to_cpu(sb->missing_resp);
+	stats->unrecoverable_err = le64_to_cpu(sb->unrecoverable_err);
+	stats->bad_resp_err = le64_to_cpu(sb->bad_resp_err);
+	stats->local_qp_op_err = le64_to_cpu(sb->local_qp_op_err);
+	stats->local_protection_err = le64_to_cpu(sb->local_protection_err);
+	stats->mem_mgmt_op_err = le64_to_cpu(sb->mem_mgmt_op_err);
+	stats->remote_invalid_req_err = le64_to_cpu(sb->remote_invalid_req_err);
+	stats->remote_access_err = le64_to_cpu(sb->remote_access_err);
+	stats->remote_op_err = le64_to_cpu(sb->remote_op_err);
+	stats->dup_req = le64_to_cpu(sb->dup_req);
+	stats->res_exceed_max = le64_to_cpu(sb->res_exceed_max);
+	stats->res_length_mismatch = le64_to_cpu(sb->res_length_mismatch);
+	stats->res_exceeds_wqe = le64_to_cpu(sb->res_exceeds_wqe);
+	stats->res_opcode_err = le64_to_cpu(sb->res_opcode_err);
+	stats->res_rx_invalid_rkey = le64_to_cpu(sb->res_rx_invalid_rkey);
+	stats->res_rx_domain_err = le64_to_cpu(sb->res_rx_domain_err);
+	stats->res_rx_no_perm = le64_to_cpu(sb->res_rx_no_perm);
+	stats->res_rx_range_err = le64_to_cpu(sb->res_rx_range_err);
+	stats->res_tx_invalid_rkey = le64_to_cpu(sb->res_tx_invalid_rkey);
+	stats->res_tx_domain_err = le64_to_cpu(sb->res_tx_domain_err);
+	stats->res_tx_no_perm = le64_to_cpu(sb->res_tx_no_perm);
+	stats->res_tx_range_err = le64_to_cpu(sb->res_tx_range_err);
+	stats->res_irrq_oflow = le64_to_cpu(sb->res_irrq_oflow);
+	stats->res_unsup_opcode = le64_to_cpu(sb->res_unsup_opcode);
+	stats->res_unaligned_atomic = le64_to_cpu(sb->res_unaligned_atomic);
+	stats->res_rem_inv_err = le64_to_cpu(sb->res_rem_inv_err);
+	stats->res_mem_error = le64_to_cpu(sb->res_mem_error);
+	stats->res_srq_err = le64_to_cpu(sb->res_srq_err);
+	stats->res_cmp_err = le64_to_cpu(sb->res_cmp_err);
+	stats->res_invalid_dup_rkey = le64_to_cpu(sb->res_invalid_dup_rkey);
+	stats->res_wqe_format_err = le64_to_cpu(sb->res_wqe_format_err);
+	stats->res_cq_load_err = le64_to_cpu(sb->res_cq_load_err);
+	stats->res_srq_load_err = le64_to_cpu(sb->res_srq_load_err);
+	stats->res_tx_pci_err = le64_to_cpu(sb->res_tx_pci_err);
+	stats->res_rx_pci_err = le64_to_cpu(sb->res_rx_pci_err);
+bail:
+	bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf);
+	return rc;
+}

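For reference, the log2_pg_size_lvl math in bnxt_qplib_reg_mr() above packs the PBL level and log2(buffer page size) into one byte using the CMDQ_REGISTER_MR_* shifts and masks (shown later in the roce_hsi.h hunks). A standalone sketch of that packing; the level shift of 0 is an assumption matching the bare `level <<` usage above:

#include <stdint.h>

/* Illustrative only: a 2M page (log2 = 21 = 0x15) at PBL level 1
 * encodes as (1 << 0) | ((21 << 2) & 0x7c) = 0x55.
 */
static uint8_t pack_lvl_pg_size(uint8_t level, uint32_t pg_size_log2)
{
	const uint8_t lvl_sft   = 0;	/* assumed CMDQ_REGISTER_MR_LVL_SFT */
	const uint8_t pgsz_sft  = 2;	/* CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT */
	const uint8_t pgsz_mask = 0x7c;	/* CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK */

	return (uint8_t)((level << lvl_sft) |
			 ((pg_size_log2 << pgsz_sft) & pgsz_mask));
}
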
+ 88 - 3
drivers/infiniband/hw/bnxt_re/qplib_sp.h

@@ -45,7 +45,8 @@
 #define PCI_EXP_DEVCTL2_ATOMIC_REQ      0x0040
 
 struct bnxt_qplib_dev_attr {
-	char				fw_ver[32];
+#define FW_VER_ARR_LEN			4
+	u8				fw_ver[FW_VER_ARR_LEN];
 	u16				max_sgid;
 	u16				max_mrw;
 	u32				max_qp;
@@ -127,6 +128,85 @@ struct bnxt_qplib_frpl {
 #define BNXT_QPLIB_ACCESS_ZERO_BASED	BIT(5)
 #define BNXT_QPLIB_ACCESS_ON_DEMAND	BIT(6)
 
+struct bnxt_qplib_roce_stats {
+	u64 to_retransmits;
+	u64 seq_err_naks_rcvd;
+	/* seq_err_naks_rcvd is 64 b */
+	u64 max_retry_exceeded;
+	/* max_retry_exceeded is 64 b */
+	u64 rnr_naks_rcvd;
+	/* rnr_naks_rcvd is 64 b */
+	u64 missing_resp;
+	u64 unrecoverable_err;
+	/* unrecoverable_err is 64 b */
+	u64 bad_resp_err;
+	/* bad_resp_err is 64 b */
+	u64 local_qp_op_err;
+	/* local_qp_op_err is 64 b */
+	u64 local_protection_err;
+	/* local_protection_err is 64 b */
+	u64 mem_mgmt_op_err;
+	/* mem_mgmt_op_err is 64 b */
+	u64 remote_invalid_req_err;
+	/* remote_invalid_req_err is 64 b */
+	u64 remote_access_err;
+	/* remote_access_err is 64 b */
+	u64 remote_op_err;
+	/* remote_op_err is 64 b */
+	u64 dup_req;
+	/* dup_req is 64 b */
+	u64 res_exceed_max;
+	/* res_exceed_max is 64 b */
+	u64 res_length_mismatch;
+	/* res_length_mismatch is 64 b */
+	u64 res_exceeds_wqe;
+	/* res_exceeds_wqe is 64 b */
+	u64 res_opcode_err;
+	/* res_opcode_err is 64 b */
+	u64 res_rx_invalid_rkey;
+	/* res_rx_invalid_rkey is 64 b */
+	u64 res_rx_domain_err;
+	/* res_rx_domain_err is 64 b */
+	u64 res_rx_no_perm;
+	/* res_rx_no_perm is 64 b */
+	u64 res_rx_range_err;
+	/* res_rx_range_err is 64 b */
+	u64 res_tx_invalid_rkey;
+	/* res_tx_invalid_rkey is 64 b */
+	u64 res_tx_domain_err;
+	/* res_tx_domain_err is 64 b */
+	u64 res_tx_no_perm;
+	/* res_tx_no_perm is 64 b */
+	u64 res_tx_range_err;
+	/* res_tx_range_err is 64 b */
+	u64 res_irrq_oflow;
+	/* res_irrq_oflow is 64 b */
+	u64 res_unsup_opcode;
+	/* res_unsup_opcode is 64 b */
+	u64 res_unaligned_atomic;
+	/* res_unaligned_atomic is 64 b */
+	u64 res_rem_inv_err;
+	/* res_rem_inv_err is 64 b */
+	u64 res_mem_error;
+	/* res_mem_error is 64 b */
+	u64 res_srq_err;
+	/* res_srq_err is 64 b */
+	u64 res_cmp_err;
+	/* res_cmp_err is 64 b */
+	u64 res_invalid_dup_rkey;
+	/* res_invalid_dup_rkey is 64 b */
+	u64 res_wqe_format_err;
+	/* res_wqe_format_err is 64 b */
+	u64 res_cq_load_err;
+	/* res_cq_load_err is 64 b */
+	u64 res_srq_load_err;
+	/* res_srq_load_err is 64 b */
+	u64 res_tx_pci_err;
+	/* res_tx_pci_err is 64 b */
+	u64 res_rx_pci_err;
+	/* res_rx_pci_err is 64 b */
+};
+
 int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
 			struct bnxt_qplib_sgid_tbl *sgid_tbl, int index,
 			struct bnxt_qplib_gid *gid);
@@ -147,7 +227,10 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res,
 			struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey,
 			bool update);
 int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
-			    struct bnxt_qplib_dev_attr *attr);
+			    struct bnxt_qplib_dev_attr *attr, bool vf);
+int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res,
+				  struct bnxt_qplib_rcfw *rcfw,
+				  struct bnxt_qplib_ctx *ctx);
 int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
 int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
 int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
@@ -155,7 +238,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
 int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
 			 bool block);
 int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
-		      u64 *pbl_tbl, int num_pbls, bool block);
+		      u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size);
 int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr);
 int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res,
 				 struct bnxt_qplib_mrw *mr, int max);
@@ -164,4 +247,6 @@ int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res,
 int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res,
 				       struct bnxt_qplib_frpl *frpl);
 int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids);
+int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw,
+			      struct bnxt_qplib_roce_stats *stats);
 #endif /* __BNXT_QPLIB_SP_H__*/

+ 126 - 1
drivers/infiniband/hw/bnxt_re/roce_hsi.h

@@ -954,6 +954,7 @@ struct cmdq_base {
 	#define CMDQ_BASE_OPCODE_QUERY_VERSION			   0x8bUL
 	#define CMDQ_BASE_OPCODE_MODIFY_CC			   0x8cUL
 	#define CMDQ_BASE_OPCODE_QUERY_CC			   0x8dUL
+	#define CMDQ_BASE_OPCODE_QUERY_ROCE_STATS	   0x8eUL
 	u8 cmd_size;
 	__le16 flags;
 	__le16 cookie;
@@ -1383,8 +1384,20 @@ struct cmdq_register_mr {
 	#define CMDQ_REGISTER_MR_LVL_LVL_0			   0x0UL
 	#define CMDQ_REGISTER_MR_LVL_LVL_1			   0x1UL
 	#define CMDQ_REGISTER_MR_LVL_LVL_2			   0x2UL
+	#define CMDQ_REGISTER_MR_LVL_LAST             CMDQ_REGISTER_MR_LVL_LVL_2
 	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK		    0x7cUL
 	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT		    2
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_4K    (0xcUL << 2)
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_8K    (0xdUL << 2)
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_64K   (0x10UL << 2)
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_256K  (0x12UL << 2)
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1M    (0x14UL << 2)
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_2M    (0x15UL << 2)
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_4M    (0x16UL << 2)
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1G    (0x1eUL << 2)
+	#define CMDQ_REGISTER_MR_LOG2_PG_SIZE_LAST	\
+					CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1G
+	#define CMDQ_REGISTER_MR_UNUSED1             0x80UL
 	u8 access;
 	#define CMDQ_REGISTER_MR_ACCESS_LOCAL_WRITE		    0x1UL
 	#define CMDQ_REGISTER_MR_ACCESS_REMOTE_READ		    0x2UL
@@ -1392,7 +1405,21 @@ struct cmdq_register_mr {
 	#define CMDQ_REGISTER_MR_ACCESS_REMOTE_ATOMIC		    0x8UL
 	#define CMDQ_REGISTER_MR_ACCESS_MW_BIND		    0x10UL
 	#define CMDQ_REGISTER_MR_ACCESS_ZERO_BASED		    0x20UL
-	__le16 unused_1;
+	__le16	log2_pbl_pg_size;
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK   0x1fUL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT    0
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K    0xcUL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K    0xdUL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K   0x10UL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K  0x12UL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M    0x14UL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M    0x15UL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M    0x16UL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G    0x1eUL
+	#define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_LAST    \
+				CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G
+	#define CMDQ_REGISTER_MR_UNUSED11_MASK           0xffe0UL
+	#define CMDQ_REGISTER_MR_UNUSED11_SFT            5
 	__le32 key;
 	__le64 pbl;
 	__le64 va;
@@ -1799,6 +1826,16 @@ struct cmdq_set_func_resources {
 	u8 resp_size;
 	u8 reserved8;
 	__le64 resp_addr;
+	__le32 number_of_qp;
+	__le32 number_of_mrw;
+	__le32 number_of_srq;
+	__le32 number_of_cq;
+	__le32 max_qp_per_vf;
+	__le32 max_mrw_per_vf;
+	__le32 max_srq_per_vf;
+	__le32 max_cq_per_vf;
+	__le32 max_gid_per_vf;
+	__le32 stat_ctx_id;
 };
 
 /* Read hardware resource context command (24 bytes) */
@@ -2013,6 +2050,20 @@ struct creq_modify_qp_resp {
 	__le16 reserved48[3];
 };
 
+/* cmdq_query_roce_stats (size:128b/16B) */
+struct cmdq_query_roce_stats {
+	u8	opcode;
+	#define CMDQ_QUERY_ROCE_STATS_OPCODE_QUERY_ROCE_STATS 0x8eUL
+	#define CMDQ_QUERY_ROCE_STATS_OPCODE_LAST	\
+				CMDQ_QUERY_ROCE_STATS_OPCODE_QUERY_ROCE_STATS
+	u8	cmd_size;
+	__le16	flags;
+	__le16	cookie;
+	u8	resp_size;
+	u8	reserved8;
+	__le64	resp_addr;
+};
+
 /* Query QP command response (16 bytes) */
 struct creq_query_qp_resp {
 	u8 type;
@@ -2783,6 +2834,80 @@ struct creq_query_cc_resp_sb {
 	__le64 reserved64_1;
 };
 
+/* creq_query_roce_stats_resp (size:128b/16B) */
+struct creq_query_roce_stats_resp {
+	u8	type;
+	#define CREQ_QUERY_ROCE_STATS_RESP_TYPE_MASK    0x3fUL
+	#define CREQ_QUERY_ROCE_STATS_RESP_TYPE_SFT     0
+	#define CREQ_QUERY_ROCE_STATS_RESP_TYPE_QP_EVENT  0x38UL
+	#define CREQ_QUERY_ROCE_STATS_RESP_TYPE_LAST	\
+				CREQ_QUERY_ROCE_STATS_RESP_TYPE_QP_EVENT
+	u8	status;
+	__le16	cookie;
+	__le32	size;
+	u8	v;
+	#define CREQ_QUERY_ROCE_STATS_RESP_V     0x1UL
+	u8	event;
+	#define CREQ_QUERY_ROCE_STATS_RESP_EVENT_QUERY_ROCE_STATS 0x8eUL
+	#define CREQ_QUERY_ROCE_STATS_RESP_EVENT_LAST	\
+			CREQ_QUERY_ROCE_STATS_RESP_EVENT_QUERY_ROCE_STATS
+	u8	reserved48[6];
+};
+
+/* creq_query_roce_stats_resp_sb (size:2624b/328B) */
+struct creq_query_roce_stats_resp_sb {
+	u8	opcode;
+	#define CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_QUERY_ROCE_STATS 0x8eUL
+	#define CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_LAST \
+			CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_QUERY_ROCE_STATS
+	u8	status;
+	__le16	cookie;
+	__le16	flags;
+	u8	resp_size;
+	u8	rsvd;
+	__le32	num_counters;
+	__le32	rsvd1;
+	__le64	to_retransmits;
+	__le64	seq_err_naks_rcvd;
+	__le64	max_retry_exceeded;
+	__le64	rnr_naks_rcvd;
+	__le64	missing_resp;
+	__le64	unrecoverable_err;
+	__le64	bad_resp_err;
+	__le64	local_qp_op_err;
+	__le64	local_protection_err;
+	__le64	mem_mgmt_op_err;
+	__le64	remote_invalid_req_err;
+	__le64	remote_access_err;
+	__le64	remote_op_err;
+	__le64	dup_req;
+	__le64	res_exceed_max;
+	__le64	res_length_mismatch;
+	__le64	res_exceeds_wqe;
+	__le64	res_opcode_err;
+	__le64	res_rx_invalid_rkey;
+	__le64	res_rx_domain_err;
+	__le64	res_rx_no_perm;
+	__le64	res_rx_range_err;
+	__le64	res_tx_invalid_rkey;
+	__le64	res_tx_domain_err;
+	__le64	res_tx_no_perm;
+	__le64	res_tx_range_err;
+	__le64	res_irrq_oflow;
+	__le64	res_unsup_opcode;
+	__le64	res_unaligned_atomic;
+	__le64	res_rem_inv_err;
+	__le64	res_mem_error;
+	__le64	res_srq_err;
+	__le64	res_cmp_err;
+	__le64	res_invalid_dup_rkey;
+	__le64	res_wqe_format_err;
+	__le64	res_cq_load_err;
+	__le64	res_srq_load_err;
+	__le64	res_tx_pci_err;
+	__le64	res_rx_pci_err;
+};
+
 /* QP error notification event (16 bytes) */
 struct creq_qp_error_notification {
 	u8 type;

+ 16 - 11
drivers/infiniband/hw/cxgb4/cm.c

@@ -257,8 +257,8 @@ static void set_emss(struct c4iw_ep *ep, u16 opt)
 	if (ep->emss < 128)
 		ep->emss = 128;
 	if (ep->emss & 7)
-		pr_warn("Warning: misaligned mtu idx %u mss %u emss=%u\n",
-			TCPOPT_MSS_G(opt), ep->mss, ep->emss);
+		pr_debug("Warning: misaligned mtu idx %u mss %u emss=%u\n",
+			 TCPOPT_MSS_G(opt), ep->mss, ep->emss);
 	pr_debug("mss_idx %u mss %u emss=%u\n", TCPOPT_MSS_G(opt), ep->mss,
 		 ep->emss);
 }
@@ -2733,9 +2733,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
 		return 0;
 
 	if (cxgb_is_neg_adv(req->status)) {
-		pr_warn("%s Negative advice on abort- tid %u status %d (%s)\n",
-			__func__, ep->hwtid, req->status,
-			neg_adv_str(req->status));
+		pr_debug("Negative advice on abort- tid %u status %d (%s)\n",
+			 ep->hwtid, req->status, neg_adv_str(req->status));
 		ep->stats.abort_neg_adv++;
 		mutex_lock(&dev->rdev.stats.lock);
 		dev->rdev.stats.neg_adv++;
@@ -3567,8 +3566,8 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
 	case MORIBUND:
 	case ABORTING:
 	case DEAD:
-		pr_info("%s ignoring disconnect ep %p state %u\n",
-			__func__, ep, ep->com.state);
+		pr_debug("ignoring disconnect ep %p state %u\n",
+			 ep, ep->com.state);
 		break;
 	default:
 		WARN_ONCE(1, "Bad endpoint state %u\n", ep->com.state);
@@ -4097,9 +4096,15 @@ static void process_work(struct work_struct *work)
 		dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
 		opcode = rpl->ot.opcode;
 
-		ret = work_handlers[opcode](dev, skb);
-		if (!ret)
+		if (opcode >= ARRAY_SIZE(work_handlers) ||
+		    !work_handlers[opcode]) {
+			pr_err("No handler for opcode 0x%x.\n", opcode);
 			kfree_skb(skb);
+		} else {
+			ret = work_handlers[opcode](dev, skb);
+			if (!ret)
+				kfree_skb(skb);
+		}
 		process_timedout_eps();
 	}
 }
@@ -4201,8 +4206,8 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb)
 		return 0;
 	}
 	if (cxgb_is_neg_adv(req->status)) {
-		pr_warn("%s Negative advice on abort- tid %u status %d (%s)\n",
-			__func__, ep->hwtid, req->status,
+		pr_debug("Negative advice on abort- tid %u status %d (%s)\n",
+			 ep->hwtid, req->status,
 			 neg_adv_str(req->status));
 		goto out;
 	}

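The process_work() hunk above is the notable fix in this file: hardware-supplied opcodes are now validated against the handler table before the indirect call. A generic sketch of that defensive-dispatch pattern; dispatch() and handler_fn are illustrative names, not cxgb4 symbols:

#include <stddef.h>

typedef int (*handler_fn)(void *ctx, void *msg);

/* Reject out-of-range opcodes and NULL table slots instead of
 * indexing the table with an unchecked value.
 */
static int dispatch(const handler_fn *table, size_t n_entries,
		    unsigned int opcode, void *ctx, void *msg)
{
	if (opcode >= n_entries || !table[opcode])
		return -1;	/* unknown opcode: caller logs and frees msg */
	return table[opcode](ctx, msg);
}
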
+ 16 - 20
drivers/infiniband/hw/cxgb4/device.c

@@ -66,7 +66,7 @@ MODULE_PARM_DESC(c4iw_wr_log_size_order,
 
 static LIST_HEAD(uld_ctx_list);
 static DEFINE_MUTEX(dev_mutex);
-struct workqueue_struct *reg_workq;
+static struct workqueue_struct *reg_workq;
 
 #define DB_FC_RESUME_SIZE 64
 #define DB_FC_RESUME_DELAY 1
@@ -108,19 +108,19 @@ void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe)
 	idx = (atomic_inc_return(&wq->rdev->wr_log_idx) - 1) &
 		(wq->rdev->wr_log_size - 1);
 	le.poll_sge_ts = cxgb4_read_sge_timestamp(wq->rdev->lldi.ports[0]);
-	getnstimeofday(&le.poll_host_ts);
+	le.poll_host_time = ktime_get();
 	le.valid = 1;
 	le.cqe_sge_ts = CQE_TS(cqe);
 	if (SQ_TYPE(cqe)) {
 		le.qid = wq->sq.qid;
 		le.opcode = CQE_OPCODE(cqe);
-		le.post_host_ts = wq->sq.sw_sq[wq->sq.cidx].host_ts;
+		le.post_host_time = wq->sq.sw_sq[wq->sq.cidx].host_time;
 		le.post_sge_ts = wq->sq.sw_sq[wq->sq.cidx].sge_ts;
 		le.wr_id = CQE_WRID_SQ_IDX(cqe);
 	} else {
 		le.qid = wq->rq.qid;
 		le.opcode = FW_RI_RECEIVE;
-		le.post_host_ts = wq->rq.sw_rq[wq->rq.cidx].host_ts;
+		le.post_host_time = wq->rq.sw_rq[wq->rq.cidx].host_time;
 		le.post_sge_ts = wq->rq.sw_rq[wq->rq.cidx].sge_ts;
 		le.wr_id = CQE_WRID_MSN(cqe);
 	}
@@ -130,9 +130,9 @@ void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe)
 static int wr_log_show(struct seq_file *seq, void *v)
 {
 	struct c4iw_dev *dev = seq->private;
-	struct timespec prev_ts = {0, 0};
+	ktime_t prev_time;
 	struct wr_log_entry *lep;
-	int prev_ts_set = 0;
+	int prev_time_set = 0;
 	int idx, end;
 
 #define ts2ns(ts) div64_u64((ts) * dev->rdev.lldi.cclk_ps, 1000)
@@ -145,33 +145,29 @@ static int wr_log_show(struct seq_file *seq, void *v)
 	lep = &dev->rdev.wr_log[idx];
 	while (idx != end) {
 		if (lep->valid) {
-			if (!prev_ts_set) {
-				prev_ts_set = 1;
-				prev_ts = lep->poll_host_ts;
+			if (!prev_time_set) {
+				prev_time_set = 1;
+				prev_time = lep->poll_host_time;
 			}
-			seq_printf(seq, "%04u: sec %lu nsec %lu qid %u opcode "
-				   "%u %s 0x%x host_wr_delta sec %lu nsec %lu "
+			seq_printf(seq, "%04u: nsec %llu qid %u opcode "
+				   "%u %s 0x%x host_wr_delta nsec %llu "
 				   "post_sge_ts 0x%llx cqe_sge_ts 0x%llx "
 				   "poll_sge_ts 0x%llx post_poll_delta_ns %llu "
 				   "cqe_poll_delta_ns %llu\n",
 				   idx,
-				   timespec_sub(lep->poll_host_ts,
-						prev_ts).tv_sec,
-				   timespec_sub(lep->poll_host_ts,
-						prev_ts).tv_nsec,
+				   ktime_to_ns(ktime_sub(lep->poll_host_time,
+							 prev_time)),
 				   lep->qid, lep->opcode,
 				   lep->opcode == FW_RI_RECEIVE ?
 							"msn" : "wrid",
 				   lep->wr_id,
-				   timespec_sub(lep->poll_host_ts,
-						lep->post_host_ts).tv_sec,
-				   timespec_sub(lep->poll_host_ts,
-						lep->post_host_ts).tv_nsec,
+				   ktime_to_ns(ktime_sub(lep->poll_host_time,
+							 lep->post_host_time)),
 				   lep->post_sge_ts, lep->cqe_sge_ts,
 				   lep->poll_sge_ts,
 				   ts2ns(lep->poll_sge_ts - lep->post_sge_ts),
 				   ts2ns(lep->poll_sge_ts - lep->cqe_sge_ts));
-			prev_ts = lep->poll_host_ts;
+			prev_time = lep->poll_host_time;
 		}
 		idx++;
 		if (idx > (dev->rdev.wr_log_size - 1))

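The wr_log changes above (continued in iw_cxgb4.h, qp.c, and t4.h below) replace struct timespec with ktime_t, so a host-time delta becomes one signed 64-bit subtraction instead of separate sec/nsec fields, and the clock is monotonic and y2038-safe. A minimal sketch of the new pattern, assuming kernel context:

#include <linux/ktime.h>

/* Nanoseconds elapsed since 'start' on the monotonic clock, as the
 * reworked wr_log_show() computes its deltas.
 */
static s64 elapsed_ns(ktime_t start)
{
	return ktime_to_ns(ktime_sub(ktime_get(), start));
}
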
+ 1 - 1
drivers/infiniband/hw/cxgb4/ev.c

@@ -236,7 +236,7 @@ int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid)
 		if (atomic_dec_and_test(&chp->refcnt))
 			wake_up(&chp->wait);
 	} else {
-		pr_warn("%s unknown cqid 0x%x\n", __func__, qid);
+		pr_debug("unknown cqid 0x%x\n", qid);
 		spin_unlock_irqrestore(&dev->lock, flag);
 	}
 	return 0;

+ 2 - 2
drivers/infiniband/hw/cxgb4/iw_cxgb4.h

@@ -153,8 +153,8 @@ struct c4iw_hw_queue {
 };
 
 struct wr_log_entry {
-	struct timespec post_host_ts;
-	struct timespec poll_host_ts;
+	ktime_t post_host_time;
+	ktime_t poll_host_time;
 	u64 post_sge_ts;
 	u64 cqe_sge_ts;
 	u64 poll_sge_ts;

+ 3 - 3
drivers/infiniband/hw/cxgb4/qp.c

@@ -1042,7 +1042,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		if (c4iw_wr_log) {
 			swsqe->sge_ts = cxgb4_read_sge_timestamp(
 					qhp->rhp->rdev.lldi.ports[0]);
-			getnstimeofday(&swsqe->host_ts);
+			swsqe->host_time = ktime_get();
 		}
 
 		init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16);
@@ -1117,8 +1117,8 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].sge_ts =
 				cxgb4_read_sge_timestamp(
 						qhp->rhp->rdev.lldi.ports[0]);
-			getnstimeofday(
-				&qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_ts);
+			qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_time =
+				ktime_get();
 		}
 
 		wqe->recv.opcode = FW_RI_RECV_WR;

+ 2 - 2
drivers/infiniband/hw/cxgb4/t4.h

@@ -277,7 +277,7 @@ struct t4_swsqe {
 	int			signaled;
 	u16			idx;
 	int                     flushed;
-	struct timespec         host_ts;
+	ktime_t			host_time;
 	u64                     sge_ts;
 };
 
@@ -318,7 +318,7 @@ struct t4_sq {
 
 struct t4_swrqe {
 	u64 wr_id;
-	struct timespec host_ts;
+	ktime_t	host_time;
 	u64 sge_ts;
 };
 

+ 31 - 56
drivers/infiniband/hw/hfi1/chip.c

@@ -6518,11 +6518,12 @@ static void _dc_start(struct hfi1_devdata *dd)
 	if (!dd->dc_shutdown)
 		return;
 
-	/*
-	 * Take the 8051 out of reset, wait until 8051 is ready, and set host
-	 * version bit.
-	 */
-	release_and_wait_ready_8051_firmware(dd);
+	/* Take the 8051 out of reset */
+	write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+	/* Wait until 8051 is ready */
+	if (wait_fm_ready(dd, TIMEOUT_8051_START))
+		dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+			   __func__);
 
 	/* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
 	write_csr(dd, DCC_CFG_RESET, 0x10);
@@ -8564,23 +8565,27 @@ int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
 }
 
 /*
- * If the 8051 is in reset mode (dd->dc_shutdown == 1), this function
- * will still continue executing.
- *
  * Returns:
  *	< 0 = Linux error, not able to get access
  *	> 0 = 8051 command RETURN_CODE
  */
-static int _do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
-			    u64 *out_data)
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+			   u64 *out_data)
 {
 	u64 reg, completed;
 	int return_code;
 	unsigned long timeout;
 
-	lockdep_assert_held(&dd->dc8051_lock);
 	hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
 
+	mutex_lock(&dd->dc8051_lock);
+
+	/* We can't send any commands to the 8051 if it's in reset */
+	if (dd->dc_shutdown) {
+		return_code = -ENODEV;
+		goto fail;
+	}
+
 	/*
 	 * If an 8051 host command timed out previously, then the 8051 is
 	 * stuck.
@@ -8680,29 +8685,6 @@ static int _do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
 	 */
 	write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
 
-fail:
-	return return_code;
-}
-
-/*
- * Returns:
- *	< 0 = Linux error, not able to get access
- *	> 0 = 8051 command RETURN_CODE
- */
-static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
-			   u64 *out_data)
-{
-	int return_code;
-
-	mutex_lock(&dd->dc8051_lock);
-	/* We can't send any commands to the 8051 if it's in reset */
-	if (dd->dc_shutdown) {
-		return_code = -ENODEV;
-		goto fail;
-	}
-
-	return_code = _do_8051_command(dd, type, in_data, out_data);
-
 fail:
 	mutex_unlock(&dd->dc8051_lock);
 	return return_code;
@@ -8713,17 +8695,16 @@ static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
 	return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
 }
 
-static int _load_8051_config(struct hfi1_devdata *dd, u8 field_id,
-			     u8 lane_id, u32 config_data)
+int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+		     u8 lane_id, u32 config_data)
 {
 	u64 data;
 	int ret;
 
-	lockdep_assert_held(&dd->dc8051_lock);
 	data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
 		| (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
 		| (u64)config_data << LOAD_DATA_DATA_SHIFT;
-	ret = _do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
+	ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
 	if (ret != HCMD_SUCCESS) {
 		dd_dev_err(dd,
 			   "load 8051 config: field id %d, lane %d, err %d\n",
@@ -8732,18 +8713,6 @@ static int _load_8051_config(struct hfi1_devdata *dd, u8 field_id,
 	return ret;
 }
 
-int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
-		     u8 lane_id, u32 config_data)
-{
-	int return_code;
-
-	mutex_lock(&dd->dc8051_lock);
-	return_code = _load_8051_config(dd, field_id, lane_id, config_data);
-	mutex_unlock(&dd->dc8051_lock);
-
-	return return_code;
-}
-
 /*
  * Read the 8051 firmware "registers".  Use the RAM directly.  Always
  * set the result, even on error.
@@ -8859,14 +8828,13 @@ int write_host_interface_version(struct hfi1_devdata *dd, u8 version)
 	u32 frame;
 	u32 mask;
 
-	lockdep_assert_held(&dd->dc8051_lock);
 	mask = (HOST_INTERFACE_VERSION_MASK << HOST_INTERFACE_VERSION_SHIFT);
 	read_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, &frame);
 	/* Clear, then set field */
 	frame &= ~mask;
 	frame |= ((u32)version << HOST_INTERFACE_VERSION_SHIFT);
-	return _load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG,
-				 frame);
+	return load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG,
+				frame);
 }
 
 void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor,
@@ -9270,6 +9238,14 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd)
 	if (ret != HCMD_SUCCESS)
 		goto set_local_link_attributes_fail;
 
+	ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd,
+			   "Failed to set host interface version, return 0x%x\n",
+			   ret);
+		goto set_local_link_attributes_fail;
+	}
+
 	/*
 	 * DC supports continuous updates.
 	 */
@@ -14944,9 +14920,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 
 		if (num_vls < HFI1_MIN_VLS_SUPPORTED ||
 		    num_vls > HFI1_MAX_VLS_SUPPORTED) {
-			hfi1_early_err(&pdev->dev,
-				       "Invalid num_vls %u, using %u VLs\n",
-				    num_vls, HFI1_MAX_VLS_SUPPORTED);
+			dd_dev_err(dd, "Invalid num_vls %u, using %u VLs\n",
+				   num_vls, HFI1_MAX_VLS_SUPPORTED);
 			num_vls = HFI1_MAX_VLS_SUPPORTED;
 		}
 		ppd->vls_supported = num_vls;

+ 1 - 1
drivers/infiniband/hw/hfi1/chip.h

@@ -508,6 +508,7 @@
 #define DOWN_REMOTE_REASON_SHIFT 16
 #define DOWN_REMOTE_REASON_MASK  0xff
 
+#define HOST_INTERFACE_VERSION 1
 #define HOST_INTERFACE_VERSION_SHIFT 16
 #define HOST_INTERFACE_VERSION_MASK  0xff
 
@@ -713,7 +714,6 @@ void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor,
 		      u8 *ver_patch);
 int write_host_interface_version(struct hfi1_devdata *dd, u8 version);
 void read_guid(struct hfi1_devdata *dd);
-int release_and_wait_ready_8051_firmware(struct hfi1_devdata *dd);
 int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
 void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
 			  u8 neigh_reason, u8 rem_reason);

+ 0 - 16
drivers/infiniband/hw/hfi1/driver.c

@@ -159,22 +159,6 @@ static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
 	return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
 }
 
-const char *get_unit_name(int unit)
-{
-	static char iname[16];
-
-	snprintf(iname, sizeof(iname), DRIVER_NAME "_%u", unit);
-	return iname;
-}
-
-const char *get_card_name(struct rvt_dev_info *rdi)
-{
-	struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
-	struct hfi1_devdata *dd = container_of(ibdev,
-					       struct hfi1_devdata, verbs_dev);
-	return get_unit_name(dd->unit);
-}
-
 struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi)
 {
 	struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);

+ 16 - 48
drivers/infiniband/hw/hfi1/firmware.c

@@ -68,7 +68,6 @@
 #define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw"
 #define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw"
 #define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw"
-#define HOST_INTERFACE_VERSION 1
 
 MODULE_FIRMWARE(DEFAULT_FW_8051_NAME_ASIC);
 MODULE_FIRMWARE(DEFAULT_FW_FABRIC_NAME);
@@ -975,46 +974,6 @@ int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
 	}
 }
 
-/*
- * Clear all reset bits, releasing the 8051.
- * Wait for firmware to be ready to accept host requests.
- * Then, set host version bit.
- *
- * This function executes even if the 8051 is in reset mode when
- * dd->dc_shutdown == 1.
- *
- * Expects dd->dc8051_lock to be held.
- */
-int release_and_wait_ready_8051_firmware(struct hfi1_devdata *dd)
-{
-	int ret;
-
-	lockdep_assert_held(&dd->dc8051_lock);
-	/* clear all reset bits, releasing the 8051 */
-	write_csr(dd, DC_DC8051_CFG_RST, 0ull);
-
-	/*
-	 * Wait for firmware to be ready to accept host
-	 * requests.
-	 */
-	ret = wait_fm_ready(dd, TIMEOUT_8051_START);
-	if (ret) {
-		dd_dev_err(dd, "8051 start timeout, current FW state 0x%x\n",
-			   get_firmware_state(dd));
-		return ret;
-	}
-
-	ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION);
-	if (ret != HCMD_SUCCESS) {
-		dd_dev_err(dd,
-			   "Failed to set host interface version, return 0x%x\n",
-			   ret);
-		return -EIO;
-	}
-
-	return 0;
-}
-
 /*
  * Load the 8051 firmware.
  */
@@ -1080,22 +1039,31 @@ static int load_8051_firmware(struct hfi1_devdata *dd,
 	if (ret)
 		return ret;
 
+	/* clear all reset bits, releasing the 8051 */
+	write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+
 	/*
-	 * Clear all reset bits, releasing the 8051.
 	 * DC reset step 5. Wait for firmware to be ready to accept host
 	 * requests.
-	 * Then, set host version bit.
 	 */
-	mutex_lock(&dd->dc8051_lock);
-	ret = release_and_wait_ready_8051_firmware(dd);
-	mutex_unlock(&dd->dc8051_lock);
-	if (ret)
-		return ret;
+	ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+	if (ret) { /* timed out */
+		dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
+			   get_firmware_state(dd));
+		return -ETIMEDOUT;
+	}
 
 	read_misc_status(dd, &ver_major, &ver_minor, &ver_patch);
 	dd_dev_info(dd, "8051 firmware version %d.%d.%d\n",
 		    (int)ver_major, (int)ver_minor, (int)ver_patch);
 	dd->dc8051_ver = dc8051_ver(ver_major, ver_minor, ver_patch);
+	ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd,
+			   "Failed to set host interface version, return 0x%x\n",
+			   ret);
+		return -EIO;
+	}
 
 	return 0;
 }

+ 13 - 12
drivers/infiniband/hw/hfi1/hfi.h

@@ -1623,7 +1623,7 @@ static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
  * the 'error info' for this failure.
  */
 static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
-				    u16 slid)
+				    u32 slid)
 {
 	struct hfi1_devdata *dd = ppd->dd;
 
@@ -1971,8 +1971,6 @@ int get_platform_config_field(struct hfi1_devdata *dd,
 			      table_type, int table_index, int field_index,
 			      u32 *data, u32 len);
 
-const char *get_unit_name(int unit);
-const char *get_card_name(struct rvt_dev_info *rdi);
 struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi);
 
 /*
@@ -2122,39 +2120,42 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
 
 #define dd_dev_emerg(dd, fmt, ...) \
 	dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
-		  get_unit_name((dd)->unit), ##__VA_ARGS__)
+		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__)
 
 #define dd_dev_err(dd, fmt, ...) \
 	dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
-			get_unit_name((dd)->unit), ##__VA_ARGS__)
+		rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__)
 
 #define dd_dev_err_ratelimited(dd, fmt, ...) \
 	dev_err_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
-			get_unit_name((dd)->unit), ##__VA_ARGS__)
+			    rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \
+			    ##__VA_ARGS__)
 
 #define dd_dev_warn(dd, fmt, ...) \
 	dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
-			get_unit_name((dd)->unit), ##__VA_ARGS__)
+		 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__)
 
 #define dd_dev_warn_ratelimited(dd, fmt, ...) \
 	dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
-			get_unit_name((dd)->unit), ##__VA_ARGS__)
+			     rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \
+			     ##__VA_ARGS__)
 
 #define dd_dev_info(dd, fmt, ...) \
 	dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
-			get_unit_name((dd)->unit), ##__VA_ARGS__)
+		 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__)
 
 #define dd_dev_info_ratelimited(dd, fmt, ...) \
 	dev_info_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
-			get_unit_name((dd)->unit), ##__VA_ARGS__)
+			     rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \
+			     ##__VA_ARGS__)
 
 #define dd_dev_dbg(dd, fmt, ...) \
 	dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \
-		get_unit_name((dd)->unit), ##__VA_ARGS__)
+		rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__)
 
 #define hfi1_dev_porterr(dd, port, fmt, ...) \
 	dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \
-			get_unit_name((dd)->unit), (port), ##__VA_ARGS__)
+		rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (port), ##__VA_ARGS__)
 
 /*
  * this is used for formatting hw error messages...

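The hfi.h hunk above reroutes every dd_dev_* macro through rvt_get_ibdev_name() in place of the removed get_unit_name(). A userspace-style sketch of the macro shape; fprintf and name_of() are hypothetical stand-ins for dev_err() and rvt_get_ibdev_name():

#include <stdio.h>

/* Stand-in name source; any per-device lookup works here. */
static const char *name_of(const void *dd) { (void)dd; return "hfi1_0"; }

/* The "%s: " prefix is spliced ahead of the caller's format string;
 * ##__VA_ARGS__ (GNU/kernel style) drops the comma when the argument
 * list is empty.
 */
#define dd_log(dd, fmt, ...) \
	fprintf(stderr, "%s: " fmt, name_of(dd), ##__VA_ARGS__)
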
+ 2 - 0
drivers/infiniband/hw/hfi1/init.c

@@ -1272,6 +1272,8 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
 			       "Could not allocate unit ID: error %d\n", -ret);
 		goto bail;
 	}
+	rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
+
 	/*
 	 * Initialize all locks for the device. This needs to be as early as
 	 * possible so locks are usable.

+ 1 - 5
drivers/infiniband/hw/hfi1/mad.c

@@ -4348,11 +4348,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp,
 	 */
 	if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
 		return 0;
-	/*
-	 * On OPA devices it is okay to lose the upper 16 bits of LID as this
-	 * information is obtained elsewhere. Mask off the upper 16 bits.
-	 */
-	ingress_pkey_table_fail(ppd, pkey, ib_lid_cpu16(0xFFFF & in_wc->slid));
+	ingress_pkey_table_fail(ppd, pkey, in_wc->slid);
 	return 1;
 }
 

+ 8 - 2
drivers/infiniband/hw/hfi1/qp.c

@@ -556,6 +556,8 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 	struct sdma_engine *sde;
 	struct send_context *send_context;
 	struct rvt_ack_entry *e = NULL;
+	struct rvt_srq *srq = qp->ibqp.srq ?
+		ibsrq_to_rvtsrq(qp->ibqp.srq) : NULL;
 
 	sde = qp_to_sdma_engine(qp, priv->s_sc);
 	wqe = rvt_get_swqe_ptr(qp, qp->s_last);
@@ -563,7 +565,7 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 	if (qp->s_ack_queue)
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 	seq_printf(s,
-		   "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x\n",
+		   "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x RNR %d %s %d\n",
 		   iter->n,
 		   qp_idle(qp) ? "I" : "B",
 		   qp->ibqp.qp_num,
@@ -610,7 +612,11 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 		   /* ack queue information */
 		   e ? e->opcode : 0,
 		   e ? e->psn : 0,
-		   e ? e->lpsn : 0);
+		   e ? e->lpsn : 0,
+		   qp->r_min_rnr_timer,
+		   srq ? "SRQ" : "RQ",
+		   srq ? srq->rq.size : qp->r_rq.size
+		);
 }
 
 void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)

+ 4 - 4
drivers/infiniband/hw/hfi1/rc.c

@@ -841,11 +841,11 @@ static inline void hfi1_make_rc_ack_16B(struct rvt_qp *qp,
 	/* Convert dwords to flits */
 	len = (*hwords + *nwords) >> 1;
 
-	hfi1_make_16b_hdr(hdr,
-			  ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr),
+	hfi1_make_16b_hdr(hdr, ppd->lid |
+			  (rdma_ah_get_path_bits(&qp->remote_ah_attr) &
+			  ((1 << ppd->lmc) - 1)),
 			  opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr),
-				      16B),
-			  len, pkey, becn, 0, l4, sc5);
+				      16B), len, pkey, becn, 0, l4, sc5);
 
 	bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
 	bth0 |= extra_bytes << 20;

+ 1 - 5
drivers/infiniband/hw/hfi1/verbs.c

@@ -1486,7 +1486,7 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num,
 	props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
 				      4096 : hfi1_max_mtu), IB_MTU_4096);
 	props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
-		mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
+		mtu_to_enum(ppd->ibmtu, IB_MTU_4096);
 
 	/*
 	 * sm_lid of 0xFFFF needs special handling so that it can
@@ -1844,7 +1844,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	struct hfi1_ibport *ibp = &ppd->ibport_data;
 	unsigned i;
 	int ret;
-	size_t lcpysz = IB_DEVICE_NAME_MAX;
 
 	for (i = 0; i < dd->num_pports; i++)
 		init_ibport(ppd + i);
@@ -1872,8 +1871,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	 */
 	if (!ib_hfi1_sys_image_guid)
 		ib_hfi1_sys_image_guid = ibdev->node_guid;
-	lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
-	strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
 	ibdev->owner = THIS_MODULE;
 	ibdev->phys_port_cnt = dd->num_pports;
 	ibdev->dev.parent = &dd->pcidev->dev;
@@ -1893,7 +1890,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	 * Fill in rvt info object.
 	 */
 	dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
-	dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name;
 	dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
 	dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
 	dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;

+ 1 - 1
drivers/infiniband/hw/hns/Makefile

@@ -5,7 +5,7 @@
 ccflags-y :=  -Idrivers/net/ethernet/hisilicon/hns3
 
 obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o
-hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_eq.o hns_roce_pd.o \
+hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
 	hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
 	hns_roce_cq.o hns_roce_alloc.o
 obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o

+ 1 - 0
drivers/infiniband/hw/hns/hns_roce_cmd.c

@@ -103,6 +103,7 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status,
 	context->out_param = out_param;
 	complete(&context->done);
 }
+EXPORT_SYMBOL_GPL(hns_roce_cmd_event);
 
 /* this should be called with "use_events" */
 static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param,

+ 10 - 0
drivers/infiniband/hw/hns/hns_roce_cmd.h

@@ -88,6 +88,16 @@ enum {
 	HNS_ROCE_CMD_DESTROY_SRQC_BT0	= 0x38,
 	HNS_ROCE_CMD_DESTROY_SRQC_BT1	= 0x39,
 	HNS_ROCE_CMD_DESTROY_SRQC_BT2	= 0x3a,
+
+	/* EQC commands */
+	HNS_ROCE_CMD_CREATE_AEQC	= 0x80,
+	HNS_ROCE_CMD_MODIFY_AEQC	= 0x81,
+	HNS_ROCE_CMD_QUERY_AEQC		= 0x82,
+	HNS_ROCE_CMD_DESTROY_AEQC	= 0x83,
+	HNS_ROCE_CMD_CREATE_CEQC	= 0x90,
+	HNS_ROCE_CMD_MODIFY_CEQC	= 0x91,
+	HNS_ROCE_CMD_QUERY_CEQC		= 0x92,
+	HNS_ROCE_CMD_DESTROY_CEQC	= 0x93,
 };
 
 enum {

+ 11 - 0
drivers/infiniband/hw/hns/hns_roce_common.h

@@ -376,6 +376,12 @@
 #define ROCEE_RX_CMQ_TAIL_REG			0x07024
 #define ROCEE_RX_CMQ_HEAD_REG			0x07028
 
+#define ROCEE_VF_MB_CFG0_REG			0x40
+#define ROCEE_VF_MB_STATUS_REG			0x58
+
+#define ROCEE_VF_EQ_DB_CFG0_REG			0x238
+#define ROCEE_VF_EQ_DB_CFG1_REG			0x23C
+
 #define ROCEE_VF_SMAC_CFG0_REG			0x12000
 #define ROCEE_VF_SMAC_CFG1_REG			0x12004
 
@@ -385,4 +391,9 @@
 #define ROCEE_VF_SGID_CFG3_REG			0x1000c
 #define ROCEE_VF_SGID_CFG4_REG			0x10010
 
+#define ROCEE_VF_ABN_INT_CFG_REG		0x13000
+#define ROCEE_VF_ABN_INT_ST_REG			0x13004
+#define ROCEE_VF_ABN_INT_EN_REG			0x13008
+#define ROCEE_VF_EVENT_INT_EN_REG		0x1300c
+
 #endif /* _HNS_ROCE_COMMON_H */

+ 10 - 9
drivers/infiniband/hw/hns/hns_roce_cq.c

@@ -196,15 +196,14 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 	if (ret)
 		dev_err(dev, "HW2SW_CQ failed (%d) for CQN %06lx\n", ret,
 			hr_cq->cqn);
-	if (hr_dev->eq_table.eq) {
-		/* Waiting interrupt process procedure carried out */
-		synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq);
-
-		/* wait for all interrupt processed */
-		if (atomic_dec_and_test(&hr_cq->refcount))
-			complete(&hr_cq->free);
-		wait_for_completion(&hr_cq->free);
-	}
+
+	/* Waiting interrupt process procedure carried out */
+	synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq);
+
+	/* wait for all interrupt processed */
+	if (atomic_dec_and_test(&hr_cq->refcount))
+		complete(&hr_cq->free);
+	wait_for_completion(&hr_cq->free);
 
 	spin_lock_irq(&cq_table->lock);
 	radix_tree_delete(&cq_table->tree, hr_cq->cqn);
@@ -460,6 +459,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn)
 	++cq->arm_sn;
 	cq->comp(cq);
 }
+EXPORT_SYMBOL_GPL(hns_roce_cq_completion);
 
 void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type)
 {
@@ -482,6 +482,7 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type)
 	if (atomic_dec_and_test(&cq->refcount))
 		complete(&cq->free);
 }
+EXPORT_SYMBOL_GPL(hns_roce_cq_event);
 
 int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev)
 {

+ 96 - 7
drivers/infiniband/hw/hns/hns_roce_device.h

@@ -62,12 +62,16 @@
 #define HNS_ROCE_CQE_WCMD_EMPTY_BIT		0x2
 #define HNS_ROCE_MIN_CQE_CNT			16
 
-#define HNS_ROCE_MAX_IRQ_NUM			34
+#define HNS_ROCE_MAX_IRQ_NUM			128
 
-#define HNS_ROCE_COMP_VEC_NUM			32
+#define EQ_ENABLE				1
+#define EQ_DISABLE				0
 
-#define HNS_ROCE_AEQE_VEC_NUM			1
-#define HNS_ROCE_AEQE_OF_VEC_NUM		1
+#define HNS_ROCE_CEQ				0
+#define HNS_ROCE_AEQ				1
+
+#define HNS_ROCE_CEQ_ENTRY_SIZE			0x4
+#define HNS_ROCE_AEQ_ENTRY_SIZE			0x10
 
 /* 4G/4K = 1M */
 #define HNS_ROCE_SL_SHIFT			28
@@ -130,6 +134,7 @@ enum hns_roce_event {
 	HNS_ROCE_EVENT_TYPE_DB_OVERFLOW               = 0x12,
 	HNS_ROCE_EVENT_TYPE_MB                        = 0x13,
 	HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW              = 0x14,
+	HNS_ROCE_EVENT_TYPE_FLR			      = 0x15,
 };
 
 /* Local Work Queue Catastrophic Error,SUBTYPE 0x5 */
@@ -173,6 +178,7 @@ enum {
 enum {
 	HNS_ROCE_CAP_FLAG_REREG_MR		= BIT(0),
 	HNS_ROCE_CAP_FLAG_ROCE_V1_V2		= BIT(1),
+	HNS_ROCE_CAP_FLAG_RQ_INLINE		= BIT(2)
 };
 
 enum hns_roce_mtt_type {
@@ -441,6 +447,21 @@ struct hns_roce_cmd_mailbox {
 
 struct hns_roce_dev;
 
+struct hns_roce_rinl_sge {
+	void			*addr;
+	u32			len;
+};
+
+struct hns_roce_rinl_wqe {
+	struct hns_roce_rinl_sge *sg_list;
+	u32			 sge_cnt;
+};
+
+struct hns_roce_rinl_buf {
+	struct hns_roce_rinl_wqe *wqe_list;
+	u32			 wqe_cnt;
+};
+
 struct hns_roce_qp {
 	struct ib_qp		ibqp;
 	struct hns_roce_buf	hr_buf;
@@ -462,7 +483,9 @@ struct hns_roce_qp {
 	u8			resp_depth;
 	u8			state;
 	u32			access_flags;
+	u32                     atomic_rd_en;
 	u32			pkey_index;
+	u32			qkey;
 	void			(*event)(struct hns_roce_qp *,
 					 enum hns_roce_event);
 	unsigned long		qpn;
@@ -472,6 +495,8 @@ struct hns_roce_qp {
 
 	struct hns_roce_sge	sge;
 	u32			next_sge;
+
+	struct hns_roce_rinl_buf rq_inl_buf;
 };
 
 struct hns_roce_sqp {
@@ -485,6 +510,45 @@ struct hns_roce_ib_iboe {
 	u8			phy_port[HNS_ROCE_MAX_PORTS];
 };
 
+enum {
+	HNS_ROCE_EQ_STAT_INVALID  = 0,
+	HNS_ROCE_EQ_STAT_VALID    = 2,
+};
+
+struct hns_roce_ceqe {
+	u32			comp;
+};
+
+struct hns_roce_aeqe {
+	u32 asyn;
+	union {
+		struct {
+			u32 qp;
+			u32 rsv0;
+			u32 rsv1;
+		} qp_event;
+
+		struct {
+			u32 cq;
+			u32 rsv0;
+			u32 rsv1;
+		} cq_event;
+
+		struct {
+			u32 ceqe;
+			u32 rsv0;
+			u32 rsv1;
+		} ce_event;
+
+		struct {
+			__le64  out_param;
+			__le16  token;
+			u8	status;
+			u8	rsv0;
+		} __packed cmd;
+	 } event;
+};
+
 struct hns_roce_eq {
 	struct hns_roce_dev		*hr_dev;
 	void __iomem			*doorbell;
@@ -498,11 +562,31 @@ struct hns_roce_eq {
 	int				log_page_size;
 	int				cons_index;
 	struct hns_roce_buf_list	*buf_list;
+	int				over_ignore;
+	int				coalesce;
+	int				arm_st;
+	u64				eqe_ba;
+	int				eqe_ba_pg_sz;
+	int				eqe_buf_pg_sz;
+	int				hop_num;
+	u64				*bt_l0;	/* Base address table for L0 */
+	u64				**bt_l1; /* Base address table for L1 */
+	u64				**buf;
+	dma_addr_t			l0_dma;
+	dma_addr_t			*l1_dma;
+	dma_addr_t			*buf_dma;
+	u32				l0_last_num; /* L0 last chunk num */
+	u32				l1_last_num; /* L1 last chunk num */
+	int				eq_max_cnt;
+	int				eq_period;
+	int				shift;
+	dma_addr_t			cur_eqe_ba;
+	dma_addr_t			nxt_eqe_ba;
 };
 
 struct hns_roce_eq_table {
 	struct hns_roce_eq	*eq;
-	void __iomem		**eqc_base;
+	void __iomem		**eqc_base; /* only for hw v1 */
 };
 
 struct hns_roce_caps {
@@ -528,7 +612,7 @@ struct hns_roce_caps {
 	u32		min_wqes;
 	int		reserved_cqs;
 	int		num_aeq_vectors;	/* 1 */
-	int		num_comp_vectors;	/* 32 ceq */
+	int		num_comp_vectors;
 	int		num_other_vectors;
 	int		num_mtpts;
 	u32		num_mtt_segs;
@@ -550,7 +634,7 @@ struct hns_roce_caps {
 	u32		pbl_buf_pg_sz;
 	u32		pbl_hop_num;
 	int		aeqe_depth;
-	int		ceqe_depth[HNS_ROCE_COMP_VEC_NUM];
+	int		ceqe_depth;
 	enum ib_mtu	max_mtu;
 	u32		qpc_bt_num;
 	u32		srqc_bt_num;
@@ -574,6 +658,9 @@ struct hns_roce_caps {
 	u32		cqe_ba_pg_sz;
 	u32		cqe_buf_pg_sz;
 	u32		cqe_hop_num;
+	u32		eqe_ba_pg_sz;
+	u32		eqe_buf_pg_sz;
+	u32		eqe_hop_num;
 	u32		chunk_sz;	/* chunk size in non multihop mode*/
 	u64		flags;
 };
@@ -623,6 +710,8 @@ struct hns_roce_hw {
 	int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr);
 	int (*destroy_cq)(struct ib_cq *ibcq);
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+	int (*init_eq)(struct hns_roce_dev *hr_dev);
+	void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
 };
 
 struct hns_roce_dev {

+ 0 - 759
drivers/infiniband/hw/hns/hns_roce_eq.c

@@ -1,759 +0,0 @@
-/*
- * Copyright (c) 2016 Hisilicon Limited.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include "hns_roce_common.h"
-#include "hns_roce_device.h"
-#include "hns_roce_eq.h"
-
-static void eq_set_cons_index(struct hns_roce_eq *eq, int req_not)
-{
-	roce_raw_write((eq->cons_index & CONS_INDEX_MASK) |
-		      (req_not << eq->log_entries), eq->doorbell);
-	/* Memory barrier */
-	mb();
-}
-
-static struct hns_roce_aeqe *get_aeqe(struct hns_roce_eq *eq, u32 entry)
-{
-	unsigned long off = (entry & (eq->entries - 1)) *
-			     HNS_ROCE_AEQ_ENTRY_SIZE;
-
-	return (struct hns_roce_aeqe *)((u8 *)
-		(eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) +
-		off % HNS_ROCE_BA_SIZE);
-}
-
-static struct hns_roce_aeqe *next_aeqe_sw(struct hns_roce_eq *eq)
-{
-	struct hns_roce_aeqe *aeqe = get_aeqe(eq, eq->cons_index);
-
-	return (roce_get_bit(aeqe->asyn, HNS_ROCE_AEQE_U32_4_OWNER_S) ^
-		!!(eq->cons_index & eq->entries)) ? aeqe : NULL;
-}
-
-static void hns_roce_wq_catas_err_handle(struct hns_roce_dev *hr_dev,
-					 struct hns_roce_aeqe *aeqe, int qpn)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-
-	dev_warn(dev, "Local Work Queue Catastrophic Error.\n");
-	switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M,
-			       HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) {
-	case HNS_ROCE_LWQCE_QPC_ERROR:
-		dev_warn(dev, "QP %d, QPC error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_MTU_ERROR:
-		dev_warn(dev, "QP %d, MTU error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR:
-		dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_WQE_ADDR_ERROR:
-		dev_warn(dev, "QP %d, WQE addr error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR:
-		dev_warn(dev, "QP %d, WQE shift error\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_SL_ERROR:
-		dev_warn(dev, "QP %d, SL error.\n", qpn);
-		break;
-	case HNS_ROCE_LWQCE_PORT_ERROR:
-		dev_warn(dev, "QP %d, port error.\n", qpn);
-		break;
-	default:
-		break;
-	}
-}
-
-static void hns_roce_local_wq_access_err_handle(struct hns_roce_dev *hr_dev,
-						struct hns_roce_aeqe *aeqe,
-						int qpn)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-
-	dev_warn(dev, "Local Access Violation Work Queue Error.\n");
-	switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M,
-			       HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) {
-	case HNS_ROCE_LAVWQE_R_KEY_VIOLATION:
-		dev_warn(dev, "QP %d, R_key violation.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_LENGTH_ERROR:
-		dev_warn(dev, "QP %d, length error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_VA_ERROR:
-		dev_warn(dev, "QP %d, VA error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_PD_ERROR:
-		dev_err(dev, "QP %d, PD error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_RW_ACC_ERROR:
-		dev_warn(dev, "QP %d, rw acc error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_KEY_STATE_ERROR:
-		dev_warn(dev, "QP %d, key state error.\n", qpn);
-		break;
-	case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR:
-		dev_warn(dev, "QP %d, MR operation error.\n", qpn);
-		break;
-	default:
-		break;
-	}
-}
-
-static void hns_roce_qp_err_handle(struct hns_roce_dev *hr_dev,
-				   struct hns_roce_aeqe *aeqe,
-				   int event_type)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-	int phy_port;
-	int qpn;
-
-	qpn = roce_get_field(aeqe->event.qp_event.qp,
-			     HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M,
-			     HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S);
-	phy_port = roce_get_field(aeqe->event.qp_event.qp,
-			HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M,
-			HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S);
-	if (qpn <= 1)
-		qpn = HNS_ROCE_MAX_PORTS * qpn + phy_port;
-
-	switch (event_type) {
-	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
-		dev_warn(dev, "Invalid Req Local Work Queue Error.\n"
-			      "QP %d, phy_port %d.\n", qpn, phy_port);
-		break;
-	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
-		hns_roce_wq_catas_err_handle(hr_dev, aeqe, qpn);
-		break;
-	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
-		hns_roce_local_wq_access_err_handle(hr_dev, aeqe, qpn);
-		break;
-	default:
-		break;
-	}
-
-	hns_roce_qp_event(hr_dev, qpn, event_type);
-}
-
-static void hns_roce_cq_err_handle(struct hns_roce_dev *hr_dev,
-				   struct hns_roce_aeqe *aeqe,
-				   int event_type)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-	u32 cqn;
-
-	cqn = le32_to_cpu(roce_get_field(aeqe->event.cq_event.cq,
-		    HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M,
-		    HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S));
-
-	switch (event_type) {
-	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
-		dev_warn(dev, "CQ 0x%x access err.\n", cqn);
-		break;
-	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
-		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
-		break;
-	case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID:
-		dev_warn(dev, "CQ 0x%x ID invalid.\n", cqn);
-		break;
-	default:
-		break;
-	}
-
-	hns_roce_cq_event(hr_dev, cqn, event_type);
-}
-
-static void hns_roce_db_overflow_handle(struct hns_roce_dev *hr_dev,
-					struct hns_roce_aeqe *aeqe)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-
-	switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M,
-			       HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) {
-	case HNS_ROCE_DB_SUBTYPE_SDB_OVF:
-		dev_warn(dev, "SDB overflow.\n");
-		break;
-	case HNS_ROCE_DB_SUBTYPE_SDB_ALM_OVF:
-		dev_warn(dev, "SDB almost overflow.\n");
-		break;
-	case HNS_ROCE_DB_SUBTYPE_SDB_ALM_EMP:
-		dev_warn(dev, "SDB almost empty.\n");
-		break;
-	case HNS_ROCE_DB_SUBTYPE_ODB_OVF:
-		dev_warn(dev, "ODB overflow.\n");
-		break;
-	case HNS_ROCE_DB_SUBTYPE_ODB_ALM_OVF:
-		dev_warn(dev, "ODB almost overflow.\n");
-		break;
-	case HNS_ROCE_DB_SUBTYPE_ODB_ALM_EMP:
-		dev_warn(dev, "SDB almost empty.\n");
-		break;
-	default:
-		break;
-	}
-}
-
-static int hns_roce_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
-{
-	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_aeqe *aeqe;
-	int aeqes_found = 0;
-	int event_type;
-
-	while ((aeqe = next_aeqe_sw(eq))) {
-		dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe,
-			roce_get_field(aeqe->asyn,
-				       HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M,
-				       HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S));
-		/* Memory barrier */
-		rmb();
-
-		event_type = roce_get_field(aeqe->asyn,
-				HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M,
-				HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S);
-		switch (event_type) {
-		case HNS_ROCE_EVENT_TYPE_PATH_MIG:
-			dev_warn(dev, "PATH MIG not supported\n");
-			break;
-		case HNS_ROCE_EVENT_TYPE_COMM_EST:
-			dev_warn(dev, "COMMUNICATION established\n");
-			break;
-		case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
-			dev_warn(dev, "SQ DRAINED not supported\n");
-			break;
-		case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
-			dev_warn(dev, "PATH MIG failed\n");
-			break;
-		case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
-		case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
-		case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
-			hns_roce_qp_err_handle(hr_dev, aeqe, event_type);
-			break;
-		case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
-		case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
-		case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
-			dev_warn(dev, "SRQ not support!\n");
-			break;
-		case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
-		case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
-		case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID:
-			hns_roce_cq_err_handle(hr_dev, aeqe, event_type);
-			break;
-		case HNS_ROCE_EVENT_TYPE_PORT_CHANGE:
-			dev_warn(dev, "port change.\n");
-			break;
-		case HNS_ROCE_EVENT_TYPE_MB:
-			hns_roce_cmd_event(hr_dev,
-					   le16_to_cpu(aeqe->event.cmd.token),
-					   aeqe->event.cmd.status,
-					   le64_to_cpu(aeqe->event.cmd.out_param
-					   ));
-			break;
-		case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
-			hns_roce_db_overflow_handle(hr_dev, aeqe);
-			break;
-		case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW:
-			dev_warn(dev, "CEQ 0x%lx overflow.\n",
-			roce_get_field(aeqe->event.ce_event.ceqe,
-				     HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M,
-				     HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S));
-			break;
-		default:
-			dev_warn(dev, "Unhandled event %d on EQ %d at index %u\n",
-				 event_type, eq->eqn, eq->cons_index);
-			break;
-		}
-
-		eq->cons_index++;
-		aeqes_found = 1;
-
-		if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) {
-			dev_warn(dev, "cons_index overflow, set back to zero\n"
-				);
-			eq->cons_index = 0;
-		}
-	}
-
-	eq_set_cons_index(eq, 0);
-
-	return aeqes_found;
-}
-
-static struct hns_roce_ceqe *get_ceqe(struct hns_roce_eq *eq, u32 entry)
-{
-	unsigned long off = (entry & (eq->entries - 1)) *
-			     HNS_ROCE_CEQ_ENTRY_SIZE;
-
-	return (struct hns_roce_ceqe *)((u8 *)
-			(eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) +
-			off % HNS_ROCE_BA_SIZE);
-}
-
-static struct hns_roce_ceqe *next_ceqe_sw(struct hns_roce_eq *eq)
-{
-	struct hns_roce_ceqe *ceqe = get_ceqe(eq, eq->cons_index);
-
-	return (!!(roce_get_bit(ceqe->ceqe.comp,
-		 HNS_ROCE_CEQE_CEQE_COMP_OWNER_S))) ^
-		 (!!(eq->cons_index & eq->entries)) ? ceqe : NULL;
-}
-
-static int hns_roce_ceq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
-{
-	struct hns_roce_ceqe *ceqe;
-	int ceqes_found = 0;
-	u32 cqn;
-
-	while ((ceqe = next_ceqe_sw(eq))) {
-		/* Memory barrier */
-		rmb();
-		cqn = roce_get_field(ceqe->ceqe.comp,
-				     HNS_ROCE_CEQE_CEQE_COMP_CQN_M,
-				     HNS_ROCE_CEQE_CEQE_COMP_CQN_S);
-		hns_roce_cq_completion(hr_dev, cqn);
-
-		++eq->cons_index;
-		ceqes_found = 1;
-
-		if (eq->cons_index > 2 * hr_dev->caps.ceqe_depth[eq->eqn] - 1) {
-			dev_warn(&eq->hr_dev->pdev->dev,
-				"cons_index overflow, set back to zero\n");
-			eq->cons_index = 0;
-		}
-	}
-
-	eq_set_cons_index(eq, 0);
-
-	return ceqes_found;
-}
-
-static int hns_roce_aeq_ovf_int(struct hns_roce_dev *hr_dev,
-				struct hns_roce_eq *eq)
-{
-	struct device *dev = &eq->hr_dev->pdev->dev;
-	int eqovf_found = 0;
-	u32 caepaemask_val;
-	u32 cealmovf_val;
-	u32 caepaest_val;
-	u32 aeshift_val;
-	u32 ceshift_val;
-	u32 cemask_val;
-	int i = 0;
-
-	/**
-	 * AEQ overflow ECC mult bit err CEQ overflow alarm
-	 * must clear interrupt, mask irq, clear irq, cancel mask operation
-	 */
-	aeshift_val = roce_read(hr_dev, ROCEE_CAEP_AEQC_AEQE_SHIFT_REG);
-
-	if (roce_get_bit(aeshift_val,
-		ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S) == 1) {
-		dev_warn(dev, "AEQ overflow!\n");
-
-		/* Set mask */
-		caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG);
-		roce_set_bit(caepaemask_val,
-			     ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S,
-			     HNS_ROCE_INT_MASK_ENABLE);
-		roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val);
-
-		/* Clear int state(INT_WC : write 1 clear) */
-		caepaest_val = roce_read(hr_dev, ROCEE_CAEP_AE_ST_REG);
-		roce_set_bit(caepaest_val,
-			     ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1);
-		roce_write(hr_dev, ROCEE_CAEP_AE_ST_REG, caepaest_val);
-
-		/* Clear mask */
-		caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG);
-		roce_set_bit(caepaemask_val,
-			     ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S,
-			     HNS_ROCE_INT_MASK_DISABLE);
-		roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val);
-	}
-
-	/* CEQ almost overflow */
-	for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) {
-		ceshift_val = roce_read(hr_dev, ROCEE_CAEP_CEQC_SHIFT_0_REG +
-					i * CEQ_REG_OFFSET);
-
-		if (roce_get_bit(ceshift_val,
-		ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S) == 1) {
-			dev_warn(dev, "CEQ[%d] almost overflow!\n", i);
-			eqovf_found++;
-
-			/* Set mask */
-			cemask_val = roce_read(hr_dev,
-					       ROCEE_CAEP_CE_IRQ_MASK_0_REG +
-					       i * CEQ_REG_OFFSET);
-			roce_set_bit(cemask_val,
-				ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S,
-				HNS_ROCE_INT_MASK_ENABLE);
-			roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG +
-				   i * CEQ_REG_OFFSET, cemask_val);
-
-			/* Clear int state(INT_WC : write 1 clear) */
-			cealmovf_val = roce_read(hr_dev,
-				       ROCEE_CAEP_CEQ_ALM_OVF_0_REG +
-				       i * CEQ_REG_OFFSET);
-			roce_set_bit(cealmovf_val,
-				     ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S,
-				     1);
-			roce_write(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG +
-				    i * CEQ_REG_OFFSET, cealmovf_val);
-
-			/* Clear mask */
-			cemask_val = roce_read(hr_dev,
-				     ROCEE_CAEP_CE_IRQ_MASK_0_REG +
-				     i * CEQ_REG_OFFSET);
-			roce_set_bit(cemask_val,
-			       ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S,
-			       HNS_ROCE_INT_MASK_DISABLE);
-			roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG +
-				   i * CEQ_REG_OFFSET, cemask_val);
-		}
-	}
-
-	/* ECC multi-bit error alarm */
-	dev_warn(dev, "ECC UCERR ALARM: 0x%x, 0x%x, 0x%x\n",
-		 roce_read(hr_dev, ROCEE_ECC_UCERR_ALM0_REG),
-		 roce_read(hr_dev, ROCEE_ECC_UCERR_ALM1_REG),
-		 roce_read(hr_dev, ROCEE_ECC_UCERR_ALM2_REG));
-
-	dev_warn(dev, "ECC CERR ALARM: 0x%x, 0x%x, 0x%x\n",
-		 roce_read(hr_dev, ROCEE_ECC_CERR_ALM0_REG),
-		 roce_read(hr_dev, ROCEE_ECC_CERR_ALM1_REG),
-		 roce_read(hr_dev, ROCEE_ECC_CERR_ALM2_REG));
-
-	return eqovf_found;
-}
-
-static int hns_roce_eq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
-{
-	int eqes_found = 0;
-
-	if (likely(eq->type_flag == HNS_ROCE_CEQ))
-		/* CEQ irq routine, CEQ is pulse irq, not clear */
-		eqes_found = hns_roce_ceq_int(hr_dev, eq);
-	else if (likely(eq->type_flag == HNS_ROCE_AEQ))
-		/* AEQ irq routine, AEQ is pulse irq, not clear */
-		eqes_found = hns_roce_aeq_int(hr_dev, eq);
-	else
-		/* AEQ queue overflow irq */
-		eqes_found = hns_roce_aeq_ovf_int(hr_dev, eq);
-
-	return eqes_found;
-}
-
-static irqreturn_t hns_roce_msi_x_interrupt(int irq, void *eq_ptr)
-{
-	int int_work = 0;
-	struct hns_roce_eq  *eq  = eq_ptr;
-	struct hns_roce_dev *hr_dev = eq->hr_dev;
-
-	int_work = hns_roce_eq_int(hr_dev, eq);
-
-	return IRQ_RETVAL(int_work);
-}
-
-static void hns_roce_enable_eq(struct hns_roce_dev *hr_dev, int eq_num,
-			       int enable_flag)
-{
-	void __iomem *eqc = hr_dev->eq_table.eqc_base[eq_num];
-	u32 val;
-
-	val = readl(eqc);
-
-	if (enable_flag)
-		roce_set_field(val,
-			       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M,
-			       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S,
-			       HNS_ROCE_EQ_STAT_VALID);
-	else
-		roce_set_field(val,
-			       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M,
-			       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S,
-			       HNS_ROCE_EQ_STAT_INVALID);
-	writel(val, eqc);
-}
-
-static int hns_roce_create_eq(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_eq *eq)
-{
-	void __iomem *eqc = hr_dev->eq_table.eqc_base[eq->eqn];
-	struct device *dev = &hr_dev->pdev->dev;
-	dma_addr_t tmp_dma_addr;
-	u32 eqconsindx_val = 0;
-	u32 eqcuridx_val = 0;
-	u32 eqshift_val = 0;
-	int num_bas = 0;
-	int ret;
-	int i;
-
-	num_bas = (PAGE_ALIGN(eq->entries * eq->eqe_size) +
-		   HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE;
-
-	if ((eq->entries * eq->eqe_size) > HNS_ROCE_BA_SIZE) {
-		dev_err(dev, "[error]eq buf %d gt ba size(%d) need bas=%d\n",
-			(eq->entries * eq->eqe_size), HNS_ROCE_BA_SIZE,
-			num_bas);
-		return -EINVAL;
-	}
-
-	eq->buf_list = kcalloc(num_bas, sizeof(*eq->buf_list), GFP_KERNEL);
-	if (!eq->buf_list)
-		return -ENOMEM;
-
-	for (i = 0; i < num_bas; ++i) {
-		eq->buf_list[i].buf = dma_alloc_coherent(dev, HNS_ROCE_BA_SIZE,
-							 &tmp_dma_addr,
-							 GFP_KERNEL);
-		if (!eq->buf_list[i].buf) {
-			ret = -ENOMEM;
-			goto err_out_free_pages;
-		}
-
-		eq->buf_list[i].map = tmp_dma_addr;
-		memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE);
-	}
-	eq->cons_index = 0;
-	roce_set_field(eqshift_val,
-		       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M,
-		       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S,
-		       HNS_ROCE_EQ_STAT_INVALID);
-	roce_set_field(eqshift_val,
-		       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M,
-		       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S,
-		       eq->log_entries);
-	writel(eqshift_val, eqc);
-
-	/* Configure eq extended address 12~44bit */
-	writel((u32)(eq->buf_list[0].map >> 12), eqc + 4);
-
-	/*
-	 * Configure eq extended address 45~49 bit.
-	 * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of
-	 * using 4K page, and shift more 32 because of
-	 * caculating the high 32 bit value evaluated to hardware.
-	 */
-	roce_set_field(eqcuridx_val, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M,
-		       ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S,
-		       eq->buf_list[0].map >> 44);
-	roce_set_field(eqcuridx_val,
-		       ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M,
-		       ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S, 0);
-	writel(eqcuridx_val, eqc + 8);
-
-	/* Configure eq consumer index */
-	roce_set_field(eqconsindx_val,
-		       ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M,
-		       ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S, 0);
-	writel(eqconsindx_val, eqc + 0xc);
-
-	return 0;
-
-err_out_free_pages:
-	for (i = i - 1; i >= 0; i--)
-		dma_free_coherent(dev, HNS_ROCE_BA_SIZE, eq->buf_list[i].buf,
-				  eq->buf_list[i].map);
-
-	kfree(eq->buf_list);
-	return ret;
-}
-
-static void hns_roce_free_eq(struct hns_roce_dev *hr_dev,
-			     struct hns_roce_eq *eq)
-{
-	int i = 0;
-	int npages = (PAGE_ALIGN(eq->eqe_size * eq->entries) +
-		      HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE;
-
-	if (!eq->buf_list)
-		return;
-
-	for (i = 0; i < npages; ++i)
-		dma_free_coherent(&hr_dev->pdev->dev, HNS_ROCE_BA_SIZE,
-				  eq->buf_list[i].buf, eq->buf_list[i].map);
-
-	kfree(eq->buf_list);
-}
-
-static void hns_roce_int_mask_en(struct hns_roce_dev *hr_dev)
-{
-	int i = 0;
-	u32 aemask_val;
-	int masken = 0;
-
-	/* AEQ INT */
-	aemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG);
-	roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S,
-		     masken);
-	roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken);
-	roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, aemask_val);
-
-	/* CEQ INT */
-	for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) {
-		/* IRQ mask */
-		roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG +
-			   i * CEQ_REG_OFFSET, masken);
-	}
-}
-
-static void hns_roce_ce_int_default_cfg(struct hns_roce_dev *hr_dev)
-{
-	/* Configure ce int interval */
-	roce_write(hr_dev, ROCEE_CAEP_CE_INTERVAL_CFG_REG,
-		   HNS_ROCE_CEQ_DEFAULT_INTERVAL);
-
-	/* Configure ce int burst num */
-	roce_write(hr_dev, ROCEE_CAEP_CE_BURST_NUM_CFG_REG,
-		   HNS_ROCE_CEQ_DEFAULT_BURST_NUM);
-}
-
-int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev)
-{
-	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
-	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_eq *eq = NULL;
-	int eq_num = 0;
-	int ret = 0;
-	int i = 0;
-	int j = 0;
-
-	eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors;
-	eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL);
-	if (!eq_table->eq)
-		return -ENOMEM;
-
-	eq_table->eqc_base = kcalloc(eq_num, sizeof(*eq_table->eqc_base),
-				     GFP_KERNEL);
-	if (!eq_table->eqc_base) {
-		ret = -ENOMEM;
-		goto err_eqc_base_alloc_fail;
-	}
-
-	for (i = 0; i < eq_num; i++) {
-		eq = &eq_table->eq[i];
-		eq->hr_dev = hr_dev;
-		eq->eqn = i;
-		eq->irq = hr_dev->irq[i];
-		eq->log_page_size = PAGE_SHIFT;
-
-		if (i < hr_dev->caps.num_comp_vectors) {
-			/* CEQ */
-			eq_table->eqc_base[i] = hr_dev->reg_base +
-						ROCEE_CAEP_CEQC_SHIFT_0_REG +
-						HNS_ROCE_CEQC_REG_OFFSET * i;
-			eq->type_flag = HNS_ROCE_CEQ;
-			eq->doorbell = hr_dev->reg_base +
-				       ROCEE_CAEP_CEQC_CONS_IDX_0_REG +
-				       HNS_ROCE_CEQC_REG_OFFSET * i;
-			eq->entries = hr_dev->caps.ceqe_depth[i];
-			eq->log_entries = ilog2(eq->entries);
-			eq->eqe_size = sizeof(struct hns_roce_ceqe);
-		} else {
-			/* AEQ */
-			eq_table->eqc_base[i] = hr_dev->reg_base +
-						ROCEE_CAEP_AEQC_AEQE_SHIFT_REG;
-			eq->type_flag = HNS_ROCE_AEQ;
-			eq->doorbell = hr_dev->reg_base +
-				       ROCEE_CAEP_AEQE_CONS_IDX_REG;
-			eq->entries = hr_dev->caps.aeqe_depth;
-			eq->log_entries = ilog2(eq->entries);
-			eq->eqe_size = sizeof(struct hns_roce_aeqe);
-		}
-	}
-
-	/* Disable irq */
-	hns_roce_int_mask_en(hr_dev);
-
-	/* Configure CE irq interval and burst num */
-	hns_roce_ce_int_default_cfg(hr_dev);
-
-	for (i = 0; i < eq_num; i++) {
-		ret = hns_roce_create_eq(hr_dev, &eq_table->eq[i]);
-		if (ret) {
-			dev_err(dev, "eq create failed\n");
-			goto err_create_eq_fail;
-		}
-	}
-
-	for (j = 0; j < eq_num; j++) {
-		ret = request_irq(eq_table->eq[j].irq, hns_roce_msi_x_interrupt,
-				  0, hr_dev->irq_names[j], eq_table->eq + j);
-		if (ret) {
-			dev_err(dev, "request irq error!\n");
-			goto err_request_irq_fail;
-		}
-	}
-
-	for (i = 0; i < eq_num; i++)
-		hns_roce_enable_eq(hr_dev, i, EQ_ENABLE);
-
-	return 0;
-
-err_request_irq_fail:
-	for (j = j - 1; j >= 0; j--)
-		free_irq(eq_table->eq[j].irq, eq_table->eq + j);
-
-err_create_eq_fail:
-	for (i = i - 1; i >= 0; i--)
-		hns_roce_free_eq(hr_dev, &eq_table->eq[i]);
-
-	kfree(eq_table->eqc_base);
-
-err_eqc_base_alloc_fail:
-	kfree(eq_table->eq);
-
-	return ret;
-}
-
-void hns_roce_cleanup_eq_table(struct hns_roce_dev *hr_dev)
-{
-	int i;
-	int eq_num;
-	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
-
-	eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors;
-	for (i = 0; i < eq_num; i++) {
-		/* Disable EQ */
-		hns_roce_enable_eq(hr_dev, i, EQ_DISABLE);
-
-		free_irq(eq_table->eq[i].irq, eq_table->eq + i);
-
-		hns_roce_free_eq(hr_dev, &eq_table->eq[i]);
-	}
-
-	kfree(eq_table->eqc_base);
-	kfree(eq_table->eq);
-}

+ 0 - 134
drivers/infiniband/hw/hns/hns_roce_eq.h

@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2016 Hisilicon Limited.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _HNS_ROCE_EQ_H
-#define _HNS_ROCE_EQ_H
-
-#define HNS_ROCE_CEQ			1
-#define HNS_ROCE_AEQ			2
-
-#define HNS_ROCE_CEQ_ENTRY_SIZE		0x4
-#define HNS_ROCE_AEQ_ENTRY_SIZE		0x10
-#define HNS_ROCE_CEQC_REG_OFFSET	0x18
-
-#define HNS_ROCE_CEQ_DEFAULT_INTERVAL	0x10
-#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM	0x10
-
-#define HNS_ROCE_INT_MASK_DISABLE	0
-#define HNS_ROCE_INT_MASK_ENABLE	1
-
-#define EQ_ENABLE			1
-#define EQ_DISABLE			0
-#define CONS_INDEX_MASK			0xffff
-
-#define CEQ_REG_OFFSET			0x18
-
-enum {
-	HNS_ROCE_EQ_STAT_INVALID  = 0,
-	HNS_ROCE_EQ_STAT_VALID    = 2,
-};
-
-struct hns_roce_aeqe {
-	u32 asyn;
-	union {
-		struct {
-			u32 qp;
-			u32 rsv0;
-			u32 rsv1;
-		} qp_event;
-
-		struct {
-			u32 cq;
-			u32 rsv0;
-			u32 rsv1;
-		} cq_event;
-
-		struct {
-			u32 port;
-			u32 rsv0;
-			u32 rsv1;
-		} port_event;
-
-		struct {
-			u32 ceqe;
-			u32 rsv0;
-			u32 rsv1;
-		} ce_event;
-
-		struct {
-			__le64  out_param;
-			__le16  token;
-			u8	status;
-			u8	rsv0;
-		} __packed cmd;
-	 } event;
-};
-
-#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S 16
-#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M   \
-	(((1UL << 8) - 1) << HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S)
-
-#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S 24
-#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M   \
-	(((1UL << 7) - 1) << HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)
-
-#define HNS_ROCE_AEQE_U32_4_OWNER_S 31
-
-#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S 0
-#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M   \
-	(((1UL << 24) - 1) << HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S)
-
-#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S 25
-#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M   \
-	(((1UL << 3) - 1) << HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S)
-
-#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S 0
-#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M   \
-	(((1UL << 16) - 1) << HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S)
-
-#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S 0
-#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M   \
-	(((1UL << 5) - 1) << HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S)
-
-struct hns_roce_ceqe {
-	union {
-		int		comp;
-	} ceqe;
-};
-
-#define HNS_ROCE_CEQE_CEQE_COMP_OWNER_S	0
-
-#define HNS_ROCE_CEQE_CEQE_COMP_CQN_S 16
-#define HNS_ROCE_CEQE_CEQE_COMP_CQN_M   \
-	(((1UL << 16) - 1) << HNS_ROCE_CEQE_CEQE_COMP_CQN_S)
-
-#endif /* _HNS_ROCE_EQ_H */

+ 741 - 17
drivers/infiniband/hw/hns/hns_roce_hw_v1.c

@@ -33,6 +33,7 @@
 #include <linux/platform_device.h>
 #include <linux/acpi.h>
 #include <linux/etherdevice.h>
+#include <linux/interrupt.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <rdma/ib_umem.h>
@@ -774,7 +775,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 			goto create_lp_qp_failed;
 		}
 
-		ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, attr_mask,
+		ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, IB_QP_DEST_QPN,
 					    IB_QPS_INIT, IB_QPS_RTR);
 		if (ret) {
 			dev_err(dev, "modify qp failed(%d)!\n", ret);
@@ -1492,9 +1493,9 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
 	caps->max_sq_inline	= HNS_ROCE_V1_INLINE_SIZE;
 	caps->num_uars		= HNS_ROCE_V1_UAR_NUM;
 	caps->phy_num_uars	= HNS_ROCE_V1_PHY_UAR_NUM;
-	caps->num_aeq_vectors	= HNS_ROCE_AEQE_VEC_NUM;
-	caps->num_comp_vectors	= HNS_ROCE_COMP_VEC_NUM;
-	caps->num_other_vectors	= HNS_ROCE_AEQE_OF_VEC_NUM;
+	caps->num_aeq_vectors	= HNS_ROCE_V1_AEQE_VEC_NUM;
+	caps->num_comp_vectors	= HNS_ROCE_V1_COMP_VEC_NUM;
+	caps->num_other_vectors	= HNS_ROCE_V1_ABNORMAL_VEC_NUM;
 	caps->num_mtpts		= HNS_ROCE_V1_MAX_MTPT_NUM;
 	caps->num_mtt_segs	= HNS_ROCE_V1_MAX_MTT_SEGS;
 	caps->num_pds		= HNS_ROCE_V1_MAX_PD_NUM;
@@ -1529,10 +1530,8 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
 						 caps->num_ports + 1;
 	}
 
-	for (i = 0; i < caps->num_comp_vectors; i++)
-		caps->ceqe_depth[i] = HNS_ROCE_V1_NUM_COMP_EQE;
-
-	caps->aeqe_depth = HNS_ROCE_V1_NUM_ASYNC_EQE;
+	caps->ceqe_depth = HNS_ROCE_V1_COMP_EQE_NUM;
+	caps->aeqe_depth = HNS_ROCE_V1_ASYNC_EQE_NUM;
 	caps->local_ca_ack_delay = le32_to_cpu(roce_read(hr_dev,
 							 ROCEE_ACK_DELAY_REG));
 	caps->max_mtu = IB_MTU_2048;
@@ -2312,15 +2311,16 @@ static int hns_roce_v1_poll_one(struct hns_roce_cq *hr_cq,
 		case HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE:
 			wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
 			wc->wc_flags = IB_WC_WITH_IMM;
-			wc->ex.imm_data = le32_to_cpu(cqe->immediate_data);
+			wc->ex.imm_data =
+				cpu_to_be32(le32_to_cpu(cqe->immediate_data));
 			break;
 		case HNS_ROCE_OPCODE_SEND_DATA_RECEIVE:
 			if (roce_get_bit(cqe->cqe_byte_4,
 					 CQE_BYTE_4_IMM_INDICATOR_S)) {
 				wc->opcode = IB_WC_RECV;
 				wc->wc_flags = IB_WC_WITH_IMM;
-				wc->ex.imm_data = le32_to_cpu(
-						  cqe->immediate_data);
+				wc->ex.imm_data = cpu_to_be32(
+					le32_to_cpu(cqe->immediate_data));
 			} else {
 				wc->opcode = IB_WC_RECV;
 				wc->wc_flags = 0;
@@ -3960,6 +3960,732 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq)
 	return ret;
 }
 
+static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not)
+{
+	roce_raw_write((eq->cons_index & HNS_ROCE_V1_CONS_IDX_M) |
+		      (req_not << eq->log_entries), eq->doorbell);
+}
+
+static void hns_roce_v1_wq_catas_err_handle(struct hns_roce_dev *hr_dev,
+					    struct hns_roce_aeqe *aeqe, int qpn)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+
+	dev_warn(dev, "Local Work Queue Catastrophic Error.\n");
+	switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M,
+			       HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) {
+	case HNS_ROCE_LWQCE_QPC_ERROR:
+		dev_warn(dev, "QP %d, QPC error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_MTU_ERROR:
+		dev_warn(dev, "QP %d, MTU error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR:
+		dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_WQE_ADDR_ERROR:
+		dev_warn(dev, "QP %d, WQE addr error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR:
+		dev_warn(dev, "QP %d, WQE shift error\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_SL_ERROR:
+		dev_warn(dev, "QP %d, SL error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_PORT_ERROR:
+		dev_warn(dev, "QP %d, port error.\n", qpn);
+		break;
+	default:
+		break;
+	}
+}
+
+static void hns_roce_v1_local_wq_access_err_handle(struct hns_roce_dev *hr_dev,
+						   struct hns_roce_aeqe *aeqe,
+						   int qpn)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+
+	dev_warn(dev, "Local Access Violation Work Queue Error.\n");
+	switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M,
+			       HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) {
+	case HNS_ROCE_LAVWQE_R_KEY_VIOLATION:
+		dev_warn(dev, "QP %d, R_key violation.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_LENGTH_ERROR:
+		dev_warn(dev, "QP %d, length error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_VA_ERROR:
+		dev_warn(dev, "QP %d, VA error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_PD_ERROR:
+		dev_err(dev, "QP %d, PD error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_RW_ACC_ERROR:
+		dev_warn(dev, "QP %d, rw acc error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_KEY_STATE_ERROR:
+		dev_warn(dev, "QP %d, key state error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR:
+		dev_warn(dev, "QP %d, MR operation error.\n", qpn);
+		break;
+	default:
+		break;
+	}
+}
+
+static void hns_roce_v1_qp_err_handle(struct hns_roce_dev *hr_dev,
+				      struct hns_roce_aeqe *aeqe,
+				      int event_type)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+	int phy_port;
+	int qpn;
+
+	qpn = roce_get_field(aeqe->event.qp_event.qp,
+			     HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M,
+			     HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S);
+	phy_port = roce_get_field(aeqe->event.qp_event.qp,
+				  HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M,
+				  HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S);
+	if (qpn <= 1)
+		qpn = HNS_ROCE_MAX_PORTS * qpn + phy_port;
+
+	switch (event_type) {
+	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
+		dev_warn(dev, "Invalid Req Local Work Queue Error.\n"
+			 "QP %d, phy_port %d.\n", qpn, phy_port);
+		break;
+	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
+		hns_roce_v1_wq_catas_err_handle(hr_dev, aeqe, qpn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+		hns_roce_v1_local_wq_access_err_handle(hr_dev, aeqe, qpn);
+		break;
+	default:
+		break;
+	}
+
+	hns_roce_qp_event(hr_dev, qpn, event_type);
+}
+
+static void hns_roce_v1_cq_err_handle(struct hns_roce_dev *hr_dev,
+				      struct hns_roce_aeqe *aeqe,
+				      int event_type)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+	u32 cqn;
+
+	cqn = le32_to_cpu(roce_get_field(aeqe->event.cq_event.cq,
+			  HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M,
+			  HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S));
+
+	switch (event_type) {
+	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
+		dev_warn(dev, "CQ 0x%x access err.\n", cqn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
+		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID:
+		dev_warn(dev, "CQ 0x%x ID invalid.\n", cqn);
+		break;
+	default:
+		break;
+	}
+
+	hns_roce_cq_event(hr_dev, cqn, event_type);
+}
+
+static void hns_roce_v1_db_overflow_handle(struct hns_roce_dev *hr_dev,
+					   struct hns_roce_aeqe *aeqe)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+
+	switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M,
+			       HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) {
+	case HNS_ROCE_DB_SUBTYPE_SDB_OVF:
+		dev_warn(dev, "SDB overflow.\n");
+		break;
+	case HNS_ROCE_DB_SUBTYPE_SDB_ALM_OVF:
+		dev_warn(dev, "SDB almost overflow.\n");
+		break;
+	case HNS_ROCE_DB_SUBTYPE_SDB_ALM_EMP:
+		dev_warn(dev, "SDB almost empty.\n");
+		break;
+	case HNS_ROCE_DB_SUBTYPE_ODB_OVF:
+		dev_warn(dev, "ODB overflow.\n");
+		break;
+	case HNS_ROCE_DB_SUBTYPE_ODB_ALM_OVF:
+		dev_warn(dev, "ODB almost overflow.\n");
+		break;
+	case HNS_ROCE_DB_SUBTYPE_ODB_ALM_EMP:
+		dev_warn(dev, "SDB almost empty.\n");
+		break;
+	default:
+		break;
+	}
+}
+
+static struct hns_roce_aeqe *get_aeqe_v1(struct hns_roce_eq *eq, u32 entry)
+{
+	unsigned long off = (entry & (eq->entries - 1)) *
+			     HNS_ROCE_AEQ_ENTRY_SIZE;
+
+	return (struct hns_roce_aeqe *)((u8 *)
+		(eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) +
+		off % HNS_ROCE_BA_SIZE);
+}
+
+static struct hns_roce_aeqe *next_aeqe_sw_v1(struct hns_roce_eq *eq)
+{
+	struct hns_roce_aeqe *aeqe = get_aeqe_v1(eq, eq->cons_index);
+
+	return (roce_get_bit(aeqe->asyn, HNS_ROCE_AEQE_U32_4_OWNER_S) ^
+		!!(eq->cons_index & eq->entries)) ? aeqe : NULL;
+}
+
+static int hns_roce_v1_aeq_int(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_eq *eq)
+{
+	struct device *dev = &hr_dev->pdev->dev;
+	struct hns_roce_aeqe *aeqe;
+	int aeqes_found = 0;
+	int event_type;
+
+	while ((aeqe = next_aeqe_sw_v1(eq))) {
+
+		/* Make sure we read the AEQ entry after we have checked the
+		 * ownership bit
+		 */
+		dma_rmb();
+
+		dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe,
+			roce_get_field(aeqe->asyn,
+				       HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M,
+				       HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S));
+		event_type = roce_get_field(aeqe->asyn,
+					    HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M,
+					    HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S);
+		switch (event_type) {
+		case HNS_ROCE_EVENT_TYPE_PATH_MIG:
+			dev_warn(dev, "PATH MIG not supported\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_COMM_EST:
+			dev_warn(dev, "COMMUNICATION established\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
+			dev_warn(dev, "SQ DRAINED not supported\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
+			dev_warn(dev, "PATH MIG failed\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
+		case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
+		case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+			hns_roce_v1_qp_err_handle(hr_dev, aeqe, event_type);
+			break;
+		case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
+		case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+		case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
+			dev_warn(dev, "SRQ not support!\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
+		case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
+		case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID:
+			hns_roce_v1_cq_err_handle(hr_dev, aeqe, event_type);
+			break;
+		case HNS_ROCE_EVENT_TYPE_PORT_CHANGE:
+			dev_warn(dev, "port change.\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_MB:
+			hns_roce_cmd_event(hr_dev,
+					   le16_to_cpu(aeqe->event.cmd.token),
+					   aeqe->event.cmd.status,
+					   le64_to_cpu(aeqe->event.cmd.out_param
+					   ));
+			break;
+		case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
+			hns_roce_v1_db_overflow_handle(hr_dev, aeqe);
+			break;
+		case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW:
+			dev_warn(dev, "CEQ 0x%lx overflow.\n",
+			roce_get_field(aeqe->event.ce_event.ceqe,
+				     HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M,
+				     HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S));
+			break;
+		default:
+			dev_warn(dev, "Unhandled event %d on EQ %d at idx %u.\n",
+				 event_type, eq->eqn, eq->cons_index);
+			break;
+		}
+
+		eq->cons_index++;
+		aeqes_found = 1;
+
+		if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) {
+			dev_warn(dev, "cons_index overflow, set back to 0.\n");
+			eq->cons_index = 0;
+		}
+	}
+
+	set_eq_cons_index_v1(eq, 0);
+
+	return aeqes_found;
+}
+
+static struct hns_roce_ceqe *get_ceqe_v1(struct hns_roce_eq *eq, u32 entry)
+{
+	unsigned long off = (entry & (eq->entries - 1)) *
+			     HNS_ROCE_CEQ_ENTRY_SIZE;
+
+	return (struct hns_roce_ceqe *)((u8 *)
+			(eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) +
+			off % HNS_ROCE_BA_SIZE);
+}
+
+static struct hns_roce_ceqe *next_ceqe_sw_v1(struct hns_roce_eq *eq)
+{
+	struct hns_roce_ceqe *ceqe = get_ceqe_v1(eq, eq->cons_index);
+
+	return (!!(roce_get_bit(ceqe->comp,
+		HNS_ROCE_CEQE_CEQE_COMP_OWNER_S))) ^
+		(!!(eq->cons_index & eq->entries)) ? ceqe : NULL;
+}
+
+static int hns_roce_v1_ceq_int(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_eq *eq)
+{
+	struct hns_roce_ceqe *ceqe;
+	int ceqes_found = 0;
+	u32 cqn;
+
+	while ((ceqe = next_ceqe_sw_v1(eq))) {
+
+		/* Make sure we read CEQ entry after we have checked the
+		 * ownership bit
+		 */
+		dma_rmb();
+
+		cqn = roce_get_field(ceqe->comp,
+				     HNS_ROCE_CEQE_CEQE_COMP_CQN_M,
+				     HNS_ROCE_CEQE_CEQE_COMP_CQN_S);
+		hns_roce_cq_completion(hr_dev, cqn);
+
+		++eq->cons_index;
+		ceqes_found = 1;
+
+		if (eq->cons_index > 2 * hr_dev->caps.ceqe_depth - 1) {
+			dev_warn(&eq->hr_dev->pdev->dev,
+				"cons_index overflow, set back to 0.\n");
+			eq->cons_index = 0;
+		}
+	}
+
+	set_eq_cons_index_v1(eq, 0);
+
+	return ceqes_found;
+}
+
+static irqreturn_t hns_roce_v1_msix_interrupt_eq(int irq, void *eq_ptr)
+{
+	struct hns_roce_eq  *eq  = eq_ptr;
+	struct hns_roce_dev *hr_dev = eq->hr_dev;
+	int int_work = 0;
+
+	if (eq->type_flag == HNS_ROCE_CEQ)
+		/* CEQ irq routine, CEQ is pulse irq, not clear */
+		int_work = hns_roce_v1_ceq_int(hr_dev, eq);
+	else
+		/* AEQ irq routine, AEQ is pulse irq, not clear */
+		int_work = hns_roce_v1_aeq_int(hr_dev, eq);
+
+	return IRQ_RETVAL(int_work);
+}
+
+static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id)
+{
+	struct hns_roce_dev *hr_dev = dev_id;
+	struct device *dev = &hr_dev->pdev->dev;
+	int int_work = 0;
+	u32 caepaemask_val;
+	u32 cealmovf_val;
+	u32 caepaest_val;
+	u32 aeshift_val;
+	u32 ceshift_val;
+	u32 cemask_val;
+	int i;
+
+	/*
+	 * Abnormal interrupt:
+	 * AEQ overflow, ECC multi-bit err, CEQ overflow must clear
+	 * interrupt, mask irq, clear irq, cancel mask operation
+	 */
+	aeshift_val = roce_read(hr_dev, ROCEE_CAEP_AEQC_AEQE_SHIFT_REG);
+
+	/* AEQE overflow */
+	if (roce_get_bit(aeshift_val,
+		ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S) == 1) {
+		dev_warn(dev, "AEQ overflow!\n");
+
+		/* Set mask */
+		caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG);
+		roce_set_bit(caepaemask_val,
+			     ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S,
+			     HNS_ROCE_INT_MASK_ENABLE);
+		roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val);
+
+		/* Clear int state(INT_WC : write 1 clear) */
+		caepaest_val = roce_read(hr_dev, ROCEE_CAEP_AE_ST_REG);
+		roce_set_bit(caepaest_val,
+			     ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1);
+		roce_write(hr_dev, ROCEE_CAEP_AE_ST_REG, caepaest_val);
+
+		/* Clear mask */
+		caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG);
+		roce_set_bit(caepaemask_val,
+			     ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S,
+			     HNS_ROCE_INT_MASK_DISABLE);
+		roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val);
+	}
+
+	/* CEQ almost overflow */
+	for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) {
+		ceshift_val = roce_read(hr_dev, ROCEE_CAEP_CEQC_SHIFT_0_REG +
+					i * CEQ_REG_OFFSET);
+
+		if (roce_get_bit(ceshift_val,
+			ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S) == 1) {
+			dev_warn(dev, "CEQ[%d] almost overflow!\n", i);
+			int_work++;
+
+			/* Set mask */
+			cemask_val = roce_read(hr_dev,
+					       ROCEE_CAEP_CE_IRQ_MASK_0_REG +
+					       i * CEQ_REG_OFFSET);
+			roce_set_bit(cemask_val,
+				ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S,
+				HNS_ROCE_INT_MASK_ENABLE);
+			roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG +
+				   i * CEQ_REG_OFFSET, cemask_val);
+
+			/* Clear int state(INT_WC : write 1 clear) */
+			cealmovf_val = roce_read(hr_dev,
+				       ROCEE_CAEP_CEQ_ALM_OVF_0_REG +
+				       i * CEQ_REG_OFFSET);
+			roce_set_bit(cealmovf_val,
+				     ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S,
+				     1);
+			roce_write(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG +
+				   i * CEQ_REG_OFFSET, cealmovf_val);
+
+			/* Clear mask */
+			cemask_val = roce_read(hr_dev,
+				     ROCEE_CAEP_CE_IRQ_MASK_0_REG +
+				     i * CEQ_REG_OFFSET);
+			roce_set_bit(cemask_val,
+			       ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S,
+			       HNS_ROCE_INT_MASK_DISABLE);
+			roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG +
+				   i * CEQ_REG_OFFSET, cemask_val);
+		}
+	}
+
+	/* ECC multi-bit error alarm */
+	dev_warn(dev, "ECC UCERR ALARM: 0x%x, 0x%x, 0x%x\n",
+		 roce_read(hr_dev, ROCEE_ECC_UCERR_ALM0_REG),
+		 roce_read(hr_dev, ROCEE_ECC_UCERR_ALM1_REG),
+		 roce_read(hr_dev, ROCEE_ECC_UCERR_ALM2_REG));
+
+	dev_warn(dev, "ECC CERR ALARM: 0x%x, 0x%x, 0x%x\n",
+		 roce_read(hr_dev, ROCEE_ECC_CERR_ALM0_REG),
+		 roce_read(hr_dev, ROCEE_ECC_CERR_ALM1_REG),
+		 roce_read(hr_dev, ROCEE_ECC_CERR_ALM2_REG));
+
+	return IRQ_RETVAL(int_work);
+}
+
+static void hns_roce_v1_int_mask_enable(struct hns_roce_dev *hr_dev)
+{
+	u32 aemask_val;
+	int masken = 0;
+	int i;
+
+	/* AEQ INT */
+	aemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG);
+	roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S,
+		     masken);
+	roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken);
+	roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, aemask_val);
+
+	/* CEQ INT */
+	for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) {
+		/* IRQ mask */
+		roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG +
+			   i * CEQ_REG_OFFSET, masken);
+	}
+}
+
+static void hns_roce_v1_free_eq(struct hns_roce_dev *hr_dev,
+				struct hns_roce_eq *eq)
+{
+	int npages = (PAGE_ALIGN(eq->eqe_size * eq->entries) +
+		      HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE;
+	int i;
+
+	if (!eq->buf_list)
+		return;
+
+	for (i = 0; i < npages; ++i)
+		dma_free_coherent(&hr_dev->pdev->dev, HNS_ROCE_BA_SIZE,
+				  eq->buf_list[i].buf, eq->buf_list[i].map);
+
+	kfree(eq->buf_list);
+}
+
+static void hns_roce_v1_enable_eq(struct hns_roce_dev *hr_dev, int eq_num,
+				  int enable_flag)
+{
+	void __iomem *eqc = hr_dev->eq_table.eqc_base[eq_num];
+	u32 val;
+
+	val = readl(eqc);
+
+	if (enable_flag)
+		roce_set_field(val,
+			       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M,
+			       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S,
+			       HNS_ROCE_EQ_STAT_VALID);
+	else
+		roce_set_field(val,
+			       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M,
+			       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S,
+			       HNS_ROCE_EQ_STAT_INVALID);
+	writel(val, eqc);
+}
+
+static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev,
+				 struct hns_roce_eq *eq)
+{
+	void __iomem *eqc = hr_dev->eq_table.eqc_base[eq->eqn];
+	struct device *dev = &hr_dev->pdev->dev;
+	dma_addr_t tmp_dma_addr;
+	u32 eqconsindx_val = 0;
+	u32 eqcuridx_val = 0;
+	u32 eqshift_val = 0;
+	int num_bas;
+	int ret;
+	int i;
+
+	num_bas = (PAGE_ALIGN(eq->entries * eq->eqe_size) +
+		   HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE;
+
+	if ((eq->entries * eq->eqe_size) > HNS_ROCE_BA_SIZE) {
+		dev_err(dev, "[error]eq buf %d gt ba size(%d) need bas=%d\n",
+			(eq->entries * eq->eqe_size), HNS_ROCE_BA_SIZE,
+			num_bas);
+		return -EINVAL;
+	}
+
+	eq->buf_list = kcalloc(num_bas, sizeof(*eq->buf_list), GFP_KERNEL);
+	if (!eq->buf_list)
+		return -ENOMEM;
+
+	for (i = 0; i < num_bas; ++i) {
+		eq->buf_list[i].buf = dma_alloc_coherent(dev, HNS_ROCE_BA_SIZE,
+							 &tmp_dma_addr,
+							 GFP_KERNEL);
+		if (!eq->buf_list[i].buf) {
+			ret = -ENOMEM;
+			goto err_out_free_pages;
+		}
+
+		eq->buf_list[i].map = tmp_dma_addr;
+		memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE);
+	}
+	eq->cons_index = 0;
+	roce_set_field(eqshift_val,
+		       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M,
+		       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S,
+		       HNS_ROCE_EQ_STAT_INVALID);
+	roce_set_field(eqshift_val,
+		       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M,
+		       ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S,
+		       eq->log_entries);
+	writel(eqshift_val, eqc);
+
+	/* Configure EQ extended address bits 12~44 */
+	writel((u32)(eq->buf_list[0].map >> 12), eqc + 4);
+
+	/*
+	 * Configure EQ extended address bits 45~49.
+	 * 44 = 32 + 12: the address is shifted right by 12 because the
+	 * hardware takes a 4K-page-aligned address, and by a further 32
+	 * because this register carries the high 32 bits of that value.
+	 */
+	roce_set_field(eqcuridx_val, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M,
+		       ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S,
+		       eq->buf_list[0].map >> 44);
+	roce_set_field(eqcuridx_val,
+		       ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M,
+		       ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S, 0);
+	writel(eqcuridx_val, eqc + 8);
+
+	/* Configure eq consumer index */
+	roce_set_field(eqconsindx_val,
+		       ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M,
+		       ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S, 0);
+	writel(eqconsindx_val, eqc + 0xc);
+
+	return 0;
+
+err_out_free_pages:
+	for (i -= 1; i >= 0; i--)
+		dma_free_coherent(dev, HNS_ROCE_BA_SIZE, eq->buf_list[i].buf,
+				  eq->buf_list[i].map);
+
+	kfree(eq->buf_list);
+	return ret;
+}
+
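The two register writes in hns_roce_v1_create_eq() split a single DMA address: the low word is the address shifted right by 12, and the high field is the same address shifted right by 44. A standalone sketch of the split and its reassembly, using a made-up address:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t map = 0x123456789000ULL;  /* hypothetical 4K-aligned DMA addr */

        uint32_t lo = (uint32_t)(map >> 12);   /* address bits 12..43 */
        uint32_t hi = (uint32_t)(map >> 44);   /* address bits 44 and up */

        printf("lo=0x%08x hi=0x%x\n", lo, hi);
        printf("reassembled=0x%llx\n",
               (unsigned long long)(((uint64_t)hi << 44) |
                                    ((uint64_t)lo << 12)));
        return 0;
    }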
+static int hns_roce_v1_init_eq_table(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
+	struct device *dev = &hr_dev->pdev->dev;
+	struct hns_roce_eq *eq;
+	int irq_num;
+	int eq_num;
+	int ret;
+	int i, j;
+
+	eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors;
+	irq_num = eq_num + hr_dev->caps.num_other_vectors;
+
+	eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL);
+	if (!eq_table->eq)
+		return -ENOMEM;
+
+	eq_table->eqc_base = kcalloc(eq_num, sizeof(*eq_table->eqc_base),
+				     GFP_KERNEL);
+	if (!eq_table->eqc_base) {
+		ret = -ENOMEM;
+		goto err_eqc_base_alloc_fail;
+	}
+
+	for (i = 0; i < eq_num; i++) {
+		eq = &eq_table->eq[i];
+		eq->hr_dev = hr_dev;
+		eq->eqn = i;
+		eq->irq = hr_dev->irq[i];
+		eq->log_page_size = PAGE_SHIFT;
+
+		if (i < hr_dev->caps.num_comp_vectors) {
+			/* CEQ */
+			eq_table->eqc_base[i] = hr_dev->reg_base +
+						ROCEE_CAEP_CEQC_SHIFT_0_REG +
+						CEQ_REG_OFFSET * i;
+			eq->type_flag = HNS_ROCE_CEQ;
+			eq->doorbell = hr_dev->reg_base +
+				       ROCEE_CAEP_CEQC_CONS_IDX_0_REG +
+				       CEQ_REG_OFFSET * i;
+			eq->entries = hr_dev->caps.ceqe_depth;
+			eq->log_entries = ilog2(eq->entries);
+			eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE;
+		} else {
+			/* AEQ */
+			eq_table->eqc_base[i] = hr_dev->reg_base +
+						ROCEE_CAEP_AEQC_AEQE_SHIFT_REG;
+			eq->type_flag = HNS_ROCE_AEQ;
+			eq->doorbell = hr_dev->reg_base +
+				       ROCEE_CAEP_AEQE_CONS_IDX_REG;
+			eq->entries = hr_dev->caps.aeqe_depth;
+			eq->log_entries = ilog2(eq->entries);
+			eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE;
+		}
+	}
+
+	/* Disable irq */
+	hns_roce_v1_int_mask_enable(hr_dev);
+
+	/* Configure ce int interval */
+	roce_write(hr_dev, ROCEE_CAEP_CE_INTERVAL_CFG_REG,
+		   HNS_ROCE_CEQ_DEFAULT_INTERVAL);
+
+	/* Configure ce int burst num */
+	roce_write(hr_dev, ROCEE_CAEP_CE_BURST_NUM_CFG_REG,
+		   HNS_ROCE_CEQ_DEFAULT_BURST_NUM);
+
+	for (i = 0; i < eq_num; i++) {
+		ret = hns_roce_v1_create_eq(hr_dev, &eq_table->eq[i]);
+		if (ret) {
+			dev_err(dev, "eq create failed\n");
+			goto err_create_eq_fail;
+		}
+	}
+
+	for (j = 0; j < irq_num; j++) {
+		if (j < eq_num)
+			ret = request_irq(hr_dev->irq[j],
+					  hns_roce_v1_msix_interrupt_eq, 0,
+					  hr_dev->irq_names[j],
+					  &eq_table->eq[j]);
+		else
+			ret = request_irq(hr_dev->irq[j],
+					  hns_roce_v1_msix_interrupt_abn, 0,
+					  hr_dev->irq_names[j], hr_dev);
+
+		if (ret) {
+			dev_err(dev, "request irq error!\n");
+			goto err_request_irq_fail;
+		}
+	}
+
+	for (i = 0; i < eq_num; i++)
+		hns_roce_v1_enable_eq(hr_dev, i, EQ_ENABLE);
+
+	return 0;
+
+err_request_irq_fail:
+	for (j -= 1; j >= 0; j--)
+		free_irq(hr_dev->irq[j], &eq_table->eq[j]);
+
+err_create_eq_fail:
+	for (i -= 1; i >= 0; i--)
+		hns_roce_v1_free_eq(hr_dev, &eq_table->eq[i]);
+
+	kfree(eq_table->eqc_base);
+
+err_eqc_base_alloc_fail:
+	kfree(eq_table->eq);
+
+	return ret;
+}
+
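Both error paths in hns_roce_v1_init_eq_table() unwind with `for (j -= 1; j >= 0; j--)`, releasing only what was successfully acquired, newest first. A generic sketch of that rollback pattern (acquire() here is a hypothetical stand-in for hns_roce_v1_create_eq()/request_irq()):

    #include <stdio.h>
    #include <stdlib.h>

    #define N 4

    static void *acquire(int i)
    {
        return (i == 2) ? NULL : malloc(16);    /* fail on the 3rd */
    }

    int main(void)
    {
        void *res[N];
        int i;

        for (i = 0; i < N; i++) {
            res[i] = acquire(i);
            if (!res[i])
                goto err_unwind;
        }
        return 0;

    err_unwind:
        /* Release only what was acquired, newest first. */
        for (i -= 1; i >= 0; i--)
            free(res[i]);
        fprintf(stderr, "acquire failed, rolled back\n");
        return 1;
    }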
+static void hns_roce_v1_cleanup_eq_table(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
+	int irq_num;
+	int eq_num;
+	int i;
+
+	eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors;
+	irq_num = eq_num + hr_dev->caps.num_other_vectors;
+	for (i = 0; i < eq_num; i++) {
+		/* Disable EQ */
+		hns_roce_v1_enable_eq(hr_dev, i, EQ_DISABLE);
+
+		free_irq(hr_dev->irq[i], &eq_table->eq[i]);
+
+		hns_roce_v1_free_eq(hr_dev, &eq_table->eq[i]);
+	}
+	for (i = eq_num; i < irq_num; i++)
+		free_irq(hr_dev->irq[i], hr_dev);
+
+	kfree(eq_table->eqc_base);
+	kfree(eq_table->eq);
+}
+
 static const struct hns_roce_hw hns_roce_hw_v1 = {
 	.reset = hns_roce_v1_reset,
 	.hw_profile = hns_roce_v1_profile,
@@ -3983,6 +4709,8 @@ static const struct hns_roce_hw hns_roce_hw_v1 = {
 	.poll_cq = hns_roce_v1_poll_cq,
 	.dereg_mr = hns_roce_v1_dereg_mr,
 	.destroy_cq = hns_roce_v1_destroy_cq,
+	.init_eq = hns_roce_v1_init_eq_table,
+	.cleanup_eq = hns_roce_v1_cleanup_eq_table,
 };
 
 static const struct of_device_id hns_roce_of_match[] = {
@@ -4060,10 +4788,6 @@ static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev)
 
 	/* get the mapped register base address */
 	res = platform_get_resource(hr_dev->pdev, IORESOURCE_MEM, 0);
-	if (!res) {
-		dev_err(dev, "memory resource not found!\n");
-		return -EINVAL;
-	}
 	hr_dev->reg_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(hr_dev->reg_base))
 		return PTR_ERR(hr_dev->reg_base);
@@ -4132,14 +4856,14 @@ static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev)
 	/* read the interrupt names from the DT or ACPI */
 	ret = device_property_read_string_array(dev, "interrupt-names",
 						hr_dev->irq_names,
-						HNS_ROCE_MAX_IRQ_NUM);
+						HNS_ROCE_V1_MAX_IRQ_NUM);
 	if (ret < 0) {
 		dev_err(dev, "couldn't get interrupt names from DT or ACPI!\n");
 		return ret;
 	}
 
 	/* fetch the interrupt numbers */
-	for (i = 0; i < HNS_ROCE_MAX_IRQ_NUM; i++) {
+	for (i = 0; i < HNS_ROCE_V1_MAX_IRQ_NUM; i++) {
 		hr_dev->irq[i] = platform_get_irq(hr_dev->pdev, i);
 		if (hr_dev->irq[i] <= 0) {
 			dev_err(dev, "platform get of irq[=%d] failed!\n", i);

+ 42 - 2
drivers/infiniband/hw/hns/hns_roce_hw_v1.h

@@ -60,8 +60,13 @@
 #define HNS_ROCE_V1_GID_NUM				16
 #define HNS_ROCE_V1_RESV_QP				8
 
-#define HNS_ROCE_V1_NUM_COMP_EQE			0x8000
-#define HNS_ROCE_V1_NUM_ASYNC_EQE			0x400
+#define HNS_ROCE_V1_MAX_IRQ_NUM				34
+#define HNS_ROCE_V1_COMP_VEC_NUM			32
+#define HNS_ROCE_V1_AEQE_VEC_NUM			1
+#define HNS_ROCE_V1_ABNORMAL_VEC_NUM			1
+
+#define HNS_ROCE_V1_COMP_EQE_NUM			0x8000
+#define HNS_ROCE_V1_ASYNC_EQE_NUM			0x400
 
 #define HNS_ROCE_V1_QPC_ENTRY_SIZE			256
 #define HNS_ROCE_V1_IRRL_ENTRY_SIZE			8
@@ -159,6 +164,41 @@
 #define SDB_INV_CNT_OFFSET				8
 #define SDB_ST_CMP_VAL					8
 
+#define HNS_ROCE_CEQ_DEFAULT_INTERVAL			0x10
+#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM			0x10
+
+#define HNS_ROCE_INT_MASK_DISABLE			0
+#define HNS_ROCE_INT_MASK_ENABLE			1
+
+#define CEQ_REG_OFFSET					0x18
+
+#define HNS_ROCE_CEQE_CEQE_COMP_OWNER_S	0
+
+#define HNS_ROCE_V1_CONS_IDX_M GENMASK(15, 0)
+
+#define HNS_ROCE_CEQE_CEQE_COMP_CQN_S 16
+#define HNS_ROCE_CEQE_CEQE_COMP_CQN_M GENMASK(31, 16)
+
+#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S 16
+#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M GENMASK(23, 16)
+
+#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S 24
+#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M GENMASK(30, 24)
+
+#define HNS_ROCE_AEQE_U32_4_OWNER_S 31
+
+#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S 0
+#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M GENMASK(23, 0)
+
+#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S 25
+#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M GENMASK(27, 25)
+
+#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S 0
+#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M GENMASK(15, 0)
+
+#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S 0
+#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M GENMASK(4, 0)
+
 struct hns_roce_cq_context {
 	u32 cqc_byte_4;
 	u32 cq_bt_l;

+ 1655 - 182
drivers/infiniband/hw/hns/hns_roce_hw_v2.c

@@ -34,6 +34,7 @@
 #include <linux/etherdevice.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
+#include <net/addrconf.h>
 #include <rdma/ib_umem.h>
 
 #include "hnae3.h"
@@ -51,32 +52,106 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
 	dseg->len  = cpu_to_le32(sg->length);
 }
 
+static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			     struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
+			     void *wqe, unsigned int *sge_ind,
+			     struct ib_send_wr **bad_wr)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_v2_wqe_data_seg *dseg = wqe;
+	struct hns_roce_qp *qp = to_hr_qp(ibqp);
+	int i;
+
+	if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+		if (rc_sq_wqe->msg_len > hr_dev->caps.max_sq_inline) {
+			*bad_wr = wr;
+			dev_err(hr_dev->dev, "inline len(1-%d)=%d, illegal",
+				hr_dev->caps.max_sq_inline, rc_sq_wqe->msg_len);
+			return -EINVAL;
+		}
+
+		for (i = 0; i < wr->num_sge; i++) {
+			memcpy(wqe, ((void *)wr->sg_list[i].addr),
+			       wr->sg_list[i].length);
+			wqe += wr->sg_list[i].length;
+		}
+
+		roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S,
+			     1);
+	} else {
+		if (wr->num_sge <= 2) {
+			for (i = 0; i < wr->num_sge; i++) {
+				if (likely(wr->sg_list[i].length)) {
+					set_data_seg_v2(dseg, wr->sg_list + i);
+					dseg++;
+				}
+			}
+		} else {
+			roce_set_field(rc_sq_wqe->byte_20,
+				     V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
+				     V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
+				     (*sge_ind) & (qp->sge.sge_cnt - 1));
+
+			for (i = 0; i < 2; i++) {
+				if (likely(wr->sg_list[i].length)) {
+					set_data_seg_v2(dseg, wr->sg_list + i);
+					dseg++;
+				}
+			}
+
+			dseg = get_send_extend_sge(qp,
+					    (*sge_ind) & (qp->sge.sge_cnt - 1));
+
+			for (i = 0; i < wr->num_sge - 2; i++) {
+				if (likely(wr->sg_list[i + 2].length)) {
+					set_data_seg_v2(dseg,
+							wr->sg_list + 2 + i);
+					dseg++;
+					(*sge_ind)++;
+				}
+			}
+		}
+
+		roce_set_field(rc_sq_wqe->byte_16,
+			       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
+			       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, wr->num_sge);
+	}
+
+	return 0;
+}
+
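set_rwqe_data_seg() now owns the data-placement cases for an RC WQE: small inline sends are copied into the WQE body, up to two SGEs are placed directly in the WQE, and any further SGEs spill into the extended SGE area. A toy sketch of that dispatch (sizes and names are illustrative only):

    #include <stdio.h>

    #define WQE_SGES 2  /* data segments that fit in the base RC WQE */

    static void place_sges(int num_sge, int inline_send)
    {
        int i;

        if (inline_send) {
            printf("copy %d buffers into the WQE body\n", num_sge);
            return;
        }
        for (i = 0; i < num_sge && i < WQE_SGES; i++)
            printf("sge %d -> base WQE\n", i);
        for (; i < num_sge; i++)
            printf("sge %d -> extended SGE area\n", i);
    }

    int main(void)
    {
        place_sges(4, 0);   /* 2 in the WQE, 2 in the extended area */
        place_sges(2, 1);   /* inline copy */
        return 0;
    }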
 static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 				 struct ib_send_wr **bad_wr)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah);
+	struct hns_roce_v2_ud_send_wqe *ud_sq_wqe;
 	struct hns_roce_v2_rc_send_wqe *rc_sq_wqe;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
 	struct hns_roce_v2_wqe_data_seg *dseg;
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_v2_db sq_db;
 	unsigned int sge_ind = 0;
-	unsigned int wqe_sz = 0;
 	unsigned int owner_bit;
 	unsigned long flags;
 	unsigned int ind;
 	void *wqe = NULL;
+	bool loopback;
 	int ret = 0;
+	u8 *smac;
 	int nreq;
 	int i;
 
-	if (unlikely(ibqp->qp_type != IB_QPT_RC)) {
+	if (unlikely(ibqp->qp_type != IB_QPT_RC &&
+		     ibqp->qp_type != IB_QPT_GSI &&
+		     ibqp->qp_type != IB_QPT_UD)) {
 		dev_err(dev, "Not supported QP(0x%x)type!\n", ibqp->qp_type);
 		*bad_wr = NULL;
 		return -EOPNOTSUPP;
 	}
 
-	if (unlikely(qp->state != IB_QPS_RTS && qp->state != IB_QPS_SQD)) {
+	if (unlikely(qp->state == IB_QPS_RESET || qp->state == IB_QPS_INIT ||
+		     qp->state == IB_QPS_RTR)) {
 		dev_err(dev, "Post WQE fail, QP state %d err!\n", qp->state);
 		*bad_wr = wr;
 		return -EINVAL;
@@ -106,161 +181,255 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 								      wr->wr_id;
 
 		owner_bit = ~(qp->sq.head >> ilog2(qp->sq.wqe_cnt)) & 0x1;
-		rc_sq_wqe = wqe;
-		memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe));
-		for (i = 0; i < wr->num_sge; i++)
-			rc_sq_wqe->msg_len += wr->sg_list[i].length;
 
-		rc_sq_wqe->inv_key_immtdata = send_ieth(wr);
+		/* Corresponding to the QP type, wqe process separately */
+		if (ibqp->qp_type == IB_QPT_GSI) {
+			ud_sq_wqe = wqe;
+			memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe));
+
+			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_0_M,
+				       V2_UD_SEND_WQE_DMAC_0_S, ah->av.mac[0]);
+			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_1_M,
+				       V2_UD_SEND_WQE_DMAC_1_S, ah->av.mac[1]);
+			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_2_M,
+				       V2_UD_SEND_WQE_DMAC_2_S, ah->av.mac[2]);
+			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_3_M,
+				       V2_UD_SEND_WQE_DMAC_3_S, ah->av.mac[3]);
+			roce_set_field(ud_sq_wqe->byte_48,
+				       V2_UD_SEND_WQE_BYTE_48_DMAC_4_M,
+				       V2_UD_SEND_WQE_BYTE_48_DMAC_4_S,
+				       ah->av.mac[4]);
+			roce_set_field(ud_sq_wqe->byte_48,
+				       V2_UD_SEND_WQE_BYTE_48_DMAC_5_M,
+				       V2_UD_SEND_WQE_BYTE_48_DMAC_5_S,
+				       ah->av.mac[5]);
+
+			/* MAC loopback */
+			smac = (u8 *)hr_dev->dev_addr[qp->port];
+			loopback = ether_addr_equal_unaligned(ah->av.mac,
+							      smac) ? 1 : 0;
+
+			roce_set_bit(ud_sq_wqe->byte_40,
+				     V2_UD_SEND_WQE_BYTE_40_LBI_S, loopback);
+
+			roce_set_field(ud_sq_wqe->byte_4,
+				       V2_UD_SEND_WQE_BYTE_4_OPCODE_M,
+				       V2_UD_SEND_WQE_BYTE_4_OPCODE_S,
+				       HNS_ROCE_V2_WQE_OP_SEND);
 
-		roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_FENCE_S,
-			    (wr->send_flags & IB_SEND_FENCE) ? 1 : 0);
+			for (i = 0; i < wr->num_sge; i++)
+				ud_sq_wqe->msg_len += wr->sg_list[i].length;
 
-		roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SE_S,
-			    (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
+			ud_sq_wqe->immtdata = send_ieth(wr);
 
-		roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_CQE_S,
-			    (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
+			/* Set sig attr */
+			roce_set_bit(ud_sq_wqe->byte_4,
+				   V2_UD_SEND_WQE_BYTE_4_CQE_S,
+				   (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
 
-		roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S,
-			     owner_bit);
+			/* Set se attr */
+			roce_set_bit(ud_sq_wqe->byte_4,
+				  V2_UD_SEND_WQE_BYTE_4_SE_S,
+				  (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
 
-		switch (wr->opcode) {
-		case IB_WR_RDMA_READ:
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_RDMA_READ);
-			rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey);
-			rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr);
-			break;
-		case IB_WR_RDMA_WRITE:
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_RDMA_WRITE);
-			rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey);
-			rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr);
-			break;
-		case IB_WR_RDMA_WRITE_WITH_IMM:
-			roce_set_field(rc_sq_wqe->byte_4,
+			roce_set_bit(ud_sq_wqe->byte_4,
+				     V2_UD_SEND_WQE_BYTE_4_OWNER_S, owner_bit);
+
+			roce_set_field(ud_sq_wqe->byte_16,
+				       V2_UD_SEND_WQE_BYTE_16_PD_M,
+				       V2_UD_SEND_WQE_BYTE_16_PD_S,
+				       to_hr_pd(ibqp->pd)->pdn);
+
+			roce_set_field(ud_sq_wqe->byte_16,
+				       V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M,
+				       V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S,
+				       wr->num_sge);
+
+			roce_set_field(ud_sq_wqe->byte_20,
+				     V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
+				     V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
+				     sge_ind & (qp->sge.sge_cnt - 1));
+
+			roce_set_field(ud_sq_wqe->byte_24,
+				       V2_UD_SEND_WQE_BYTE_24_UDPSPN_M,
+				       V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, 0);
+			ud_sq_wqe->qkey =
+			     (ud_wr(wr)->remote_qkey & 0x80000000) ?
+			     qp->qkey : ud_wr(wr)->remote_qkey;
+			roce_set_field(ud_sq_wqe->byte_32,
+				       V2_UD_SEND_WQE_BYTE_32_DQPN_M,
+				       V2_UD_SEND_WQE_BYTE_32_DQPN_S,
+				       ud_wr(wr)->remote_qpn);
+
+			roce_set_field(ud_sq_wqe->byte_36,
+				       V2_UD_SEND_WQE_BYTE_36_VLAN_M,
+				       V2_UD_SEND_WQE_BYTE_36_VLAN_S,
+				       ah->av.vlan);
+			roce_set_field(ud_sq_wqe->byte_36,
+				       V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M,
+				       V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S,
+				       ah->av.hop_limit);
+			roce_set_field(ud_sq_wqe->byte_36,
+				       V2_UD_SEND_WQE_BYTE_36_TCLASS_M,
+				       V2_UD_SEND_WQE_BYTE_36_TCLASS_S,
+				       0);
+			roce_set_field(ud_sq_wqe->byte_40,
+				       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M,
+				       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, 0);
+			roce_set_field(ud_sq_wqe->byte_40,
+				       V2_UD_SEND_WQE_BYTE_40_SL_M,
+				       V2_UD_SEND_WQE_BYTE_40_SL_S,
+				       ah->av.sl_tclass_flowlabel >>
+				       HNS_ROCE_SL_SHIFT);
+			roce_set_field(ud_sq_wqe->byte_40,
+				       V2_UD_SEND_WQE_BYTE_40_PORTN_M,
+				       V2_UD_SEND_WQE_BYTE_40_PORTN_S,
+				       qp->port);
+
+			roce_set_field(ud_sq_wqe->byte_48,
+				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M,
+				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S,
+				       hns_get_gid_index(hr_dev, qp->phy_port,
+							 ah->av.gid_index));
+
+			memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0],
+			       GID_LEN_V2);
+
+			dseg = get_send_extend_sge(qp,
+					    sge_ind & (qp->sge.sge_cnt - 1));
+			for (i = 0; i < wr->num_sge; i++) {
+				set_data_seg_v2(dseg + i, wr->sg_list + i);
+				sge_ind++;
+			}
+
+			ind++;
+		} else if (ibqp->qp_type == IB_QPT_RC) {
+			rc_sq_wqe = wqe;
+			memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe));
+			for (i = 0; i < wr->num_sge; i++)
+				rc_sq_wqe->msg_len += wr->sg_list[i].length;
+
+			rc_sq_wqe->inv_key_immtdata = send_ieth(wr);
+
+			roce_set_bit(rc_sq_wqe->byte_4,
+				     V2_RC_SEND_WQE_BYTE_4_FENCE_S,
+				     (wr->send_flags & IB_SEND_FENCE) ? 1 : 0);
+
+			roce_set_bit(rc_sq_wqe->byte_4,
+				  V2_RC_SEND_WQE_BYTE_4_SE_S,
+				  (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
+
+			roce_set_bit(rc_sq_wqe->byte_4,
+				   V2_RC_SEND_WQE_BYTE_4_CQE_S,
+				   (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
+
+			roce_set_bit(rc_sq_wqe->byte_4,
+				     V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit);
+
+			switch (wr->opcode) {
+			case IB_WR_RDMA_READ:
+				roce_set_field(rc_sq_wqe->byte_4,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+					       HNS_ROCE_V2_WQE_OP_RDMA_READ);
+				rc_sq_wqe->rkey =
+					cpu_to_le32(rdma_wr(wr)->rkey);
+				rc_sq_wqe->va =
+					cpu_to_le64(rdma_wr(wr)->remote_addr);
+				break;
+			case IB_WR_RDMA_WRITE:
+				roce_set_field(rc_sq_wqe->byte_4,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+					       HNS_ROCE_V2_WQE_OP_RDMA_WRITE);
+				rc_sq_wqe->rkey =
+					cpu_to_le32(rdma_wr(wr)->rkey);
+				rc_sq_wqe->va =
+					cpu_to_le64(rdma_wr(wr)->remote_addr);
+				break;
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				roce_set_field(rc_sq_wqe->byte_4,
 				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
 				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
 				       HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM);
-			rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey);
-			rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr);
-			break;
-		case IB_WR_SEND:
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_SEND);
-			break;
-		case IB_WR_SEND_WITH_INV:
-			roce_set_field(rc_sq_wqe->byte_4,
+				rc_sq_wqe->rkey =
+					cpu_to_le32(rdma_wr(wr)->rkey);
+				rc_sq_wqe->va =
+					cpu_to_le64(rdma_wr(wr)->remote_addr);
+				break;
+			case IB_WR_SEND:
+				roce_set_field(rc_sq_wqe->byte_4,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+					       HNS_ROCE_V2_WQE_OP_SEND);
+				break;
+			case IB_WR_SEND_WITH_INV:
+				roce_set_field(rc_sq_wqe->byte_4,
 				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
 				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
 				       HNS_ROCE_V2_WQE_OP_SEND_WITH_INV);
-			break;
-		case IB_WR_SEND_WITH_IMM:
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM);
-			break;
-		case IB_WR_LOCAL_INV:
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_LOCAL_INV);
-			break;
-		case IB_WR_ATOMIC_CMP_AND_SWP:
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP);
-			break;
-		case IB_WR_ATOMIC_FETCH_AND_ADD:
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD);
-			break;
-		case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
-			roce_set_field(rc_sq_wqe->byte_4,
+				break;
+			case IB_WR_SEND_WITH_IMM:
+				roce_set_field(rc_sq_wqe->byte_4,
+					      V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+					      V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+					      HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM);
+				break;
+			case IB_WR_LOCAL_INV:
+				roce_set_field(rc_sq_wqe->byte_4,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+					       HNS_ROCE_V2_WQE_OP_LOCAL_INV);
+				break;
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+				roce_set_field(rc_sq_wqe->byte_4,
+					  V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+					  V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+					  HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP);
+				break;
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				roce_set_field(rc_sq_wqe->byte_4,
+					 V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+					 V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+					 HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD);
+				break;
+			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+				roce_set_field(rc_sq_wqe->byte_4,
 				      V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
 				      V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
 				      HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP);
-			break;
-		case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
-			roce_set_field(rc_sq_wqe->byte_4,
+				break;
+			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+				roce_set_field(rc_sq_wqe->byte_4,
 				     V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
 				     V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
 				     HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD);
-			break;
-		default:
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_MASK);
-			break;
-		}
-
-		wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
-		dseg = wqe;
-		if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
-			if (rc_sq_wqe->msg_len >
-				hr_dev->caps.max_sq_inline) {
-				ret = -EINVAL;
-				*bad_wr = wr;
-				dev_err(dev, "inline len(1-%d)=%d, illegal",
-					rc_sq_wqe->msg_len,
-					hr_dev->caps.max_sq_inline);
-				goto out;
+				break;
+			default:
+				roce_set_field(rc_sq_wqe->byte_4,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+					       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+					       HNS_ROCE_V2_WQE_OP_MASK);
+				break;
 			}
 
-			for (i = 0; i < wr->num_sge; i++) {
-				memcpy(wqe, ((void *)wr->sg_list[i].addr),
-				       wr->sg_list[i].length);
-				wqe += wr->sg_list[i].length;
-				wqe_sz += wr->sg_list[i].length;
-			}
+			wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
+			dseg = wqe;
 
-			roce_set_bit(rc_sq_wqe->byte_4,
-				     V2_RC_SEND_WQE_BYTE_4_INLINE_S, 1);
+			ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe,
+						&sge_ind, bad_wr);
+			if (ret)
+				goto out;
+			ind++;
 		} else {
-			if (wr->num_sge <= 2) {
-				for (i = 0; i < wr->num_sge; i++)
-					set_data_seg_v2(dseg + i,
-							wr->sg_list + i);
-			} else {
-				roce_set_field(rc_sq_wqe->byte_20,
-				V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
-				V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
-				sge_ind & (qp->sge.sge_cnt - 1));
-
-				for (i = 0; i < 2; i++)
-					set_data_seg_v2(dseg + i,
-							wr->sg_list + i);
-
-				dseg = get_send_extend_sge(qp,
-					sge_ind & (qp->sge.sge_cnt - 1));
-
-				for (i = 0; i < wr->num_sge - 2; i++) {
-					set_data_seg_v2(dseg + i,
-							wr->sg_list + 2 + i);
-					sge_ind++;
-				}
-			}
-
-			roce_set_field(rc_sq_wqe->byte_16,
-				       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
-				       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S,
-				       wr->num_sge);
-			wqe_sz += wr->num_sge *
-				  sizeof(struct hns_roce_v2_wqe_data_seg);
+			dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type);
+			spin_unlock_irqrestore(&qp->sq.lock, flags);
+			return -EOPNOTSUPP;
 		}
-		ind++;
 	}
 
 out:
@@ -299,6 +468,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct hns_roce_v2_wqe_data_seg *dseg;
+	struct hns_roce_rinl_sge *sge_list;
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_v2_db rq_db;
 	unsigned long flags;
@@ -347,6 +517,14 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			dseg[i].addr = 0;
 		}
 
+		/* rq support inline data */
+		sge_list = hr_qp->rq_inl_buf.wqe_list[ind].sg_list;
+		hr_qp->rq_inl_buf.wqe_list[ind].sge_cnt = (u32)wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++) {
+			sge_list[i].addr = (void *)(u64)wr->sg_list[i].addr;
+			sge_list[i].len = wr->sg_list[i].length;
+		}
+
 		hr_qp->rq.wrid[ind] = wr->wr_id;
 
 		ind = (ind + 1) & (hr_qp->rq.wqe_cnt - 1);
@@ -908,9 +1086,9 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 	caps->max_sq_inline	= HNS_ROCE_V2_MAX_SQ_INLINE;
 	caps->num_uars		= HNS_ROCE_V2_UAR_NUM;
 	caps->phy_num_uars	= HNS_ROCE_V2_PHY_UAR_NUM;
-	caps->num_aeq_vectors	= 1;
-	caps->num_comp_vectors	= 63;
-	caps->num_other_vectors	= 0;
+	caps->num_aeq_vectors	= HNS_ROCE_V2_AEQE_VEC_NUM;
+	caps->num_comp_vectors	= HNS_ROCE_V2_COMP_VEC_NUM;
+	caps->num_other_vectors	= HNS_ROCE_V2_ABNORMAL_VEC_NUM;
 	caps->num_mtpts		= HNS_ROCE_V2_MAX_MTPT_NUM;
 	caps->num_mtt_segs	= HNS_ROCE_V2_MAX_MTT_SEGS;
 	caps->num_cqe_segs	= HNS_ROCE_V2_MAX_CQE_SEGS;
@@ -955,12 +1133,18 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 	caps->cqe_ba_pg_sz	= 0;
 	caps->cqe_buf_pg_sz	= 0;
 	caps->cqe_hop_num	= HNS_ROCE_CQE_HOP_NUM;
+	caps->eqe_ba_pg_sz	= 0;
+	caps->eqe_buf_pg_sz	= 0;
+	caps->eqe_hop_num	= HNS_ROCE_EQE_HOP_NUM;
 	caps->chunk_sz		= HNS_ROCE_V2_TABLE_CHUNK_SIZE;
 
 	caps->flags		= HNS_ROCE_CAP_FLAG_REREG_MR |
-				  HNS_ROCE_CAP_FLAG_ROCE_V1_V2;
+				  HNS_ROCE_CAP_FLAG_ROCE_V1_V2 |
+				  HNS_ROCE_CAP_FLAG_RQ_INLINE;
 	caps->pkey_table_len[0] = 1;
 	caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM;
+	caps->ceqe_depth	= HNS_ROCE_V2_COMP_EQE_NUM;
+	caps->aeqe_depth	= HNS_ROCE_V2_ASYNC_EQE_NUM;
 	caps->local_ca_ack_delay = 0;
 	caps->max_mtu = IB_MTU_4096;
 
@@ -1382,6 +1566,8 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev,
 
 	roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CQ_ST_M,
 		       V2_CQC_BYTE_4_CQ_ST_S, V2_CQ_STATE_VALID);
+	roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_ARM_ST_M,
+		       V2_CQC_BYTE_4_ARM_ST_S, REG_NXT_CEQE);
 	roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_SHIFT_M,
 		       V2_CQC_BYTE_4_SHIFT_S, ilog2((unsigned int)nent));
 	roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CEQN_M,
@@ -1422,6 +1608,15 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev,
 
 	roce_set_field(cq_context->byte_40_cqe_ba, V2_CQC_BYTE_40_CQE_BA_M,
 		       V2_CQC_BYTE_40_CQE_BA_S, (dma_handle >> (32 + 3)));
+
+	roce_set_field(cq_context->byte_56_cqe_period_maxcnt,
+		       V2_CQC_BYTE_56_CQ_MAX_CNT_M,
+		       V2_CQC_BYTE_56_CQ_MAX_CNT_S,
+		       HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM);
+	roce_set_field(cq_context->byte_56_cqe_period_maxcnt,
+		       V2_CQC_BYTE_56_CQ_PERIOD_M,
+		       V2_CQC_BYTE_56_CQ_PERIOD_S,
+		       HNS_ROCE_V2_CQ_DEFAULT_INTERVAL);
 }
 
 static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
@@ -1457,6 +1652,40 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
 	return 0;
 }
 
+static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe,
+						    struct hns_roce_qp **cur_qp,
+						    struct ib_wc *wc)
+{
+	struct hns_roce_rinl_sge *sge_list;
+	u32 wr_num, wr_cnt, sge_num;
+	u32 sge_cnt, data_len, size;
+	void *wqe_buf;
+
+	wr_num = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_WQE_INDX_M,
+				V2_CQE_BYTE_4_WQE_INDX_S) & 0xffff;
+	wr_cnt = wr_num & ((*cur_qp)->rq.wqe_cnt - 1);
+
+	sge_list = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sg_list;
+	sge_num = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sge_cnt;
+	wqe_buf = get_recv_wqe(*cur_qp, wr_cnt);
+	data_len = wc->byte_len;
+
+	for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); sge_cnt++) {
+		size = min(sge_list[sge_cnt].len, data_len);
+		memcpy((void *)sge_list[sge_cnt].addr, wqe_buf, size);
+
+		data_len -= size;
+		wqe_buf += size;
+	}
+
+	if (data_len) {
+		wc->status = IB_WC_LOC_LEN_ERR;
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
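hns_roce_handle_recv_inl_wqe() copies the inline payload out of the receive WQE into the buffers the consumer posted, and flags IB_WC_LOC_LEN_ERR when the payload does not fit. A self-contained model of that loop (-1 stands in for the error status):

    #include <stdio.h>
    #include <string.h>

    struct rinl_sge { char *addr; unsigned int len; };

    /* Scatter one contiguous inline payload across the posted SGEs. */
    static int scatter_inline(const char *wqe_buf, unsigned int data_len,
                              struct rinl_sge *sge, unsigned int sge_num)
    {
        unsigned int i, size;

        for (i = 0; i < sge_num && data_len; i++) {
            size = sge[i].len < data_len ? sge[i].len : data_len;
            memcpy(sge[i].addr, wqe_buf, size);
            data_len -= size;
            wqe_buf += size;
        }
        return data_len ? -1 : 0;   /* -1 ~ IB_WC_LOC_LEN_ERR */
    }

    int main(void)
    {
        char a[4], b[4];
        struct rinl_sge sge[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

        printf("fits:    %d\n", scatter_inline("0123456", 7, sge, 2));
        printf("overrun: %d\n", scatter_inline("0123456789", 10, sge, 2));
        return 0;
    }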
 static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 				struct hns_roce_qp **cur_qp, struct ib_wc *wc)
 {
@@ -1469,6 +1698,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 	u32 opcode;
 	u32 status;
 	int qpn;
+	int ret;
 
 	/* Find cqe according to consumer index */
 	cqe = next_cqe_sw_v2(hr_cq);
@@ -1636,7 +1866,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		case HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM:
 			wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
 			wc->wc_flags = IB_WC_WITH_IMM;
-			wc->ex.imm_data = le32_to_cpu(cqe->rkey_immtdata);
+			wc->ex.imm_data = cqe->immtdata;
 			break;
 		case HNS_ROCE_V2_OPCODE_SEND:
 			wc->opcode = IB_WC_RECV;
@@ -1645,18 +1875,29 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		case HNS_ROCE_V2_OPCODE_SEND_WITH_IMM:
 			wc->opcode = IB_WC_RECV;
 			wc->wc_flags = IB_WC_WITH_IMM;
-			wc->ex.imm_data = le32_to_cpu(cqe->rkey_immtdata);
+			wc->ex.imm_data = cqe->immtdata;
 			break;
 		case HNS_ROCE_V2_OPCODE_SEND_WITH_INV:
 			wc->opcode = IB_WC_RECV;
 			wc->wc_flags = IB_WC_WITH_INVALIDATE;
-			wc->ex.invalidate_rkey = cqe->rkey_immtdata;
+			wc->ex.invalidate_rkey = le32_to_cpu(cqe->rkey);
 			break;
 		default:
 			wc->status = IB_WC_GENERAL_ERR;
 			break;
 		}
 
+		if ((wc->qp->qp_type == IB_QPT_RC ||
+		     wc->qp->qp_type == IB_QPT_UC) &&
+		    (opcode == HNS_ROCE_V2_OPCODE_SEND ||
+		    opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_IMM ||
+		    opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) &&
+		    (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_RQ_INLINE_S))) {
+			ret = hns_roce_handle_recv_inl_wqe(cqe, cur_qp, wc);
+			if (ret)
+				return -EAGAIN;
+		}
+
 		/* Update tail pointer, record wr_id */
 		wq = &(*cur_qp)->rq;
 		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
@@ -1670,6 +1911,21 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		wc->wc_flags |= (roce_get_bit(cqe->byte_32,
 					      V2_CQE_BYTE_32_GRH_S) ?
 					      IB_WC_GRH : 0);
+		wc->port_num = roce_get_field(cqe->byte_32,
+				V2_CQE_BYTE_32_PORTN_M, V2_CQE_BYTE_32_PORTN_S);
+		wc->pkey_index = 0;
+		memcpy(wc->smac, cqe->smac, 4);
+		wc->smac[4] = roce_get_field(cqe->byte_28,
+					     V2_CQE_BYTE_28_SMAC_4_M,
+					     V2_CQE_BYTE_28_SMAC_4_S);
+		wc->smac[5] = roce_get_field(cqe->byte_28,
+					     V2_CQE_BYTE_28_SMAC_5_M,
+					     V2_CQE_BYTE_28_SMAC_5_S);
+		wc->vlan_id = 0xffff;
+		wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+		wc->network_hdr_type = roce_get_field(cqe->byte_28,
+						    V2_CQE_BYTE_28_PORT_TYPE_M,
+						    V2_CQE_BYTE_28_PORT_TYPE_S);
 	}
 
 	return 0;
@@ -1859,8 +2115,39 @@ static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev,
 	return ret;
 }
 
+static void set_access_flags(struct hns_roce_qp *hr_qp,
+			     struct hns_roce_v2_qp_context *context,
+			     struct hns_roce_v2_qp_context *qpc_mask,
+			     const struct ib_qp_attr *attr, int attr_mask)
+{
+	u8 dest_rd_atomic;
+	u32 access_flags;
+
+	dest_rd_atomic = !!(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) ?
+			 attr->max_dest_rd_atomic : hr_qp->resp_depth;
+
+	access_flags = !!(attr_mask & IB_QP_ACCESS_FLAGS) ?
+		       attr->qp_access_flags : hr_qp->atomic_rd_en;
+
+	if (!dest_rd_atomic)
+		access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S,
+		     !!(access_flags & IB_ACCESS_REMOTE_READ));
+	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, 0);
+
+	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S,
+		     !!(access_flags & IB_ACCESS_REMOTE_WRITE));
+	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, 0);
+
+	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S,
+		     !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
+	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0);
+}
+
 static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 				    const struct ib_qp_attr *attr,
+				    int attr_mask,
 				    struct hns_roce_v2_qp_context *context,
 				    struct hns_roce_v2_qp_context *qpc_mask)
 {
@@ -1877,9 +2164,18 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
 		       V2_QPC_BYTE_4_TST_S, 0);
 
-	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
-		       V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ?
-		       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
+	if (ibqp->qp_type == IB_QPT_GSI)
+		roce_set_field(context->byte_4_sqpn_tst,
+			       V2_QPC_BYTE_4_SGE_SHIFT_M,
+			       V2_QPC_BYTE_4_SGE_SHIFT_S,
+			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
+	else
+		roce_set_field(context->byte_4_sqpn_tst,
+			       V2_QPC_BYTE_4_SGE_SHIFT_M,
+			       V2_QPC_BYTE_4_SGE_SHIFT_S,
+			       hr_qp->sq.max_gs > 2 ?
+			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
+
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
 		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
 
@@ -1944,18 +2240,13 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0);
 	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0);
 
-	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S,
-		     !!(attr->qp_access_flags & IB_ACCESS_REMOTE_READ));
-	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, 0);
-
-	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S,
-		     !!(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE));
-	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, 0);
-
-	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S,
-		     !!(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC));
-	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0);
+	if (attr_mask & IB_QP_QKEY) {
+		context->qkey_xrcd = attr->qkey;
+		qpc_mask->qkey_xrcd = 0;
+		hr_qp->qkey = attr->qkey;
+	}
 
+	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 1);
 	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0);
 
 	roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,
@@ -2176,9 +2467,17 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
 		       V2_QPC_BYTE_4_TST_S, 0);
 
-	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
-		       V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ?
-		       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
+	if (ibqp->qp_type == IB_QPT_GSI)
+		roce_set_field(context->byte_4_sqpn_tst,
+			       V2_QPC_BYTE_4_SGE_SHIFT_M,
+			       V2_QPC_BYTE_4_SGE_SHIFT_S,
+			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
+	else
+		roce_set_field(context->byte_4_sqpn_tst,
+			       V2_QPC_BYTE_4_SGE_SHIFT_M,
+			       V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ?
+			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
+
 	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
 		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
 
@@ -2239,7 +2538,7 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
 		       V2_QPC_BYTE_80_RX_CQN_S, 0);
 
 	roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
-		       V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn);
+		       V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn);
 	roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
 		       V2_QPC_BYTE_252_TX_CQN_S, 0);
 
@@ -2255,10 +2554,10 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
 			       V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0);
 	}
 
-	if (attr_mask & IB_QP_PKEY_INDEX)
-		context->qkey_xrcd = attr->pkey_index;
-	else
-		context->qkey_xrcd = hr_qp->pkey_index;
+	if (attr_mask & IB_QP_QKEY) {
+		context->qkey_xrcd = attr->qkey;
+		qpc_mask->qkey_xrcd = 0;
+	}
 
 	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
 		       V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn);
@@ -2354,7 +2653,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_M,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_S,
-		       hr_qp->sq.max_gs > 2 ? hr_dev->caps.mtt_hop_num : 0);
+		       ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
+		       hr_dev->caps.mtt_hop_num : 0);
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_M,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0);
@@ -2463,11 +2763,14 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 		roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_LBI_S, 0);
 	}
 
-	roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
-		       V2_QPC_BYTE_140_RR_MAX_S,
-		       ilog2((unsigned int)attr->max_dest_rd_atomic));
-	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
-		       V2_QPC_BYTE_140_RR_MAX_S, 0);
+	if ((attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) &&
+	     attr->max_dest_rd_atomic) {
+		roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
+			       V2_QPC_BYTE_140_RR_MAX_S,
+			       fls(attr->max_dest_rd_atomic - 1));
+		roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
+			       V2_QPC_BYTE_140_RR_MAX_S, 0);
+	}
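fls(attr->max_dest_rd_atomic - 1) encodes the requested depth as a power-of-two exponent, rounding up, which is why the zero case is filtered out first. A quick check of that identity with a local fls() model (fls() returns the index of the highest set bit, counting from 1):

    #include <stdio.h>

    static int fls_model(unsigned int x)
    {
        int r = 0;

        while (x) {
            x >>= 1;
            r++;
        }
        return r;
    }

    int main(void)
    {
        unsigned int depths[] = { 1, 2, 3, 4, 7, 8 };
        unsigned int i;

        /* Prints 0, 1, 2, 2, 3, 3: ceil(log2(depth)). */
        for (i = 0; i < sizeof(depths) / sizeof(depths[0]); i++)
            printf("max_dest_rd_atomic=%u -> field=%d\n",
                   depths[i], fls_model(depths[i] - 1));
        return 0;
    }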
 
 	roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M,
 		       V2_QPC_BYTE_56_DQPN_S, attr->dest_qp_num);
@@ -2511,8 +2814,13 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
 		       V2_QPC_BYTE_24_TC_S, 0);
 
-	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
-		       V2_QPC_BYTE_24_MTU_S, attr->path_mtu);
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD)
+		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
+			       V2_QPC_BYTE_24_MTU_S, IB_MTU_4096);
+	else
+		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
+			       V2_QPC_BYTE_24_MTU_S, attr->path_mtu);
+
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
 		       V2_QPC_BYTE_24_MTU_S, 0);
 
@@ -2557,12 +2865,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 		       V2_QPC_BYTE_168_LP_SGEN_INI_M,
 		       V2_QPC_BYTE_168_LP_SGEN_INI_S, 0);
 
-	roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M,
-		       V2_QPC_BYTE_208_SR_MAX_S,
-		       ilog2((unsigned int)attr->max_rd_atomic));
-	roce_set_field(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M,
-		       V2_QPC_BYTE_208_SR_MAX_S, 0);
-
 	roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
 		       V2_QPC_BYTE_28_SL_S, rdma_ah_get_sl(&attr->ah_attr));
 	roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
@@ -2625,13 +2927,14 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
 		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0);
 
 	page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-	context->sq_cur_sge_blk_addr = hr_qp->sq.max_gs > 2 ?
+	context->sq_cur_sge_blk_addr =
+		       ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
 				      ((u32)(mtts[hr_qp->sge.offset / page_size]
 				      >> PAGE_ADDR_SHIFT)) : 0;
 	roce_set_field(context->byte_184_irrl_idx,
 		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
 		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S,
-		       hr_qp->sq.max_gs > 2 ?
+		       ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ?
 		       (mtts[hr_qp->sge.offset / page_size] >>
 		       (32 + PAGE_ADDR_SHIFT)) : 0);
 	qpc_mask->sq_cur_sge_blk_addr = 0;
@@ -2766,6 +3069,14 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M,
 		       V2_QPC_BYTE_196_SQ_MAX_PSN_S, 0);
 
+	if ((attr_mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) {
+		roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M,
+			       V2_QPC_BYTE_208_SR_MAX_S,
+			       fls(attr->max_rd_atomic - 1));
+		roce_set_field(qpc_mask->byte_208_irrl,
+			       V2_QPC_BYTE_208_SR_MAX_M,
+			       V2_QPC_BYTE_208_SR_MAX_S, 0);
+	}
 	return 0;
 }
 
@@ -2794,7 +3105,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	 */
 	memset(qpc_mask, 0xff, sizeof(*qpc_mask));
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
-		modify_qp_reset_to_init(ibqp, attr, context, qpc_mask);
+		modify_qp_reset_to_init(ibqp, attr, attr_mask, context,
+					qpc_mask);
 	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) {
 		modify_qp_init_to_init(ibqp, attr, attr_mask, context,
 				       qpc_mask);
@@ -2829,6 +3141,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 		goto out;
 	}
 
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
+		set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask);
+
 	/* Every status migrate must change state */
 	roce_set_field(context->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M,
 		       V2_QPC_BYTE_60_QP_ST_S, new_state);
@@ -2845,6 +3160,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 
 	hr_qp->state = new_state;
 
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		hr_qp->atomic_rd_en = attr->qp_access_flags;
+
 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
 		hr_qp->resp_depth = attr->max_dest_rd_atomic;
 	if (attr_mask & IB_QP_PORT) {
@@ -3098,6 +3416,11 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
 	}
 
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
+		kfree(hr_qp->rq_inl_buf.wqe_list);
+	}
+
 	return 0;
 }
 
@@ -3162,6 +3485,1146 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 	return ret;
 }
 
+static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
+{
+	u32 doorbell[2];
+
+	doorbell[0] = 0;
+	doorbell[1] = 0;
+
+	if (eq->type_flag == HNS_ROCE_AEQ) {
+		roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M,
+			       HNS_ROCE_V2_EQ_DB_CMD_S,
+			       eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ?
+			       HNS_ROCE_EQ_DB_CMD_AEQ :
+			       HNS_ROCE_EQ_DB_CMD_AEQ_ARMED);
+	} else {
+		roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_TAG_M,
+			       HNS_ROCE_V2_EQ_DB_TAG_S, eq->eqn);
+
+		roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M,
+			       HNS_ROCE_V2_EQ_DB_CMD_S,
+			       eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ?
+			       HNS_ROCE_EQ_DB_CMD_CEQ :
+			       HNS_ROCE_EQ_DB_CMD_CEQ_ARMED);
+	}
+
+	roce_set_field(doorbell[1], HNS_ROCE_V2_EQ_DB_PARA_M,
+		       HNS_ROCE_V2_EQ_DB_PARA_S,
+		       (eq->cons_index & HNS_ROCE_V2_CONS_IDX_M));
+
+	hns_roce_write64_k(doorbell, eq->doorbell);
+}
+
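set_eq_cons_index_v2() packs the doorbell as two 32-bit words before the single 64-bit write: a command (plus the EQ number as a tag for CEQs) and the consumer index. An illustrative sketch, with field positions invented rather than taken from the hardware layout:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t doorbell[2] = { 0, 0 };
        uint32_t eqn = 3, cmd = 1, cons_index = 0x1234;

        doorbell[0] |= eqn;                     /* tag field (illustrative) */
        doorbell[0] |= cmd << 16;               /* command field (illustrative) */
        doorbell[1] |= cons_index & 0x00ffffff; /* index parameter */

        printf("db[0]=0x%08x db[1]=0x%08x\n", doorbell[0], doorbell[1]);
        return 0;
    }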
+static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev,
+						  struct hns_roce_aeqe *aeqe,
+						  u32 qpn)
+{
+	struct device *dev = hr_dev->dev;
+	int sub_type;
+
+	dev_warn(dev, "Local work queue catastrophic error.\n");
+	sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M,
+				  HNS_ROCE_V2_AEQE_SUB_TYPE_S);
+	switch (sub_type) {
+	case HNS_ROCE_LWQCE_QPC_ERROR:
+		dev_warn(dev, "QP %d, QPC error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_MTU_ERROR:
+		dev_warn(dev, "QP %d, MTU error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR:
+		dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_WQE_ADDR_ERROR:
+		dev_warn(dev, "QP %d, WQE addr error.\n", qpn);
+		break;
+	case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR:
+		dev_warn(dev, "QP %d, WQE shift error.\n", qpn);
+		break;
+	default:
+		dev_err(dev, "Unhandled sub_event type %d.\n", sub_type);
+		break;
+	}
+}
+
+static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev,
+					    struct hns_roce_aeqe *aeqe, u32 qpn)
+{
+	struct device *dev = hr_dev->dev;
+	int sub_type;
+
+	dev_warn(dev, "Local access violation work queue error.\n");
+	sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M,
+				  HNS_ROCE_V2_AEQE_SUB_TYPE_S);
+	switch (sub_type) {
+	case HNS_ROCE_LAVWQE_R_KEY_VIOLATION:
+		dev_warn(dev, "QP %d, R_key violation.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_LENGTH_ERROR:
+		dev_warn(dev, "QP %d, length error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_VA_ERROR:
+		dev_warn(dev, "QP %d, VA error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_PD_ERROR:
+		dev_err(dev, "QP %d, PD error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_RW_ACC_ERROR:
+		dev_warn(dev, "QP %d, rw acc error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_KEY_STATE_ERROR:
+		dev_warn(dev, "QP %d, key state error.\n", qpn);
+		break;
+	case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR:
+		dev_warn(dev, "QP %d, MR operation error.\n", qpn);
+		break;
+	default:
+		dev_err(dev, "Unhandled sub_event type %d.\n", sub_type);
+		break;
+	}
+}
+
+static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev,
+				      struct hns_roce_aeqe *aeqe,
+				      int event_type)
+{
+	struct device *dev = hr_dev->dev;
+	u32 qpn;
+
+	qpn = roce_get_field(aeqe->event.qp_event.qp,
+			     HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M,
+			     HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S);
+
+	switch (event_type) {
+	case HNS_ROCE_EVENT_TYPE_COMM_EST:
+		dev_warn(dev, "Communication established.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
+		dev_warn(dev, "Send queue drained.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
+		hns_roce_v2_wq_catas_err_handle(hr_dev, aeqe, qpn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
+		dev_warn(dev, "Invalid request local work queue error.\n");
+		break;
+	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+		hns_roce_v2_local_wq_access_err_handle(hr_dev, aeqe, qpn);
+		break;
+	default:
+		break;
+	}
+
+	hns_roce_qp_event(hr_dev, qpn, event_type);
+}
+
+static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev,
+				      struct hns_roce_aeqe *aeqe,
+				      int event_type)
+{
+	struct device *dev = hr_dev->dev;
+	u32 cqn;
+
+	cqn = roce_get_field(aeqe->event.cq_event.cq,
+			     HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M,
+			     HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S);
+
+	switch (event_type) {
+	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
+		dev_warn(dev, "CQ 0x%x access err.\n", cqn);
+		break;
+	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
+		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
+		break;
+	default:
+		break;
+	}
+
+	hns_roce_cq_event(hr_dev, cqn, event_type);
+}
+
+static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry)
+{
+	u32 buf_chk_sz;
+	unsigned long off;
+
+	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
+	off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE;
+
+	return (struct hns_roce_aeqe *)((char *)(eq->buf_list->buf) +
+		off % buf_chk_sz);
+}
+
+static struct hns_roce_aeqe *mhop_get_aeqe(struct hns_roce_eq *eq, u32 entry)
+{
+	u32 buf_chk_sz;
+	unsigned long off;
+
+	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
+
+	off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE;
+
+	if (eq->hop_num == HNS_ROCE_HOP_NUM_0)
+		return (struct hns_roce_aeqe *)((u8 *)(eq->bt_l0) +
+			off % buf_chk_sz);
+	else
+		return (struct hns_roce_aeqe *)((u8 *)
+			(eq->buf[off / buf_chk_sz]) + off % buf_chk_sz);
+}
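/*
 * Editor's note: a minimal standalone model of the multi-hop lookup in
 * mhop_get_aeqe() above. The page size (4 KB), EQE size and queue depth
 * are hypothetical values picked for the example; the driver derives
 * them from eqe_buf_pg_sz, HNS_ROCE_AEQ_ENTRY_SIZE and eq->entries.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t entries = 4096;	/* queue depth, a power of two */
	uint32_t eqe_size = 16;		/* bytes per async EQE */
	uint32_t buf_chk_sz = 1 << 12;	/* 1 << (eqe_buf_pg_sz + PAGE_SHIFT) */
	uint32_t entry = 300;		/* consumer index to resolve */

	/* Wrap by depth, then split into page index and in-page offset. */
	unsigned long off = (entry & (entries - 1)) * eqe_size;

	printf("eq->buf[%lu] + %lu\n", off / buf_chk_sz, off % buf_chk_sz);
	/* prints: eq->buf[1] + 704 */
	return 0;
}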
+
+static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
+{
+	struct hns_roce_aeqe *aeqe;
+
+	if (!eq->hop_num)
+		aeqe = get_aeqe_v2(eq, eq->cons_index);
+	else
+		aeqe = mhop_get_aeqe(eq, eq->cons_index);
+
+	return (roce_get_bit(aeqe->asyn, HNS_ROCE_V2_AEQ_AEQE_OWNER_S) ^
+		!!(eq->cons_index & eq->entries)) ? aeqe : NULL;
+}
+
+static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_eq *eq)
+{
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_aeqe *aeqe;
+	int aeqe_found = 0;
+	int event_type;
+
+	while ((aeqe = next_aeqe_sw_v2(eq))) {
+
+		/* Make sure we read AEQ entry after we have checked the
+		 * ownership bit
+		 */
+		dma_rmb();
+
+		event_type = roce_get_field(aeqe->asyn,
+					    HNS_ROCE_V2_AEQE_EVENT_TYPE_M,
+					    HNS_ROCE_V2_AEQE_EVENT_TYPE_S);
+
+		switch (event_type) {
+		case HNS_ROCE_EVENT_TYPE_PATH_MIG:
+			dev_warn(dev, "Path migrated succeeded.\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
+			dev_warn(dev, "Path migration failed.\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_COMM_EST:
+		case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
+		case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
+		case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
+		case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
+			hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type);
+			break;
+		case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
+		case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
+		case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
+			dev_warn(dev, "SRQ not support.\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
+		case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
+			hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type);
+			break;
+		case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
+			dev_warn(dev, "DB overflow.\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_MB:
+			hns_roce_cmd_event(hr_dev,
+					le16_to_cpu(aeqe->event.cmd.token),
+					aeqe->event.cmd.status,
+					le64_to_cpu(aeqe->event.cmd.out_param));
+			break;
+		case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW:
+			dev_warn(dev, "CEQ overflow.\n");
+			break;
+		case HNS_ROCE_EVENT_TYPE_FLR:
+			dev_warn(dev, "Function level reset.\n");
+			break;
+		default:
+			dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n",
+				event_type, eq->eqn, eq->cons_index);
+			break;
+		}
+
+		++eq->cons_index;
+		aeqe_found = 1;
+
+		if (eq->cons_index > (2 * eq->entries - 1)) {
+			dev_warn(dev, "cons_index overflow, set back to 0.\n");
+			eq->cons_index = 0;
+		}
+	}
+
+	set_eq_cons_index_v2(eq);
+	return aeqe_found;
+}
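/*
 * Editor's note: a standalone sketch of the valid-entry test used by
 * next_aeqe_sw_v2() and of why the loop above lets cons_index run to
 * 2 * entries before wrapping. (cons_index & entries) flips once per
 * lap around the ring, and an entry is fresh only while its owner bit
 * disagrees with that lap parity. All values here are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool aeqe_is_new(uint32_t owner_bit, uint32_t cons_index,
			uint32_t entries)
{
	return owner_bit ^ !!(cons_index & entries);
}

int main(void)
{
	uint32_t entries = 4096;

	/* Lap 0: hardware wrote owner = 1, parity is 0 -> new entry. */
	printf("%d\n", aeqe_is_new(1, 100, entries));		/* 1 */
	/* Lap 1: stale owner = 1 from lap 0, parity is 1 -> consumed. */
	printf("%d\n", aeqe_is_new(1, entries + 100, entries));	/* 0 */
	return 0;
}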
+
+static struct hns_roce_ceqe *get_ceqe_v2(struct hns_roce_eq *eq, u32 entry)
+{
+	u32 buf_chk_sz;
+	unsigned long off;
+
+	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
+	off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE;
+
+	return (struct hns_roce_ceqe *)((char *)(eq->buf_list->buf) +
+		off % buf_chk_sz);
+}
+
+static struct hns_roce_ceqe *mhop_get_ceqe(struct hns_roce_eq *eq, u32 entry)
+{
+	u32 buf_chk_sz;
+	unsigned long off;
+
+	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
+
+	off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE;
+
+	if (eq->hop_num == HNS_ROCE_HOP_NUM_0)
+		return (struct hns_roce_ceqe *)((u8 *)(eq->bt_l0) +
+			off % buf_chk_sz);
+	else
+		return (struct hns_roce_ceqe *)((u8 *)(eq->buf[off /
+			buf_chk_sz]) + off % buf_chk_sz);
+}
+
+static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
+{
+	struct hns_roce_ceqe *ceqe;
+
+	if (!eq->hop_num)
+		ceqe = get_ceqe_v2(eq, eq->cons_index);
+	else
+		ceqe = mhop_get_ceqe(eq, eq->cons_index);
+
+	return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^
+		(!!(eq->cons_index & eq->entries)) ? ceqe : NULL;
+}
+
+static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
+			       struct hns_roce_eq *eq)
+{
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_ceqe *ceqe;
+	int ceqe_found = 0;
+	u32 cqn;
+
+	while ((ceqe = next_ceqe_sw_v2(eq))) {
+
+		/* Make sure we read CEQ entry after we have checked the
+		 * ownership bit
+		 */
+		dma_rmb();
+
+		cqn = roce_get_field(ceqe->comp,
+				     HNS_ROCE_V2_CEQE_COMP_CQN_M,
+				     HNS_ROCE_V2_CEQE_COMP_CQN_S);
+
+		hns_roce_cq_completion(hr_dev, cqn);
+
+		++eq->cons_index;
+		ceqe_found = 1;
+
+		if (eq->cons_index > (2 * eq->entries - 1)) {
+			dev_warn(dev, "cons_index overflow, set back to 0.\n");
+			eq->cons_index = 0;
+		}
+	}
+
+	set_eq_cons_index_v2(eq);
+
+	return ceqe_found;
+}
+
+static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr)
+{
+	struct hns_roce_eq *eq = eq_ptr;
+	struct hns_roce_dev *hr_dev = eq->hr_dev;
+	int int_work = 0;
+
+	if (eq->type_flag == HNS_ROCE_CEQ)
+		/* Completion event interrupt */
+		int_work = hns_roce_v2_ceq_int(hr_dev, eq);
+	else
+		/* Asynchronous event interrupt */
+		int_work = hns_roce_v2_aeq_int(hr_dev, eq);
+
+	return IRQ_RETVAL(int_work);
+}
+
+static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
+{
+	struct hns_roce_dev *hr_dev = dev_id;
+	struct device *dev = hr_dev->dev;
+	int int_work = 0;
+	u32 int_st;
+	u32 int_en;
+
+	/* Abnormal interrupt */
+	int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
+	int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG);
+
+	if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) {
+		dev_err(dev, "AEQ overflow!\n");
+
+		roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S, 1);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
+
+		roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
+
+		int_work = 1;
+	} else if (roce_get_bit(int_st,	HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S)) {
+		dev_err(dev, "BUS ERR!\n");
+
+		roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S, 1);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
+
+		roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
+
+		int_work = 1;
+	} else if (roce_get_bit(int_st,	HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S)) {
+		dev_err(dev, "OTHER ERR!\n");
+
+		roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S, 1);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st);
+
+		roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en);
+
+		int_work = 1;
+	} else {
+		dev_err(dev, "No abnormal irq found.\n");
+	}
+
+	return IRQ_RETVAL(int_work);
+}
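/*
 * Editor's note: the read/set-bit/write-back sequence above suggests the
 * status register is write-1-to-clear (W1C): writing a 1 acknowledges a
 * pending bit, writing 0 leaves it alone. A hypothetical model of that
 * register behaviour, not hns hardware documentation:
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t hw_int_st = 0x5;	/* bits 0 and 2 pending */

static void w1c_write(uint32_t val)
{
	hw_int_st &= ~val;		/* only the 1s written are cleared */
}

int main(void)
{
	w1c_write(1u << 0);		/* ack AEQ overflow only */
	printf("int_st now 0x%x\n", hw_int_st);	/* 0x4: bit 2 still set */
	return 0;
}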
+
+static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev,
+					int eq_num, int enable_flag)
+{
+	int i;
+
+	if (enable_flag == EQ_ENABLE) {
+		for (i = 0; i < eq_num; i++)
+			roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG +
+				   i * EQ_REG_OFFSET,
+				   HNS_ROCE_V2_VF_EVENT_INT_EN_M);
+
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG,
+			   HNS_ROCE_V2_VF_ABN_INT_EN_M);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG,
+			   HNS_ROCE_V2_VF_ABN_INT_CFG_M);
+	} else {
+		for (i = 0; i < eq_num; i++)
+			roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG +
+				   i * EQ_REG_OFFSET,
+				   HNS_ROCE_V2_VF_EVENT_INT_EN_M & 0x0);
+
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG,
+			   HNS_ROCE_V2_VF_ABN_INT_EN_M & 0x0);
+		roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG,
+			   HNS_ROCE_V2_VF_ABN_INT_CFG_M & 0x0);
+	}
+}
+
+static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, int eqn)
+{
+	struct device *dev = hr_dev->dev;
+	int ret;
+
+	if (eqn < hr_dev->caps.num_comp_vectors)
+		ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M,
+					0, HNS_ROCE_CMD_DESTROY_CEQC,
+					HNS_ROCE_CMD_TIMEOUT_MSECS);
+	else
+		ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M,
+					0, HNS_ROCE_CMD_DESTROY_AEQC,
+					HNS_ROCE_CMD_TIMEOUT_MSECS);
+	if (ret)
+		dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn);
+}
+
+static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev,
+				  struct hns_roce_eq *eq)
+{
+	struct device *dev = hr_dev->dev;
+	u64 idx;
+	u64 size;
+	u32 buf_chk_sz;
+	u32 bt_chk_sz;
+	u32 mhop_num;
+	int eqe_alloc;
+	int ba_num;
+	int i = 0;
+	int j = 0;
+
+	mhop_num = hr_dev->caps.eqe_hop_num;
+	buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
+	bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
+	ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) /
+		 buf_chk_sz;
+
+	/* hop_num = 0 */
+	if (mhop_num == HNS_ROCE_HOP_NUM_0) {
+		dma_free_coherent(dev, (unsigned int)(eq->entries *
+				  eq->eqe_size), eq->bt_l0, eq->l0_dma);
+		return;
+	}
+
+	/* hop_num = 1 or hop_num = 2 */
+	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
+	if (mhop_num == 1) {
+		for (i = 0; i < eq->l0_last_num; i++) {
+			if (i == eq->l0_last_num - 1) {
+				eqe_alloc = i * (buf_chk_sz / eq->eqe_size);
+				size = (eq->entries - eqe_alloc) * eq->eqe_size;
+				dma_free_coherent(dev, size, eq->buf[i],
+						  eq->buf_dma[i]);
+				break;
+			}
+			dma_free_coherent(dev, buf_chk_sz, eq->buf[i],
+					  eq->buf_dma[i]);
+		}
+	} else if (mhop_num == 2) {
+		for (i = 0; i < eq->l0_last_num; i++) {
+			dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
+					  eq->l1_dma[i]);
+
+			for (j = 0; j < bt_chk_sz / 8; j++) {
+				idx = i * (bt_chk_sz / 8) + j;
+				if ((i == eq->l0_last_num - 1)
+				     && j == eq->l1_last_num - 1) {
+					eqe_alloc = (buf_chk_sz / eq->eqe_size)
+						    * idx;
+					size = (eq->entries - eqe_alloc)
+						* eq->eqe_size;
+					dma_free_coherent(dev, size,
+							  eq->buf[idx],
+							  eq->buf_dma[idx]);
+					break;
+				}
+				dma_free_coherent(dev, buf_chk_sz, eq->buf[idx],
+						  eq->buf_dma[idx]);
+			}
+		}
+	}
+	kfree(eq->buf_dma);
+	kfree(eq->buf);
+	kfree(eq->l1_dma);
+	kfree(eq->bt_l1);
+	eq->buf_dma = NULL;
+	eq->buf = NULL;
+	eq->l1_dma = NULL;
+	eq->bt_l1 = NULL;
+}
+
+static void hns_roce_v2_free_eq(struct hns_roce_dev *hr_dev,
+				struct hns_roce_eq *eq)
+{
+	u32 buf_chk_sz;
+
+	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
+
+	if (hr_dev->caps.eqe_hop_num) {
+		hns_roce_mhop_free_eq(hr_dev, eq);
+		return;
+	}
+
+	if (eq->buf_list)
+		dma_free_coherent(hr_dev->dev, buf_chk_sz,
+				  eq->buf_list->buf, eq->buf_list->map);
+}
+
+static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev,
+				struct hns_roce_eq *eq,
+				void *mb_buf)
+{
+	struct hns_roce_eq_context *eqc;
+
+	eqc = mb_buf;
+	memset(eqc, 0, sizeof(struct hns_roce_eq_context));
+
+	/* init eqc */
+	eq->doorbell = hr_dev->reg_base + ROCEE_VF_EQ_DB_CFG0_REG;
+	eq->hop_num = hr_dev->caps.eqe_hop_num;
+	eq->cons_index = 0;
+	eq->over_ignore = HNS_ROCE_V2_EQ_OVER_IGNORE_0;
+	eq->coalesce = HNS_ROCE_V2_EQ_COALESCE_0;
+	eq->arm_st = HNS_ROCE_V2_EQ_ALWAYS_ARMED;
+	eq->eqe_ba_pg_sz = hr_dev->caps.eqe_ba_pg_sz;
+	eq->eqe_buf_pg_sz = hr_dev->caps.eqe_buf_pg_sz;
+	eq->shift = ilog2((unsigned int)eq->entries);
+
+	if (!eq->hop_num)
+		eq->eqe_ba = eq->buf_list->map;
+	else
+		eq->eqe_ba = eq->l0_dma;
+
+	/* set eqc state */
+	roce_set_field(eqc->byte_4,
+		       HNS_ROCE_EQC_EQ_ST_M,
+		       HNS_ROCE_EQC_EQ_ST_S,
+		       HNS_ROCE_V2_EQ_STATE_VALID);
+
+	/* set eqe hop num */
+	roce_set_field(eqc->byte_4,
+		       HNS_ROCE_EQC_HOP_NUM_M,
+		       HNS_ROCE_EQC_HOP_NUM_S, eq->hop_num);
+
+	/* set eqc over_ignore */
+	roce_set_field(eqc->byte_4,
+		       HNS_ROCE_EQC_OVER_IGNORE_M,
+		       HNS_ROCE_EQC_OVER_IGNORE_S, eq->over_ignore);
+
+	/* set eqc coalesce */
+	roce_set_field(eqc->byte_4,
+		       HNS_ROCE_EQC_COALESCE_M,
+		       HNS_ROCE_EQC_COALESCE_S, eq->coalesce);
+
+	/* set eqc arm_state */
+	roce_set_field(eqc->byte_4,
+		       HNS_ROCE_EQC_ARM_ST_M,
+		       HNS_ROCE_EQC_ARM_ST_S, eq->arm_st);
+
+	/* set eqn */
+	roce_set_field(eqc->byte_4,
+		       HNS_ROCE_EQC_EQN_M,
+		       HNS_ROCE_EQC_EQN_S, eq->eqn);
+
+	/* set eqe_cnt */
+	roce_set_field(eqc->byte_4,
+		       HNS_ROCE_EQC_EQE_CNT_M,
+		       HNS_ROCE_EQC_EQE_CNT_S,
+		       HNS_ROCE_EQ_INIT_EQE_CNT);
+
+	/* set eqe_ba_pg_sz */
+	roce_set_field(eqc->byte_8,
+		       HNS_ROCE_EQC_BA_PG_SZ_M,
+		       HNS_ROCE_EQC_BA_PG_SZ_S, eq->eqe_ba_pg_sz);
+
+	/* set eqe_buf_pg_sz */
+	roce_set_field(eqc->byte_8,
+		       HNS_ROCE_EQC_BUF_PG_SZ_M,
+		       HNS_ROCE_EQC_BUF_PG_SZ_S, eq->eqe_buf_pg_sz);
+
+	/* set eq_producer_idx */
+	roce_set_field(eqc->byte_8,
+		       HNS_ROCE_EQC_PROD_INDX_M,
+		       HNS_ROCE_EQC_PROD_INDX_S,
+		       HNS_ROCE_EQ_INIT_PROD_IDX);
+
+	/* set eq_max_cnt */
+	roce_set_field(eqc->byte_12,
+		       HNS_ROCE_EQC_MAX_CNT_M,
+		       HNS_ROCE_EQC_MAX_CNT_S, eq->eq_max_cnt);
+
+	/* set eq_period */
+	roce_set_field(eqc->byte_12,
+		       HNS_ROCE_EQC_PERIOD_M,
+		       HNS_ROCE_EQC_PERIOD_S, eq->eq_period);
+
+	/* set eqe_report_timer */
+	roce_set_field(eqc->eqe_report_timer,
+		       HNS_ROCE_EQC_REPORT_TIMER_M,
+		       HNS_ROCE_EQC_REPORT_TIMER_S,
+		       HNS_ROCE_EQ_INIT_REPORT_TIMER);
+
+	/* set eqe_ba [34:3] */
+	roce_set_field(eqc->eqe_ba0,
+		       HNS_ROCE_EQC_EQE_BA_L_M,
+		       HNS_ROCE_EQC_EQE_BA_L_S, eq->eqe_ba >> 3);
+
+	/* set eqe_ba [63:35] */
+	roce_set_field(eqc->eqe_ba1,
+		       HNS_ROCE_EQC_EQE_BA_H_M,
+		       HNS_ROCE_EQC_EQE_BA_H_S, eq->eqe_ba >> 35);
+
+	/* set eq shift */
+	roce_set_field(eqc->byte_28,
+		       HNS_ROCE_EQC_SHIFT_M,
+		       HNS_ROCE_EQC_SHIFT_S, eq->shift);
+
+	/* set eq MSI_IDX */
+	roce_set_field(eqc->byte_28,
+		       HNS_ROCE_EQC_MSI_INDX_M,
+		       HNS_ROCE_EQC_MSI_INDX_S,
+		       HNS_ROCE_EQ_INIT_MSI_IDX);
+
+	/* set cur_eqe_ba [27:12] */
+	roce_set_field(eqc->byte_28,
+		       HNS_ROCE_EQC_CUR_EQE_BA_L_M,
+		       HNS_ROCE_EQC_CUR_EQE_BA_L_S, eq->cur_eqe_ba >> 12);
+
+	/* set cur_eqe_ba [59:28] */
+	roce_set_field(eqc->byte_32,
+		       HNS_ROCE_EQC_CUR_EQE_BA_M_M,
+		       HNS_ROCE_EQC_CUR_EQE_BA_M_S, eq->cur_eqe_ba >> 28);
+
+	/* set cur_eqe_ba [63:60] */
+	roce_set_field(eqc->byte_36,
+		       HNS_ROCE_EQC_CUR_EQE_BA_H_M,
+		       HNS_ROCE_EQC_CUR_EQE_BA_H_S, eq->cur_eqe_ba >> 60);
+
+	/* set eq consumer idx */
+	roce_set_field(eqc->byte_36,
+		       HNS_ROCE_EQC_CONS_INDX_M,
+		       HNS_ROCE_EQC_CONS_INDX_S,
+		       HNS_ROCE_EQ_INIT_CONS_IDX);
+
+	/* set nxt_eqe_ba[43:12] */
+	roce_set_field(eqc->nxt_eqe_ba0,
+		       HNS_ROCE_EQC_NXT_EQE_BA_L_M,
+		       HNS_ROCE_EQC_NXT_EQE_BA_L_S, eq->nxt_eqe_ba >> 12);
+
+	/* set nxt_eqe_ba[63:44] */
+	roce_set_field(eqc->nxt_eqe_ba1,
+		       HNS_ROCE_EQC_NXT_EQE_BA_H_M,
+		       HNS_ROCE_EQC_NXT_EQE_BA_H_S, eq->nxt_eqe_ba >> 44);
+}
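/*
 * Editor's note: a standalone sketch of the mask/shift packing that the
 * roce_set_field() calls above perform (read-modify-write of a bit
 * field; the real helper lives in the hns driver headers), and of why
 * eqe_ba is programmed in two pieces: bits [34:3] land in eqe_ba0 and
 * the remaining high bits in eqe_ba1. The address below is made up.
 */
#include <stdint.h>
#include <stdio.h>

#define GENMASK32(h, l) ((~0u >> (31 - (h))) & (~0u << (l)))

static void set_field(uint32_t *word, uint32_t mask, int shift, uint32_t val)
{
	*word = (*word & ~mask) | ((val << shift) & mask);
}

int main(void)
{
	uint32_t byte_4 = 0;
	uint64_t eqe_ba = 0x123456789000ULL;

	/* EQ state (VALID = 1) occupies bits [1:0] of the first word. */
	set_field(&byte_4, GENMASK32(1, 0), 0, 1);
	printf("byte_4 = 0x%x\n", byte_4);

	printf("eqe_ba0 = 0x%x, eqe_ba1 = 0x%x\n",
	       (uint32_t)(eqe_ba >> 3), (uint32_t)(eqe_ba >> 35));
	return 0;
}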
+
+static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
+				  struct hns_roce_eq *eq)
+{
+	struct device *dev = hr_dev->dev;
+	int eq_alloc_done = 0;
+	int eq_buf_cnt = 0;
+	int eqe_alloc;
+	u32 buf_chk_sz;
+	u32 bt_chk_sz;
+	u32 mhop_num;
+	u64 size;
+	u64 idx;
+	int ba_num;
+	int bt_num;
+	int record_i;
+	int record_j;
+	int i = 0;
+	int j = 0;
+
+	mhop_num = hr_dev->caps.eqe_hop_num;
+	buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
+	bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
+
+	ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1)
+		  / buf_chk_sz;
+	bt_num = (ba_num + bt_chk_sz / 8 - 1) / (bt_chk_sz / 8);
+
+	/* hop_num = 0 */
+	if (mhop_num == HNS_ROCE_HOP_NUM_0) {
+		if (eq->entries > buf_chk_sz / eq->eqe_size) {
+			dev_err(dev, "eq entries %d is larger than buf_pg_sz!",
+				eq->entries);
+			return -EINVAL;
+		}
+		eq->bt_l0 = dma_alloc_coherent(dev, eq->entries * eq->eqe_size,
+					       &(eq->l0_dma), GFP_KERNEL);
+		if (!eq->bt_l0)
+			return -ENOMEM;
+
+		eq->cur_eqe_ba = eq->l0_dma;
+		eq->nxt_eqe_ba = 0;
+
+		memset(eq->bt_l0, 0, eq->entries * eq->eqe_size);
+
+		return 0;
+	}
+
+	eq->buf_dma = kcalloc(ba_num, sizeof(*eq->buf_dma), GFP_KERNEL);
+	if (!eq->buf_dma)
+		return -ENOMEM;
+	eq->buf = kcalloc(ba_num, sizeof(*eq->buf), GFP_KERNEL);
+	if (!eq->buf)
+		goto err_kcalloc_buf;
+
+	if (mhop_num == 2) {
+		eq->l1_dma = kcalloc(bt_num, sizeof(*eq->l1_dma), GFP_KERNEL);
+		if (!eq->l1_dma)
+			goto err_kcalloc_l1_dma;
+
+		eq->bt_l1 = kcalloc(bt_num, sizeof(*eq->bt_l1), GFP_KERNEL);
+		if (!eq->bt_l1)
+			goto err_kcalloc_bt_l1;
+	}
+
+	/* alloc L0 BT */
+	eq->bt_l0 = dma_alloc_coherent(dev, bt_chk_sz, &eq->l0_dma, GFP_KERNEL);
+	if (!eq->bt_l0)
+		goto err_dma_alloc_l0;
+
+	if (mhop_num == 1) {
+		if (ba_num > (bt_chk_sz / 8))
+			dev_err(dev, "ba_num %d is too large for 1 hop\n",
+				ba_num);
+
+		/* alloc buf */
+		for (i = 0; i < bt_chk_sz / 8; i++) {
+			if (eq_buf_cnt + 1 < ba_num) {
+				size = buf_chk_sz;
+			} else {
+				eqe_alloc = i * (buf_chk_sz / eq->eqe_size);
+				size = (eq->entries - eqe_alloc) * eq->eqe_size;
+			}
+			eq->buf[i] = dma_alloc_coherent(dev, size,
+							&(eq->buf_dma[i]),
+							GFP_KERNEL);
+			if (!eq->buf[i])
+				goto err_dma_alloc_buf;
+
+			memset(eq->buf[i], 0, size);
+			*(eq->bt_l0 + i) = eq->buf_dma[i];
+
+			eq_buf_cnt++;
+			if (eq_buf_cnt >= ba_num)
+				break;
+		}
+		eq->cur_eqe_ba = eq->buf_dma[0];
+		eq->nxt_eqe_ba = eq->buf_dma[1];
+
+	} else if (mhop_num == 2) {
+		/* alloc L1 BT and buf */
+		for (i = 0; i < bt_chk_sz / 8; i++) {
+			eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz,
+							  &(eq->l1_dma[i]),
+							  GFP_KERNEL);
+			if (!eq->bt_l1[i])
+				goto err_dma_alloc_l1;
+			*(eq->bt_l0 + i) = eq->l1_dma[i];
+
+			for (j = 0; j < bt_chk_sz / 8; j++) {
+				idx = i * bt_chk_sz / 8 + j;
+				if (eq_buf_cnt + 1 < ba_num) {
+					size = buf_chk_sz;
+				} else {
+					eqe_alloc = (buf_chk_sz / eq->eqe_size)
+						    * idx;
+					size = (eq->entries - eqe_alloc)
+						* eq->eqe_size;
+				}
+				eq->buf[idx] = dma_alloc_coherent(dev, size,
+							    &(eq->buf_dma[idx]),
+							    GFP_KERNEL);
+				if (!eq->buf[idx])
+					goto err_dma_alloc_buf;
+
+				memset(eq->buf[idx], 0, size);
+				*(eq->bt_l1[i] + j) = eq->buf_dma[idx];
+
+				eq_buf_cnt++;
+				if (eq_buf_cnt >= ba_num) {
+					eq_alloc_done = 1;
+					break;
+				}
+			}
+
+			if (eq_alloc_done)
+				break;
+		}
+		eq->cur_eqe_ba = eq->buf_dma[0];
+		eq->nxt_eqe_ba = eq->buf_dma[1];
+	}
+
+	eq->l0_last_num = i + 1;
+	if (mhop_num == 2)
+		eq->l1_last_num = j + 1;
+
+	return 0;
+
+err_dma_alloc_l1:
+	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
+	eq->bt_l0 = NULL;
+	eq->l0_dma = 0;
+	for (i -= 1; i >= 0; i--) {
+		dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
+				  eq->l1_dma[i]);
+
+		for (j = 0; j < bt_chk_sz / 8; j++) {
+			idx = i * bt_chk_sz / 8 + j;
+			dma_free_coherent(dev, buf_chk_sz, eq->buf[idx],
+					  eq->buf_dma[idx]);
+		}
+	}
+	goto err_dma_alloc_l0;
+
+err_dma_alloc_buf:
+	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
+	eq->bt_l0 = NULL;
+	eq->l0_dma = 0;
+
+	if (mhop_num == 1)
+		for (i -= 1; i >= 0; i--)
+			dma_free_coherent(dev, buf_chk_sz, eq->buf[i],
+					  eq->buf_dma[i]);
+	else if (mhop_num == 2) {
+		record_i = i;
+		record_j = j;
+		for (; i >= 0; i--) {
+			dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
+					  eq->l1_dma[i]);
+
+			for (j = 0; j < bt_chk_sz / 8; j++) {
+				if (i == record_i && j >= record_j)
+					break;
+
+				idx = i * bt_chk_sz / 8 + j;
+				dma_free_coherent(dev, buf_chk_sz,
+						  eq->buf[idx],
+						  eq->buf_dma[idx]);
+			}
+		}
+	}
+
+err_dma_alloc_l0:
+	kfree(eq->bt_l1);
+	eq->bt_l1 = NULL;
+
+err_kcalloc_bt_l1:
+	kfree(eq->l1_dma);
+	eq->l1_dma = NULL;
+
+err_kcalloc_l1_dma:
+	kfree(eq->buf);
+	eq->buf = NULL;
+
+err_kcalloc_buf:
+	kfree(eq->buf_dma);
+	eq->buf_dma = NULL;
+
+	return -ENOMEM;
+}
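/*
 * Editor's note: a worked example of the sizing math at the top of
 * hns_roce_mhop_alloc_eq(). With hypothetical capabilities -- 4 KB
 * buffer and base-address-table (BT) chunks, 16-byte EQEs, 4096
 * entries -- and each BA slot being 8 bytes:
 */
#include <stdio.h>

int main(void)
{
	unsigned int entries = 4096, eqe_size = 16;
	unsigned int buf_chk_sz = 4096, bt_chk_sz = 4096;

	/* Pages needed for the EQE ring (PAGE_ALIGN omitted here). */
	unsigned int ba_num = (entries * eqe_size + buf_chk_sz - 1)
			      / buf_chk_sz;
	/* BT chunks needed to hold that many 8-byte page addresses. */
	unsigned int bt_num = (ba_num + bt_chk_sz / 8 - 1) / (bt_chk_sz / 8);

	printf("ba_num = %u, bt_num = %u\n", ba_num, bt_num);	/* 16, 1 */
	return 0;
}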
+
+static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev,
+				 struct hns_roce_eq *eq,
+				 unsigned int eq_cmd)
+{
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_cmd_mailbox *mailbox;
+	u32 buf_chk_sz = 0;
+	int ret;
+
+	/* Allocate mailbox memory */
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	if (!hr_dev->caps.eqe_hop_num) {
+		buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
+
+		eq->buf_list = kzalloc(sizeof(struct hns_roce_buf_list),
+				       GFP_KERNEL);
+		if (!eq->buf_list) {
+			ret = -ENOMEM;
+			goto free_cmd_mbox;
+		}
+
+		eq->buf_list->buf = dma_alloc_coherent(dev, buf_chk_sz,
+						       &(eq->buf_list->map),
+						       GFP_KERNEL);
+		if (!eq->buf_list->buf) {
+			ret = -ENOMEM;
+			goto err_alloc_buf;
+		}
+
+		memset(eq->buf_list->buf, 0, buf_chk_sz);
+	} else {
+		ret = hns_roce_mhop_alloc_eq(hr_dev, eq);
+		if (ret)
+			goto free_cmd_mbox;
+	}
+
+	hns_roce_config_eqc(hr_dev, eq, mailbox->buf);
+
+	ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, 0,
+				eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS);
+	if (ret) {
+		dev_err(dev, "[mailbox cmd] creat eqc failed.\n");
+		goto err_cmd_mbox;
+	}
+
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+	return 0;
+
+err_cmd_mbox:
+	if (!hr_dev->caps.eqe_hop_num)
+		dma_free_coherent(dev, buf_chk_sz, eq->buf_list->buf,
+				  eq->buf_list->map);
+	else {
+		hns_roce_mhop_free_eq(hr_dev, eq);
+		goto free_cmd_mbox;
+	}
+
+err_alloc_buf:
+	kfree(eq->buf_list);
+
+free_cmd_mbox:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+	return ret;
+}
+
+static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_eq *eq;
+	unsigned int eq_cmd;
+	int irq_num;
+	int eq_num;
+	int other_num;
+	int comp_num;
+	int aeq_num;
+	int i, j, k;
+	int ret;
+
+	other_num = hr_dev->caps.num_other_vectors;
+	comp_num = hr_dev->caps.num_comp_vectors;
+	aeq_num = hr_dev->caps.num_aeq_vectors;
+
+	eq_num = comp_num + aeq_num;
+	irq_num = eq_num + other_num;
+
+	eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL);
+	if (!eq_table->eq)
+		return -ENOMEM;
+
+	for (i = 0; i < irq_num; i++) {
+		hr_dev->irq_names[i] = kzalloc(HNS_ROCE_INT_NAME_LEN,
+					       GFP_KERNEL);
+		if (!hr_dev->irq_names[i]) {
+			ret = -ENOMEM;
+			goto err_failed_kzalloc;
+		}
+	}
+
+	/* create eq */
+	for (j = 0; j < eq_num; j++) {
+		eq = &eq_table->eq[j];
+		eq->hr_dev = hr_dev;
+		eq->eqn = j;
+		if (j < comp_num) {
+			/* CEQ */
+			eq_cmd = HNS_ROCE_CMD_CREATE_CEQC;
+			eq->type_flag = HNS_ROCE_CEQ;
+			eq->entries = hr_dev->caps.ceqe_depth;
+			eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE;
+			eq->irq = hr_dev->irq[j + other_num + aeq_num];
+			eq->eq_max_cnt = HNS_ROCE_CEQ_DEFAULT_BURST_NUM;
+			eq->eq_period = HNS_ROCE_CEQ_DEFAULT_INTERVAL;
+		} else {
+			/* AEQ */
+			eq_cmd = HNS_ROCE_CMD_CREATE_AEQC;
+			eq->type_flag = HNS_ROCE_AEQ;
+			eq->entries = hr_dev->caps.aeqe_depth;
+			eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE;
+			eq->irq = hr_dev->irq[j - comp_num + other_num];
+			eq->eq_max_cnt = HNS_ROCE_AEQ_DEFAULT_BURST_NUM;
+			eq->eq_period = HNS_ROCE_AEQ_DEFAULT_INTERVAL;
+		}
+
+		ret = hns_roce_v2_create_eq(hr_dev, eq, eq_cmd);
+		if (ret) {
+			dev_err(dev, "eq create failed.\n");
+			goto err_create_eq_fail;
+		}
+	}
+
+	/* enable irq */
+	hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_ENABLE);
+
+	/* irq contains: abnormal + AEQ + CEQ */
+	for (k = 0; k < irq_num; k++)
+		if (k < other_num)
+			snprintf((char *)hr_dev->irq_names[k],
+				 HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", k);
+		else if (k < (other_num + aeq_num))
+			snprintf((char *)hr_dev->irq_names[k],
+				 HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d",
+				 k - other_num);
+		else
+			snprintf((char *)hr_dev->irq_names[k],
+				 HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d",
+				 k - other_num - aeq_num);
+
+	for (k = 0; k < irq_num; k++) {
+		if (k < other_num)
+			ret = request_irq(hr_dev->irq[k],
+					  hns_roce_v2_msix_interrupt_abn,
+					  0, hr_dev->irq_names[k], hr_dev);
+
+		else if (k < (other_num + comp_num))
+			ret = request_irq(eq_table->eq[k - other_num].irq,
+					  hns_roce_v2_msix_interrupt_eq,
+					  0, hr_dev->irq_names[k + aeq_num],
+					  &eq_table->eq[k - other_num]);
+		else
+			ret = request_irq(eq_table->eq[k - other_num].irq,
+					  hns_roce_v2_msix_interrupt_eq,
+					  0, hr_dev->irq_names[k - comp_num],
+					  &eq_table->eq[k - other_num]);
+		if (ret) {
+			dev_err(dev, "Request irq error!\n");
+			goto err_request_irq_fail;
+		}
+	}
+
+	return 0;
+
+err_request_irq_fail:
+	for (k -= 1; k >= 0; k--)
+		if (k < other_num)
+			free_irq(hr_dev->irq[k], hr_dev);
+		else
+			free_irq(eq_table->eq[k - other_num].irq,
+				 &eq_table->eq[k - other_num]);
+
+err_create_eq_fail:
+	for (j -= 1; j >= 0; j--)
+		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[j]);
+
+err_failed_kzalloc:
+	for (i -= 1; i >= 0; i--)
+		kfree(hr_dev->irq_names[i]);
+	kfree(eq_table->eq);
+
+	return ret;
+}
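/*
 * Editor's note: the request_irq() loop above assumes a fixed vector
 * layout -- abnormal vectors first, then AEQ vectors, then CEQ vectors
 * -- while eq_table->eq[] stores CEQs before AEQs, which is why the two
 * name-index expressions differ. A standalone sketch with made-up counts:
 */
#include <stdio.h>

int main(void)
{
	int other_num = 1, aeq_num = 1, comp_num = 4;
	int irq_num = other_num + aeq_num + comp_num;
	int k;

	for (k = 0; k < irq_num; k++) {
		if (k < other_num)
			printf("irq %d -> hns-abn-%d\n", k, k);
		else if (k < other_num + aeq_num)
			printf("irq %d -> hns-aeq-%d\n", k, k - other_num);
		else
			printf("irq %d -> hns-ceq-%d\n", k,
			       k - other_num - aeq_num);
	}
	return 0;
}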
+
+static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_eq_table *eq_table = &hr_dev->eq_table;
+	int irq_num;
+	int eq_num;
+	int i;
+
+	eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors;
+	irq_num = eq_num + hr_dev->caps.num_other_vectors;
+
+	/* Disable irq */
+	hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE);
+
+	for (i = 0; i < hr_dev->caps.num_other_vectors; i++)
+		free_irq(hr_dev->irq[i], hr_dev);
+
+	for (i = 0; i < eq_num; i++) {
+		hns_roce_v2_destroy_eqc(hr_dev, i);
+
+		free_irq(eq_table->eq[i].irq, &eq_table->eq[i]);
+
+		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
+	}
+
+	for (i = 0; i < irq_num; i++)
+		kfree(hr_dev->irq_names[i]);
+
+	kfree(eq_table->eq);
+}
+
 static const struct hns_roce_hw hns_roce_hw_v2 = {
 	.cmq_init = hns_roce_v2_cmq_init,
 	.cmq_exit = hns_roce_v2_cmq_exit,
@@ -3183,6 +4646,8 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
 	.post_recv = hns_roce_v2_post_recv,
 	.req_notify_cq = hns_roce_v2_req_notify_cq,
 	.poll_cq = hns_roce_v2_poll_cq,
+	.init_eq = hns_roce_v2_init_eq_table,
+	.cleanup_eq = hns_roce_v2_cleanup_eq_table,
 };
 
 static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = {
@@ -3197,6 +4662,7 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
 				  struct hnae3_handle *handle)
 {
 	const struct pci_device_id *id;
+	int i;
 
 	id = pci_match_id(hns_roce_hw_v2_pci_tbl, hr_dev->pci_dev);
 	if (!id) {
@@ -3214,8 +4680,15 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
 	hr_dev->iboe.netdevs[0] = handle->rinfo.netdev;
 	hr_dev->iboe.phy_port[0] = 0;
 
+	addrconf_addr_eui48((u8 *)&hr_dev->ib_dev.node_guid,
+			    hr_dev->iboe.netdevs[0]->dev_addr);
+
+	for (i = 0; i < HNS_ROCE_V2_MAX_IRQ_NUM; i++)
+		hr_dev->irq[i] = pci_irq_vector(handle->pdev,
+						i + handle->rinfo.base_vector);
+
 	/* cmd issue mode: 0 is poll, 1 is event */
-	hr_dev->cmd_mod = 0;
+	hr_dev->cmd_mod = 1;
 	hr_dev->loop_idc = 0;
 
 	return 0;

+ 278 - 5
drivers/infiniband/hw/hns/hns_roce_hw_v2.h

@@ -53,6 +53,10 @@
 #define HNS_ROCE_V2_MAX_SQ_INLINE		0x20
 #define HNS_ROCE_V2_UAR_NUM			256
 #define HNS_ROCE_V2_PHY_UAR_NUM			1
+#define HNS_ROCE_V2_MAX_IRQ_NUM			65
+#define HNS_ROCE_V2_COMP_VEC_NUM		63
+#define HNS_ROCE_V2_AEQE_VEC_NUM		1
+#define HNS_ROCE_V2_ABNORMAL_VEC_NUM		1
 #define HNS_ROCE_V2_MAX_MTPT_NUM		0x8000
 #define HNS_ROCE_V2_MAX_MTT_SEGS		0x1000000
 #define HNS_ROCE_V2_MAX_CQE_SEGS		0x1000000
@@ -78,6 +82,8 @@
 #define HNS_ROCE_MTT_HOP_NUM			1
 #define HNS_ROCE_CQE_HOP_NUM			1
 #define HNS_ROCE_PBL_HOP_NUM			2
+#define HNS_ROCE_EQE_HOP_NUM			2
+
 #define HNS_ROCE_V2_GID_INDEX_NUM		256
 
 #define HNS_ROCE_V2_TABLE_CHUNK_SIZE		(1 << 18)
@@ -105,6 +111,12 @@
 	(step_idx == 1 && hop_num == 1) || \
 	(step_idx == 2 && hop_num == 2))
 
+enum {
+	NO_ARMED = 0x0,
+	REG_NXT_CEQE = 0x2,
+	REG_NXT_SE_CEQE = 0x3
+};
+
 #define V2_CQ_DB_REQ_NOT_SOL			0
 #define V2_CQ_DB_REQ_NOT			1
 
@@ -229,6 +241,9 @@ struct hns_roce_v2_cq_context {
 	u32	cqe_report_timer;
 	u32	byte_64_se_cqe_idx;
 };
+#define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0
+#define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL	0x0
+
 #define	V2_CQC_BYTE_4_CQ_ST_S 0
 #define V2_CQC_BYTE_4_CQ_ST_M GENMASK(1, 0)
 
@@ -747,11 +762,14 @@ struct hns_roce_v2_qp_context {
 
 struct hns_roce_v2_cqe {
 	u32	byte_4;
-	u32	rkey_immtdata;
+	union {
+		__le32 rkey;
+		__be32 immtdata;
+	};
 	u32	byte_12;
 	u32	byte_16;
 	u32	byte_cnt;
-	u32	smac;
+	u8	smac[4];
 	u32	byte_28;
 	u32	byte_32;
 };
@@ -901,6 +919,90 @@ struct hns_roce_v2_cq_db {
 
 #define V2_CQ_DB_PARAMETER_NOTIFY_S 24
 
+struct hns_roce_v2_ud_send_wqe {
+	u32	byte_4;
+	u32	msg_len;
+	u32	immtdata;
+	u32	byte_16;
+	u32	byte_20;
+	u32	byte_24;
+	u32	qkey;
+	u32	byte_32;
+	u32	byte_36;
+	u32	byte_40;
+	u32	dmac;
+	u32	byte_48;
+	u8	dgid[GID_LEN_V2];
+};
+#define	V2_UD_SEND_WQE_BYTE_4_OPCODE_S 0
+#define V2_UD_SEND_WQE_BYTE_4_OPCODE_M GENMASK(4, 0)
+
+#define	V2_UD_SEND_WQE_BYTE_4_OWNER_S 7
+
+#define	V2_UD_SEND_WQE_BYTE_4_CQE_S 8
+
+#define	V2_UD_SEND_WQE_BYTE_4_SE_S 11
+
+#define	V2_UD_SEND_WQE_BYTE_16_PD_S 0
+#define V2_UD_SEND_WQE_BYTE_16_PD_M GENMASK(23, 0)
+
+#define	V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S 24
+#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M GENMASK(31, 24)
+
+#define	V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0
+#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0)
+
+#define	V2_UD_SEND_WQE_BYTE_24_UDPSPN_S 16
+#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_M GENMASK(31, 16)
+
+#define	V2_UD_SEND_WQE_BYTE_32_DQPN_S 0
+#define V2_UD_SEND_WQE_BYTE_32_DQPN_M GENMASK(23, 0)
+
+#define	V2_UD_SEND_WQE_BYTE_36_VLAN_S 0
+#define V2_UD_SEND_WQE_BYTE_36_VLAN_M GENMASK(15, 0)
+
+#define	V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S 16
+#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M GENMASK(23, 16)
+
+#define	V2_UD_SEND_WQE_BYTE_36_TCLASS_S 24
+#define V2_UD_SEND_WQE_BYTE_36_TCLASS_M GENMASK(31, 24)
+
+#define	V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S 0
+#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M GENMASK(19, 0)
+
+#define	V2_UD_SEND_WQE_BYTE_40_SL_S 20
+#define V2_UD_SEND_WQE_BYTE_40_SL_M GENMASK(23, 20)
+
+#define	V2_UD_SEND_WQE_BYTE_40_PORTN_S 24
+#define V2_UD_SEND_WQE_BYTE_40_PORTN_M GENMASK(26, 24)
+
+#define	V2_UD_SEND_WQE_BYTE_40_LBI_S 31
+
+#define	V2_UD_SEND_WQE_DMAC_0_S 0
+#define V2_UD_SEND_WQE_DMAC_0_M GENMASK(7, 0)
+
+#define	V2_UD_SEND_WQE_DMAC_1_S 8
+#define V2_UD_SEND_WQE_DMAC_1_M GENMASK(15, 8)
+
+#define	V2_UD_SEND_WQE_DMAC_2_S 16
+#define V2_UD_SEND_WQE_DMAC_2_M GENMASK(23, 16)
+
+#define	V2_UD_SEND_WQE_DMAC_3_S 24
+#define V2_UD_SEND_WQE_DMAC_3_M GENMASK(31, 24)
+
+#define	V2_UD_SEND_WQE_BYTE_48_DMAC_4_S 0
+#define V2_UD_SEND_WQE_BYTE_48_DMAC_4_M GENMASK(7, 0)
+
+#define	V2_UD_SEND_WQE_BYTE_48_DMAC_5_S 8
+#define V2_UD_SEND_WQE_BYTE_48_DMAC_5_M GENMASK(15, 8)
+
+#define	V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S 16
+#define V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M GENMASK(23, 16)
+
+#define	V2_UD_SEND_WQE_BYTE_48_SMAC_INDX_S 24
+#define V2_UD_SEND_WQE_BYTE_48_SMAC_INDX_M GENMASK(31, 24)
+
 struct hns_roce_v2_rc_send_wqe {
 	u32		byte_4;
 	u32		msg_len;
@@ -1129,9 +1231,6 @@ struct hns_roce_cmq_desc {
 	u32 data[6];
 };
 
-#define ROCEE_VF_MB_CFG0_REG		0x40
-#define ROCEE_VF_MB_STATUS_REG		0x58
-
 #define HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS	10000
 
 #define HNS_ROCE_HW_RUN_BIT_SHIFT	31
@@ -1174,4 +1273,178 @@ struct hns_roce_v2_priv {
 	struct hns_roce_v2_cmq cmq;
 };
 
+struct hns_roce_eq_context {
+	u32	byte_4;
+	u32	byte_8;
+	u32	byte_12;
+	u32	eqe_report_timer;
+	u32	eqe_ba0;
+	u32	eqe_ba1;
+	u32	byte_28;
+	u32	byte_32;
+	u32	byte_36;
+	u32	nxt_eqe_ba0;
+	u32	nxt_eqe_ba1;
+	u32	rsv[5];
+};
+
+#define HNS_ROCE_AEQ_DEFAULT_BURST_NUM	0x0
+#define HNS_ROCE_AEQ_DEFAULT_INTERVAL	0x0
+#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM	0x0
+#define HNS_ROCE_CEQ_DEFAULT_INTERVAL	0x0
+
+#define HNS_ROCE_V2_EQ_STATE_INVALID		0
+#define HNS_ROCE_V2_EQ_STATE_VALID		1
+#define HNS_ROCE_V2_EQ_STATE_OVERFLOW		2
+#define HNS_ROCE_V2_EQ_STATE_FAILURE		3
+
+#define HNS_ROCE_V2_EQ_OVER_IGNORE_0		0
+#define HNS_ROCE_V2_EQ_OVER_IGNORE_1		1
+
+#define HNS_ROCE_V2_EQ_COALESCE_0		0
+#define HNS_ROCE_V2_EQ_COALESCE_1		1
+
+#define HNS_ROCE_V2_EQ_FIRED			0
+#define HNS_ROCE_V2_EQ_ARMED			1
+#define HNS_ROCE_V2_EQ_ALWAYS_ARMED		3
+
+#define HNS_ROCE_EQ_INIT_EQE_CNT		0
+#define HNS_ROCE_EQ_INIT_PROD_IDX		0
+#define HNS_ROCE_EQ_INIT_REPORT_TIMER		0
+#define HNS_ROCE_EQ_INIT_MSI_IDX		0
+#define HNS_ROCE_EQ_INIT_CONS_IDX		0
+#define HNS_ROCE_EQ_INIT_NXT_EQE_BA		0
+
+#define HNS_ROCE_V2_CEQ_CEQE_OWNER_S		31
+#define HNS_ROCE_V2_AEQ_AEQE_OWNER_S		31
+
+#define HNS_ROCE_V2_COMP_EQE_NUM		0x1000
+#define HNS_ROCE_V2_ASYNC_EQE_NUM		0x1000
+
+#define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S	0
+#define HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S		1
+#define HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S	2
+
+#define HNS_ROCE_EQ_DB_CMD_AEQ			0x0
+#define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED		0x1
+#define HNS_ROCE_EQ_DB_CMD_CEQ			0x2
+#define HNS_ROCE_EQ_DB_CMD_CEQ_ARMED		0x3
+
+#define EQ_ENABLE				1
+#define EQ_DISABLE				0
+
+#define EQ_REG_OFFSET				0x4
+
+#define HNS_ROCE_INT_NAME_LEN			32
+#define HNS_ROCE_V2_EQN_M GENMASK(23, 0)
+
+#define HNS_ROCE_V2_CONS_IDX_M GENMASK(23, 0)
+
+#define HNS_ROCE_V2_VF_ABN_INT_EN_S 0
+#define HNS_ROCE_V2_VF_ABN_INT_EN_M GENMASK(0, 0)
+#define HNS_ROCE_V2_VF_ABN_INT_ST_M GENMASK(2, 0)
+#define HNS_ROCE_V2_VF_ABN_INT_CFG_M GENMASK(2, 0)
+#define HNS_ROCE_V2_VF_EVENT_INT_EN_M GENMASK(0, 0)
+
+/* WORD0 */
+#define HNS_ROCE_EQC_EQ_ST_S 0
+#define HNS_ROCE_EQC_EQ_ST_M GENMASK(1, 0)
+
+#define HNS_ROCE_EQC_HOP_NUM_S 2
+#define HNS_ROCE_EQC_HOP_NUM_M GENMASK(3, 2)
+
+#define HNS_ROCE_EQC_OVER_IGNORE_S 4
+#define HNS_ROCE_EQC_OVER_IGNORE_M GENMASK(4, 4)
+
+#define HNS_ROCE_EQC_COALESCE_S 5
+#define HNS_ROCE_EQC_COALESCE_M GENMASK(5, 5)
+
+#define HNS_ROCE_EQC_ARM_ST_S 6
+#define HNS_ROCE_EQC_ARM_ST_M GENMASK(7, 6)
+
+#define HNS_ROCE_EQC_EQN_S 8
+#define HNS_ROCE_EQC_EQN_M GENMASK(15, 8)
+
+#define HNS_ROCE_EQC_EQE_CNT_S 16
+#define HNS_ROCE_EQC_EQE_CNT_M GENMASK(31, 16)
+
+/* WORD1 */
+#define HNS_ROCE_EQC_BA_PG_SZ_S 0
+#define HNS_ROCE_EQC_BA_PG_SZ_M GENMASK(3, 0)
+
+#define HNS_ROCE_EQC_BUF_PG_SZ_S 4
+#define HNS_ROCE_EQC_BUF_PG_SZ_M GENMASK(7, 4)
+
+#define HNS_ROCE_EQC_PROD_INDX_S 8
+#define HNS_ROCE_EQC_PROD_INDX_M GENMASK(31, 8)
+
+/* WORD2 */
+#define HNS_ROCE_EQC_MAX_CNT_S 0
+#define HNS_ROCE_EQC_MAX_CNT_M GENMASK(15, 0)
+
+#define HNS_ROCE_EQC_PERIOD_S 16
+#define HNS_ROCE_EQC_PERIOD_M GENMASK(31, 16)
+
+/* WORD3 */
+#define HNS_ROCE_EQC_REPORT_TIMER_S 0
+#define HNS_ROCE_EQC_REPORT_TIMER_M GENMASK(31, 0)
+
+/* WORD4 */
+#define HNS_ROCE_EQC_EQE_BA_L_S 0
+#define HNS_ROCE_EQC_EQE_BA_L_M GENMASK(31, 0)
+
+/* WORD5 */
+#define HNS_ROCE_EQC_EQE_BA_H_S 0
+#define HNS_ROCE_EQC_EQE_BA_H_M GENMASK(28, 0)
+
+/* WORD6 */
+#define HNS_ROCE_EQC_SHIFT_S 0
+#define HNS_ROCE_EQC_SHIFT_M GENMASK(7, 0)
+
+#define HNS_ROCE_EQC_MSI_INDX_S 8
+#define HNS_ROCE_EQC_MSI_INDX_M GENMASK(15, 8)
+
+#define HNS_ROCE_EQC_CUR_EQE_BA_L_S 16
+#define HNS_ROCE_EQC_CUR_EQE_BA_L_M GENMASK(31, 16)
+
+/* WORD7 */
+#define HNS_ROCE_EQC_CUR_EQE_BA_M_S 0
+#define HNS_ROCE_EQC_CUR_EQE_BA_M_M GENMASK(31, 0)
+
+/* WORD8 */
+#define HNS_ROCE_EQC_CUR_EQE_BA_H_S 0
+#define HNS_ROCE_EQC_CUR_EQE_BA_H_M GENMASK(3, 0)
+
+#define HNS_ROCE_EQC_CONS_INDX_S 8
+#define HNS_ROCE_EQC_CONS_INDX_M GENMASK(31, 8)
+
+/* WORD9 */
+#define HNS_ROCE_EQC_NXT_EQE_BA_L_S 0
+#define HNS_ROCE_EQC_NXT_EQE_BA_L_M GENMASK(31, 0)
+
+/* WORD10 */
+#define HNS_ROCE_EQC_NXT_EQE_BA_H_S 0
+#define HNS_ROCE_EQC_NXT_EQE_BA_H_M GENMASK(19, 0)
+
+#define HNS_ROCE_V2_CEQE_COMP_CQN_S 0
+#define HNS_ROCE_V2_CEQE_COMP_CQN_M GENMASK(23, 0)
+
+#define HNS_ROCE_V2_AEQE_EVENT_TYPE_S 0
+#define HNS_ROCE_V2_AEQE_EVENT_TYPE_M GENMASK(7, 0)
+
+#define HNS_ROCE_V2_AEQE_SUB_TYPE_S 8
+#define HNS_ROCE_V2_AEQE_SUB_TYPE_M GENMASK(15, 8)
+
+#define HNS_ROCE_V2_EQ_DB_CMD_S	16
+#define HNS_ROCE_V2_EQ_DB_CMD_M	GENMASK(17, 16)
+
+#define HNS_ROCE_V2_EQ_DB_TAG_S	0
+#define HNS_ROCE_V2_EQ_DB_TAG_M	GENMASK(7, 0)
+
+#define HNS_ROCE_V2_EQ_DB_PARA_S 0
+#define HNS_ROCE_V2_EQ_DB_PARA_M GENMASK(23, 0)
+
+#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0
+#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0)
+
 #endif

+ 6 - 10
drivers/infiniband/hw/hns/hns_roce_main.c

@@ -748,12 +748,10 @@ int hns_roce_init(struct hns_roce_dev *hr_dev)
 		goto error_failed_cmd_init;
 	}
 
-	if (hr_dev->cmd_mod) {
-		ret = hns_roce_init_eq_table(hr_dev);
-		if (ret) {
-			dev_err(dev, "eq init failed!\n");
-			goto error_failed_eq_table;
-		}
+	ret = hr_dev->hw->init_eq(hr_dev);
+	if (ret) {
+		dev_err(dev, "eq init failed!\n");
+		goto error_failed_eq_table;
 	}
 
 	if (hr_dev->cmd_mod) {
@@ -805,8 +803,7 @@ error_failed_init_hem:
 		hns_roce_cmd_use_polling(hr_dev);
 
 error_failed_use_event:
-	if (hr_dev->cmd_mod)
-		hns_roce_cleanup_eq_table(hr_dev);
+	hr_dev->hw->cleanup_eq(hr_dev);
 
 error_failed_eq_table:
 	hns_roce_cmd_cleanup(hr_dev);
@@ -837,8 +834,7 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev)
 	if (hr_dev->cmd_mod)
 		hns_roce_cmd_use_polling(hr_dev);
 
-	if (hr_dev->cmd_mod)
-		hns_roce_cleanup_eq_table(hr_dev);
+	hr_dev->hw->cleanup_eq(hr_dev);
 	hns_roce_cmd_cleanup(hr_dev);
 	if (hr_dev->hw->cmq_exit)
 		hr_dev->hw->cmq_exit(hr_dev);

+ 63 - 9
drivers/infiniband/hw/hns/hns_roce_qp.c

@@ -65,6 +65,7 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
 	if (atomic_dec_and_test(&qp->refcount))
 		complete(&qp->free);
 }
+EXPORT_SYMBOL_GPL(hns_roce_qp_event);
 
 static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp,
 				 enum hns_roce_event type)
@@ -454,6 +455,13 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 		hr_qp->sge.sge_shift = 4;
 	}
 
+	/* UD send WQEs use the extended SGE space */
+	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) {
+		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
+				     hr_qp->sq.max_gs);
+		hr_qp->sge.sge_shift = 4;
+	}
+
 	/* Get buf size, SQ and RQ are aligned to PAGE_SIZE */
 	page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
 	hr_qp->sq.offset = 0;
@@ -493,6 +501,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	int ret = 0;
 	u32 page_shift;
 	u32 npages;
+	int i;
 
 	mutex_init(&hr_qp->mutex);
 	spin_lock_init(&hr_qp->sq.lock);
@@ -500,6 +509,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 
 	hr_qp->state = IB_QPS_RESET;
 
+	hr_qp->ibqp.qp_type = init_attr->qp_type;
+
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
 		hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR;
 	else
@@ -512,18 +523,48 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		goto err_out;
 	}
 
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+		/* allocate recv inline buf */
+		hr_qp->rq_inl_buf.wqe_list = kcalloc(hr_qp->rq.wqe_cnt,
+					       sizeof(struct hns_roce_rinl_wqe),
+					       GFP_KERNEL);
+		if (!hr_qp->rq_inl_buf.wqe_list) {
+			ret = -ENOMEM;
+			goto err_out;
+		}
+
+		hr_qp->rq_inl_buf.wqe_cnt = hr_qp->rq.wqe_cnt;
+
+		/* First, allocate one buffer covering all SGE slots */
+		hr_qp->rq_inl_buf.wqe_list[0].sg_list =
+					kcalloc(hr_qp->rq_inl_buf.wqe_cnt,
+					       init_attr->cap.max_recv_sge *
+					       sizeof(struct hns_roce_rinl_sge),
+					       GFP_KERNEL);
+		if (!hr_qp->rq_inl_buf.wqe_list[0].sg_list) {
+			ret = -ENOMEM;
+			goto err_wqe_list;
+		}
+
+		for (i = 1; i < hr_qp->rq_inl_buf.wqe_cnt; i++)
+			/* then point each WQE at its slice of that buffer */
+			hr_qp->rq_inl_buf.wqe_list[i].sg_list =
+				&hr_qp->rq_inl_buf.wqe_list[0].sg_list[i *
+				init_attr->cap.max_recv_sge];
+	}
+
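/*
 * Editor's note: the receive-inline setup above is a single-allocation
 * slicing pattern -- one kcalloc() provides SGE storage for every WQE,
 * and wqe_list[i].sg_list simply points at slice i, so only
 * wqe_list[0].sg_list is ever passed to kfree(). A hypothetical
 * userspace model:
 */
#include <stdio.h>
#include <stdlib.h>

struct sge { unsigned long addr; };
struct wqe { struct sge *sg_list; };

int main(void)
{
	int wqe_cnt = 4, max_sge = 2, i;
	struct wqe *wqe_list = calloc(wqe_cnt, sizeof(*wqe_list));
	struct sge *pool = calloc((size_t)wqe_cnt * max_sge, sizeof(*pool));

	if (!wqe_list || !pool)
		return 1;
	for (i = 0; i < wqe_cnt; i++)
		wqe_list[i].sg_list = &pool[i * max_sge];	/* slice i */

	printf("wqe 2 starts at pool[%d]\n", 2 * max_sge);	/* pool[4] */
	free(pool);		/* i.e. wqe_list[0].sg_list */
	free(wqe_list);
	return 0;
}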
 	if (ib_pd->uobject) {
 		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
 			dev_err(dev, "ib_copy_from_udata error for create qp\n");
 			ret = -EFAULT;
-			goto err_out;
+			goto err_rq_sge_list;
 		}
 
 		ret = hns_roce_set_user_sq_size(hr_dev, &init_attr->cap, hr_qp,
 						&ucmd);
 		if (ret) {
 			dev_err(dev, "hns_roce_set_user_sq_size error for create qp\n");
-			goto err_out;
+			goto err_rq_sge_list;
 		}
 
 		hr_qp->umem = ib_umem_get(ib_pd->uobject->context,
@@ -532,7 +573,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		if (IS_ERR(hr_qp->umem)) {
 			dev_err(dev, "ib_umem_get error for create qp\n");
 			ret = PTR_ERR(hr_qp->umem);
-			goto err_out;
+			goto err_rq_sge_list;
 		}
 
 		hr_qp->mtt.mtt_type = MTT_TYPE_WQE;
@@ -566,13 +607,13 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
 			dev_err(dev, "init_attr->create_flags error!\n");
 			ret = -EINVAL;
-			goto err_out;
+			goto err_rq_sge_list;
 		}
 
 		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) {
 			dev_err(dev, "init_attr->create_flags error!\n");
 			ret = -EINVAL;
-			goto err_out;
+			goto err_rq_sge_list;
 		}
 
 		/* Set SQ size */
@@ -580,7 +621,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 						  hr_qp);
 		if (ret) {
 			dev_err(dev, "hns_roce_set_kernel_sq_size error!\n");
-			goto err_out;
+			goto err_rq_sge_list;
 		}
 
 		/* QP doorbell register address */
@@ -596,7 +637,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				       &hr_qp->hr_buf, page_shift)) {
 			dev_err(dev, "hns_roce_buf_alloc error!\n");
 			ret = -ENOMEM;
-			goto err_out;
+			goto err_rq_sge_list;
 		}
 
 		hr_qp->mtt.mtt_type = MTT_TYPE_WQE;
@@ -678,6 +719,14 @@ err_buf:
 	else
 		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
 
+err_rq_sge_list:
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE)
+		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
+
+err_wqe_list:
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE)
+		kfree(hr_qp->rq_inl_buf.wqe_list);
+
 err_out:
 	return ret;
 }
@@ -724,8 +773,13 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 		hr_qp = &hr_sqp->hr_qp;
 		hr_qp->port = init_attr->port_num - 1;
 		hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
-		hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS +
-				     hr_dev->iboe.phy_port[hr_qp->port];
+
+		/* when hw version is v1, the sqpn is allocated */
+		if (hr_dev->caps.max_sq_sg <= 2)
+			hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS +
+					     hr_dev->iboe.phy_port[hr_qp->port];
+		else
+			hr_qp->ibqp.qp_num = 1;
 
 		ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata,
 						hr_qp->ibqp.qp_num, hr_qp);

+ 0 - 1
drivers/infiniband/hw/i40iw/Kconfig

@@ -5,4 +5,3 @@ config INFINIBAND_I40IW
 	select GENERIC_ALLOCATOR
 	---help---
 	Intel(R) Ethernet X722 iWARP Driver
-	INET && I40IW && INFINIBAND && I40E

+ 3 - 0
drivers/infiniband/hw/i40iw/i40iw.h

@@ -587,5 +587,8 @@ int i40iw_inet6addr_event(struct notifier_block *notifier,
 int i40iw_net_event(struct notifier_block *notifier,
 		    unsigned long event,
 		    void *ptr);
+int i40iw_netdevice_event(struct notifier_block *notifier,
+			  unsigned long event,
+			  void *ptr);
 
 #endif

+ 39 - 29
drivers/infiniband/hw/i40iw/i40iw_cm.c

@@ -92,14 +92,9 @@ void i40iw_free_sqbuf(struct i40iw_sc_vsi *vsi, void *bufp)
 static u8 i40iw_derive_hw_ird_setting(u16 cm_ird)
 {
 	u8 encoded_ird_size;
-	u8 pof2_cm_ird = 1;
-
-	/* round-off to next powerof2 */
-	while (pof2_cm_ird < cm_ird)
-		pof2_cm_ird *= 2;
 
 	/* ird_size field is encoded in qp_ctx */
-	switch (pof2_cm_ird) {
+	switch (cm_ird ? roundup_pow_of_two(cm_ird) : 0) {
 	case I40IW_HW_IRD_SETTING_64:
 		encoded_ird_size = 3;
 		break;
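/*
 * Editor's note: roundup_pow_of_two() returns the smallest power of two
 * >= its argument (100 -> 128, 64 -> 64); the ternary keeps cm_ird == 0
 * away from the helper, whose result is undefined for 0, and routes it
 * to the switch's default arm. A standalone equivalent of the rounding:
 */
#include <stdio.h>

static unsigned long my_roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	printf("%lu %lu %lu\n", my_roundup_pow_of_two(100),
	       my_roundup_pow_of_two(64), my_roundup_pow_of_two(1));
	/* prints: 128 64 1 */
	return 0;
}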
@@ -125,13 +120,16 @@ static u8 i40iw_derive_hw_ird_setting(u16 cm_ird)
  * @conn_ird: connection IRD
  * @conn_ord: connection ORD
  */
-static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u16 conn_ird, u16 conn_ord)
+static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u32 conn_ird,
+				 u32 conn_ord)
 {
 	if (conn_ird > I40IW_MAX_IRD_SIZE)
 		conn_ird = I40IW_MAX_IRD_SIZE;
 
 	if (conn_ord > I40IW_MAX_ORD_SIZE)
 		conn_ord = I40IW_MAX_ORD_SIZE;
+	else if (!conn_ord && cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO)
+		conn_ord = 1;
 
 	cm_node->ird_size = conn_ird;
 	cm_node->ord_size = conn_ord;
@@ -2878,15 +2876,13 @@ static struct i40iw_cm_listener *i40iw_make_listen_node(
  * i40iw_create_cm_node - make a connection node with params
  * @cm_core: cm's core
  * @iwdev: iwarp device structure
- * @private_data_len: len to provate data for mpa request
- * @private_data: pointer to private data for connection
+ * @conn_param: upper layer connection parameters
  * @cm_info: quad info for connection
  */
 static struct i40iw_cm_node *i40iw_create_cm_node(
 					struct i40iw_cm_core *cm_core,
 					struct i40iw_device *iwdev,
-					u16 private_data_len,
-					void *private_data,
+					struct iw_cm_conn_param *conn_param,
 					struct i40iw_cm_info *cm_info)
 {
 	struct i40iw_cm_node *cm_node;
@@ -2894,6 +2890,9 @@ static struct i40iw_cm_node *i40iw_create_cm_node(
 	struct i40iw_cm_node *loopback_remotenode;
 	struct i40iw_cm_info loopback_cm_info;
 
+	u16 private_data_len = conn_param->private_data_len;
+	const void *private_data = conn_param->private_data;
+
 	/* create a CM connection node */
 	cm_node = i40iw_make_cm_node(cm_core, iwdev, cm_info, NULL);
 	if (!cm_node)
@@ -2902,6 +2901,8 @@ static struct i40iw_cm_node *i40iw_create_cm_node(
 	cm_node->tcp_cntxt.client = 1;
 	cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE;
 
+	i40iw_record_ird_ord(cm_node, conn_param->ird, conn_param->ord);
+
 	if (!memcmp(cm_info->loc_addr, cm_info->rem_addr, sizeof(cm_info->loc_addr))) {
 		loopback_remotelistener = i40iw_find_listener(
 						cm_core,
@@ -2935,6 +2936,10 @@ static struct i40iw_cm_node *i40iw_create_cm_node(
 			       private_data_len);
 			loopback_remotenode->pdata.size = private_data_len;
 
+			if (loopback_remotenode->ord_size > cm_node->ird_size)
+				loopback_remotenode->ord_size =
+					cm_node->ird_size;
+
 			cm_node->state = I40IW_CM_STATE_OFFLOADED;
 			cm_node->tcp_cntxt.rcv_nxt =
 				loopback_remotenode->tcp_cntxt.loc_seq_num;
@@ -3691,7 +3696,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	cm_node->qhash_set = false;
 	i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL);
 
-	cm_node->accelerated = 1;
+	cm_node->accelerated = true;
 	status =
 		i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_ESTABLISHED, 0);
 	if (status)
@@ -3815,9 +3820,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 		    __func__, cm_id->tos, cm_info.user_pri);
 	cm_id->add_ref(cm_id);
 	cm_node = i40iw_create_cm_node(&iwdev->cm_core, iwdev,
-				       conn_param->private_data_len,
-				       (void *)conn_param->private_data,
-				       &cm_info);
+				       conn_param, &cm_info);
 
 	if (IS_ERR(cm_node)) {
 		ret = PTR_ERR(cm_node);
@@ -3849,11 +3852,6 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 	}
 
 	cm_node->apbvt_set = true;
-	i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
-	if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO &&
-	    !cm_node->ord_size)
-		cm_node->ord_size = 1;
-
 	iwqp->cm_node = cm_node;
 	cm_node->iwqp = iwqp;
 	iwqp->cm_id = cm_id;
@@ -4058,7 +4056,7 @@ static void i40iw_cm_event_connected(struct i40iw_cm_event *event)
 	cm_node->qhash_set = false;
 	i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL);
 
-	cm_node->accelerated = 1;
+	cm_node->accelerated = true;
 	status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_CONNECT_REPLY,
 				     0);
 	if (status)
@@ -4242,10 +4240,16 @@ set_qhash:
 }
 }
 
 
 /**
 /**
- * i40iw_cm_disconnect_all - disconnect all connected qp's
+ * i40iw_cm_teardown_connections - teardown QPs
  * @iwdev: device pointer
  * @iwdev: device pointer
+ * @ipaddr: Pointer to IPv4 or IPv6 address
+ * @ipv4: flag indicating IPv4 when true
+ * @disconnect_all: flag indicating disconnect all QPs
+ * teardown QPs where source or destination addr matches ip addr
  */
  */
-void i40iw_cm_disconnect_all(struct i40iw_device *iwdev)
+void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr,
+				   struct i40iw_cm_info *nfo,
+				   bool disconnect_all)
 {
 {
 	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
 	struct i40iw_cm_core *cm_core = &iwdev->cm_core;
 	struct list_head *list_core_temp;
 	struct list_head *list_core_temp;
@@ -4259,8 +4263,13 @@ void i40iw_cm_disconnect_all(struct i40iw_device *iwdev)
 	spin_lock_irqsave(&cm_core->ht_lock, flags);
 	spin_lock_irqsave(&cm_core->ht_lock, flags);
 	list_for_each_safe(list_node, list_core_temp, &cm_core->connected_nodes) {
 	list_for_each_safe(list_node, list_core_temp, &cm_core->connected_nodes) {
 		cm_node = container_of(list_node, struct i40iw_cm_node, list);
 		cm_node = container_of(list_node, struct i40iw_cm_node, list);
-		atomic_inc(&cm_node->ref_count);
-		list_add(&cm_node->connected_entry, &connected_list);
+		if (disconnect_all ||
+		    (nfo->vlan_id == cm_node->vlan_id &&
+		    (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) ||
+		     !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 4 : 16)))) {
+			atomic_inc(&cm_node->ref_count);
+			list_add(&cm_node->connected_entry, &connected_list);
+		}
 	}
 	}
 	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
 	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
 
 
@@ -4294,6 +4303,9 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev,
 	enum i40iw_quad_hash_manage_type op =
 	enum i40iw_quad_hash_manage_type op =
 		ifup ? I40IW_QHASH_MANAGE_TYPE_ADD : I40IW_QHASH_MANAGE_TYPE_DELETE;
 		ifup ? I40IW_QHASH_MANAGE_TYPE_ADD : I40IW_QHASH_MANAGE_TYPE_DELETE;
 
 
+	nfo.vlan_id = vlan_id;
+	nfo.ipv4 = ipv4;
+
 	/* Disable or enable qhash for listeners */
 	/* Disable or enable qhash for listeners */
 	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
 	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
 	list_for_each_entry(listen_node, &cm_core->listen_nodes, list) {
 	list_for_each_entry(listen_node, &cm_core->listen_nodes, list) {
@@ -4303,8 +4315,6 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev,
 			memcpy(nfo.loc_addr, listen_node->loc_addr,
 			       sizeof(nfo.loc_addr));
 			nfo.loc_port = listen_node->loc_port;
-			nfo.ipv4 = listen_node->ipv4;
-			nfo.vlan_id = listen_node->vlan_id;
 			nfo.user_pri = listen_node->user_pri;
 			if (!list_empty(&listen_node->child_listen_list)) {
 				i40iw_qhash_ctrl(iwdev,
@@ -4326,7 +4336,7 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev,
 	}
 	spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
 
-	/* disconnect any connected qp's on ifdown */
+	/* teardown connected qp's on ifdown */
 	if (!ifup)
-		i40iw_cm_disconnect_all(iwdev);
+		i40iw_cm_teardown_connections(iwdev, ipaddr, &nfo, false);
 }
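
The selection logic added to i40iw_cm_teardown_connections() reduces to one predicate per connected node: take everything when disconnect_all is set, otherwise only nodes on the matching VLAN whose local or remote address equals the given IP. A standalone C sketch of that predicate (struct cm_node_sketch is a simplified stand-in for struct i40iw_cm_node, not the driver's type):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct cm_node_sketch {
	uint32_t loc_addr[4];   /* word 0 for IPv4, all four words for IPv6 */
	uint32_t rem_addr[4];
	uint16_t vlan_id;
};

/* Mirrors the list-walk filter above: the memcmp length is 4 bytes for
 * IPv4 and 16 for IPv6, exactly as in the diff. */
static bool node_matches(const struct cm_node_sketch *node,
			 const uint32_t *ipaddr, uint16_t vlan_id,
			 bool ipv4, bool disconnect_all)
{
	size_t len = ipv4 ? 4 : 16;

	if (disconnect_all)
		return true;
	return vlan_id == node->vlan_id &&
	       (!memcmp(node->loc_addr, ipaddr, len) ||
		!memcmp(node->rem_addr, ipaddr, len));
}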

+ 4 - 4
drivers/infiniband/hw/i40iw/i40iw_cm.h

@@ -276,8 +276,6 @@ struct i40iw_cm_tcp_context {
 	u32 mss;
 	u8 snd_wscale;
 	u8 rcv_wscale;
-
-	struct timeval sent_ts;
 };
 
 enum i40iw_cm_listener_state {
@@ -337,7 +335,7 @@ struct i40iw_cm_node {
 	u16     mpav2_ird_ord;
 	struct iw_cm_id *cm_id;
 	struct list_head list;
-	int accelerated;
+	bool accelerated;
 	struct i40iw_cm_listener *listener;
 	int apbvt_set;
 	int accept_pend;
@@ -455,5 +453,7 @@ int i40iw_arp_table(struct i40iw_device *iwdev,
 
 void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev,
 		     u32 *ipaddr, bool ipv4, bool ifup);
-void i40iw_cm_disconnect_all(struct i40iw_device *iwdev);
+void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr,
+				   struct i40iw_cm_info *nfo,
+				   bool disconnect_all);
 #endif /* I40IW_CM_H */

+ 8 - 17
drivers/infiniband/hw/i40iw/i40iw_ctrl.c

@@ -1893,8 +1893,6 @@ static enum i40iw_status_code i40iw_sc_get_next_aeqe(struct i40iw_sc_aeq *aeq,
 static enum i40iw_status_code i40iw_sc_repost_aeq_entries(struct i40iw_sc_dev *dev,
 							  u32 count)
 {
-	if (count > I40IW_MAX_AEQ_ALLOCATE_COUNT)
-		return I40IW_ERR_INVALID_SIZE;
 
 	if (dev->is_pf)
 		i40iw_wr32(dev->hw, I40E_PFPE_AEQALLOC, count);
@@ -3872,7 +3870,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
 	struct i40iw_virt_mem virt_mem;
 	u32 i, mem_size;
 	u32 qpwantedoriginal, qpwanted, mrwanted, pblewanted;
-	u32 powerof2;
 	u64 sd_needed;
 	u32 loop_count = 0;
 
@@ -3928,8 +3925,10 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
 		hmc_info->hmc_obj[I40IW_HMC_IW_APBVT_ENTRY].cnt = 1;
 		hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt = mrwanted;
 
-		hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt = I40IW_MAX_WQ_ENTRIES * qpwanted;
-		hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt = 4 * I40IW_MAX_IRD_SIZE * qpwanted;
+		hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt =
+			roundup_pow_of_two(I40IW_MAX_WQ_ENTRIES * qpwanted);
+		hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt =
+			roundup_pow_of_two(2 * I40IW_MAX_IRD_SIZE * qpwanted);
 		hmc_info->hmc_obj[I40IW_HMC_IW_XFFL].cnt =
 			hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt / hmc_fpm_misc->xf_block_size;
 		hmc_info->hmc_obj[I40IW_HMC_IW_Q1FL].cnt =
@@ -3945,16 +3944,10 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
 		if ((loop_count > 1000) ||
 		    ((!(loop_count % 10)) &&
 		    (qpwanted > qpwantedoriginal * 2 / 3))) {
-			if (qpwanted > FPM_MULTIPLIER) {
-				qpwanted -= FPM_MULTIPLIER;
-				powerof2 = 1;
-				while (powerof2 < qpwanted)
-					powerof2 *= 2;
-				powerof2 /= 2;
-				qpwanted = powerof2;
-			} else {
-				qpwanted /= 2;
-			}
+			if (qpwanted > FPM_MULTIPLIER)
+				qpwanted = roundup_pow_of_two(qpwanted -
+							      FPM_MULTIPLIER);
+			qpwanted >>= 1;
 		}
 		if (mrwanted > FPM_MULTIPLIER * 10)
 			mrwanted -= FPM_MULTIPLIER * 10;
@@ -3962,8 +3955,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
 			pblewanted -= FPM_MULTIPLIER * 1000;
 	} while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000);
 
-	sd_needed = i40iw_est_sd(dev, hmc_info);
-
 	i40iw_debug(dev, I40IW_DEBUG_HMC,
 		    "loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n",
 		    loop_count, sd_needed,
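
The removed powerof2 loop above (and i40iw_qp_round_up() in i40iw_uk.c further down) are both replaced by the kernel's roundup_pow_of_two() from <linux/log2.h>. A userspace sketch of what that helper computes for the 32-bit sizes used here (the kernel macro folds to a constant where possible via fls_long(n - 1)):

#include <stdint.h>

/* Smallest power of two >= n, for 0 < n <= 2^31; inputs outside that
 * range are not relied on by the sizing code above. */
static uint32_t roundup_pow_of_two_sketch(uint32_t n)
{
	uint32_t v = n - 1;

	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;
	return v + 1;
}

For example, the Q1 sizing above becomes roundup_pow_of_two(2 * I40IW_MAX_IRD_SIZE * qpwanted); see also the I40IW_MAX_IRD_SIZE change from 63 to 64 in i40iw_user.h below.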

+ 1 - 0
drivers/infiniband/hw/i40iw/i40iw_d.h

@@ -97,6 +97,7 @@
 #define RDMA_OPCODE_MASK        0x0f
 #define RDMA_READ_REQ_OPCODE    1
 #define Q2_BAD_FRAME_OFFSET     72
+#define Q2_FPSN_OFFSET          64
 #define CQE_MAJOR_DRV           0x8000
 
 #define I40IW_TERM_SENT 0x01

+ 2 - 1
drivers/infiniband/hw/i40iw/i40iw_hw.c

@@ -385,6 +385,8 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
 				iwcq->ibcq.event_handler(&ibevent, iwcq->ibcq.cq_context);
 			}
 			break;
+		case I40IW_AE_LLP_DOUBT_REACHABILITY:
+			break;
 		case I40IW_AE_PRIV_OPERATION_DENIED:
 		case I40IW_AE_STAG_ZERO_INVALID:
 		case I40IW_AE_IB_RREQ_AND_Q1_FULL:
@@ -403,7 +405,6 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
 		case I40IW_AE_LLP_SEGMENT_TOO_SMALL:
 		case I40IW_AE_LLP_SYN_RECEIVED:
 		case I40IW_AE_LLP_TOO_MANY_RETRIES:
-		case I40IW_AE_LLP_DOUBT_REACHABILITY:
 		case I40IW_AE_LCE_QP_CATASTROPHIC:
 		case I40IW_AE_LCE_FUNCTION_CATASTROPHIC:
 		case I40IW_AE_LCE_CQ_CATASTROPHIC:

+ 10 - 3
drivers/infiniband/hw/i40iw/i40iw_main.c

@@ -99,6 +99,10 @@ static struct notifier_block i40iw_net_notifier = {
 	.notifier_call = i40iw_net_event
 };
 
+static struct notifier_block i40iw_netdevice_notifier = {
+	.notifier_call = i40iw_netdevice_event
+};
+
 /**
  * i40iw_find_i40e_handler - find a handler given a client info
  * @ldev: pointer to a client info
@@ -483,6 +487,7 @@ static enum i40iw_status_code i40iw_create_hmc_objs(struct i40iw_device *iwdev,
 	for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++) {
 		info.rsrc_type = iw_hmc_obj_types[i];
 		info.count = dev->hmc_info->hmc_obj[info.rsrc_type].cnt;
+		info.add_sd_cnt = 0;
 		status = i40iw_create_hmc_obj_type(dev, &info);
 		if (status) {
 			i40iw_pr_err("create obj type %d status = %d\n",
@@ -607,7 +612,7 @@ static enum i40iw_status_code i40iw_create_cqp(struct i40iw_device *iwdev)
 	INIT_LIST_HEAD(&cqp->cqp_avail_reqs);
 	INIT_LIST_HEAD(&cqp->cqp_pending_reqs);
 	/* init the waitq of the cqp_requests and add them to the list */
-	for (i = 0; i < I40IW_CQP_SW_SQSIZE_2048; i++) {
+	for (i = 0; i < sqsize; i++) {
 		init_waitqueue_head(&cqp->cqp_requests[i].waitq);
 		list_add_tail(&cqp->cqp_requests[i].list, &cqp->cqp_avail_reqs);
 	}
@@ -1285,7 +1290,7 @@ static void i40iw_wait_pe_ready(struct i40iw_hw *hw)
 			      __LINE__, statuscpu2);
 		if ((statuscpu0 == 0x80) && (statuscpu1 == 0x80) && (statuscpu2 == 0x80))
 			break;	/* SUCCESS */
-		mdelay(1000);
+		msleep(1000);
 		retrycount++;
 	} while (retrycount < 14);
 	i40iw_wr32(hw, 0xb4040, 0x4C104C5);
@@ -1393,6 +1398,7 @@ static void i40iw_register_notifiers(void)
 	register_inetaddr_notifier(&i40iw_inetaddr_notifier);
 	register_inet6addr_notifier(&i40iw_inetaddr6_notifier);
 	register_netevent_notifier(&i40iw_net_notifier);
+	register_netdevice_notifier(&i40iw_netdevice_notifier);
 }
 
 /**
@@ -1404,6 +1410,7 @@ static void i40iw_unregister_notifiers(void)
 	unregister_netevent_notifier(&i40iw_net_notifier);
 	unregister_inetaddr_notifier(&i40iw_inetaddr_notifier);
 	unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier);
+	unregister_netdevice_notifier(&i40iw_netdevice_notifier);
 }
 
 /**
@@ -1793,7 +1800,7 @@ static void i40iw_close(struct i40e_info *ldev, struct i40e_client *client, bool
 	if (reset)
 		iwdev->reset = true;
 
-	i40iw_cm_disconnect_all(iwdev);
+	i40iw_cm_teardown_connections(iwdev, NULL, NULL, true);
 	destroy_workqueue(iwdev->virtchnl_wq);
 	i40iw_deinit_device(iwdev);
 }
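
The i40iw_netdevice_notifier added above follows the standard netdevice notifier pattern: register one callback at init, have it cheaply ignore devices it does not own, and return NOTIFY_DONE. A self-contained module sketch of that pattern (the demo_* names are illustrative, not the driver's):

#include <linux/module.h>
#include <linux/netdevice.h>

static int demo_netdevice_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);

	/* Runs for every netdev in the system; filter early. */
	if (event == NETDEV_DOWN)
		pr_info("demo: %s went down\n", ndev->name);
	return NOTIFY_DONE;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_netdevice_event,
};

static int __init demo_init(void)
{
	return register_netdevice_notifier(&demo_nb);
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

In i40iw_netdevice_event() itself (shown in full in the i40iw_utils.c hunk below), NETDEV_DOWN clears iw_status and deliberately falls through to the NETDEV_UP case so both paths dispatch a port event.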

+ 2 - 3
drivers/infiniband/hw/i40iw/i40iw_puda.c

@@ -48,7 +48,6 @@ static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid);
 static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx);
 static enum i40iw_status_code i40iw_puda_replenish_rq(struct i40iw_puda_rsrc
 						      *rsrc, bool initial);
-static void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp);
 /**
  * i40iw_puda_get_listbuf - get buffer from puda list
  * @list: list to use for buffers (ILQ or IEQ)
@@ -1378,7 +1377,7 @@ static void i40iw_ieq_handle_exception(struct i40iw_puda_rsrc *ieq,
 	u32 *hw_host_ctx = (u32 *)qp->hw_host_ctx;
 	u32 rcv_wnd = hw_host_ctx[23];
 	/* first partial seq # in q2 */
-	u32 fps = qp->q2_buf[16];
+	u32 fps = *(u32 *)(qp->q2_buf + Q2_FPSN_OFFSET);
 	struct list_head *rxlist = &pfpdu->rxlist;
 	struct list_head *plist;
 
@@ -1483,7 +1482,7 @@ static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid)
  * @ieq: ieq resource
  * @qp: all pending fpdu buffers
  */
-static void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp)
+void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp)
 {
 	struct i40iw_puda_buf *buf;
 	struct i40iw_pfpdu *pfpdu = &qp->pfpdu;
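
The fps fix above replaces an element-indexed read, qp->q2_buf[16], with a byte-offset read at Q2_FPSN_OFFSET (64, defined in i40iw_d.h above). Assuming q2_buf is byte-typed, indexing by 16 read byte 16 rather than the FPSN field at byte 64; casting after adding a byte offset makes the units explicit. A standalone sketch:

#include <stdint.h>

#define Q2_FPSN_OFFSET 64	/* bytes, from i40iw_d.h */

/* On a byte-typed buffer, buf[16] is byte 16; the fixed form always
 * lands on byte 64 regardless of the buffer's element type. */
static uint32_t read_fpsn(const uint8_t *q2_buf)
{
	return *(const uint32_t *)(q2_buf + Q2_FPSN_OFFSET);
}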

+ 1 - 0
drivers/infiniband/hw/i40iw/i40iw_puda.h

@@ -184,4 +184,5 @@ enum i40iw_status_code i40iw_cqp_qp_create_cmd(struct i40iw_sc_dev *dev, struct
 enum i40iw_status_code i40iw_cqp_cq_create_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq);
 void i40iw_cqp_qp_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp);
 void i40iw_cqp_cq_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq);
+void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp);
 #endif

+ 2 - 16
drivers/infiniband/hw/i40iw/i40iw_uk.c

@@ -893,20 +893,6 @@ exit:
 	return ret_code;
 }
 
-/**
- * i40iw_qp_roundup - return round up QP WQ depth
- * @wqdepth: WQ depth in quantas to round up
- */
-static int i40iw_qp_round_up(u32 wqdepth)
-{
-	int scount = 1;
-
-	for (wqdepth--; scount <= 16; scount *= 2)
-		wqdepth |= wqdepth >> scount;
-
-	return ++wqdepth;
-}
-
 /**
  * i40iw_get_wqe_shift - get shift count for maximum wqe size
  * @sge: Maximum Scatter Gather Elements wqe
@@ -934,7 +920,7 @@ void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift)
  */
 enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth)
 {
-	*sqdepth = i40iw_qp_round_up((sq_size << shift) + I40IW_SQ_RSVD);
+	*sqdepth = roundup_pow_of_two((sq_size << shift) + I40IW_SQ_RSVD);
 
 	if (*sqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift))
 		*sqdepth = I40IW_QP_SW_MIN_WQSIZE << shift;
@@ -953,7 +939,7 @@ enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth)
  */
 enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth)
 {
-	*rqdepth = i40iw_qp_round_up((rq_size << shift) + I40IW_RQ_RSVD);
+	*rqdepth = roundup_pow_of_two((rq_size << shift) + I40IW_RQ_RSVD);
 
 	if (*rqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift))
 		*rqdepth = I40IW_QP_SW_MIN_WQSIZE << shift;

+ 1 - 2
drivers/infiniband/hw/i40iw/i40iw_user.h

@@ -59,7 +59,6 @@ enum i40iw_device_capabilities_const {
 	I40IW_MAX_CEQ_ENTRIES =			131071,
 	I40IW_MIN_CQ_SIZE =			1,
 	I40IW_MAX_CQ_SIZE =			1048575,
-	I40IW_MAX_AEQ_ALLOCATE_COUNT =		255,
 	I40IW_DB_ID_ZERO =			0,
 	I40IW_MAX_WQ_FRAGMENT_COUNT =		3,
 	I40IW_MAX_SGE_RD =			1,
@@ -72,7 +71,7 @@ enum i40iw_device_capabilities_const {
 	I40IW_MAX_SQ_PAYLOAD_SIZE =		2145386496,
 	I40IW_MAX_INLINE_DATA_SIZE =		48,
 	I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE =	48,
-	I40IW_MAX_IRD_SIZE =			63,
+	I40IW_MAX_IRD_SIZE =			64,
 	I40IW_MAX_ORD_SIZE =			127,
 	I40IW_MAX_WQ_ENTRIES =			2048,
 	I40IW_Q2_BUFFER_SIZE =			(248 + 100),

+ 47 - 3
drivers/infiniband/hw/i40iw/i40iw_utils.c

@@ -137,7 +137,7 @@ inline u32 i40iw_rd32(struct i40iw_hw *hw, u32 reg)
 }
 
 /**
- * i40iw_inetaddr_event - system notifier for netdev events
+ * i40iw_inetaddr_event - system notifier for ipv4 addr events
  * @notfier: not used
  * @event: event for notifier
  * @ptr: if address
@@ -200,7 +200,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
 }
 
 /**
- * i40iw_inet6addr_event - system notifier for ipv6 netdev events
+ * i40iw_inet6addr_event - system notifier for ipv6 addr events
  * @notfier: not used
  * @event: event for notifier
  * @ptr: if address
@@ -252,7 +252,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier,
 }
 
 /**
- * i40iw_net_event - system notifier for net events
+ * i40iw_net_event - system notifier for netevents
  * @notfier: not used
  * @event: event for notifier
  * @ptr: neighbor
@@ -296,6 +296,50 @@ int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void *
 	return NOTIFY_DONE;
 }
 
+/**
+ * i40iw_netdevice_event - system notifier for netdev events
+ * @notfier: not used
+ * @event: event for notifier
+ * @ptr: netdev
+ */
+int i40iw_netdevice_event(struct notifier_block *notifier,
+			  unsigned long event,
+			  void *ptr)
+{
+	struct net_device *event_netdev;
+	struct net_device *netdev;
+	struct i40iw_device *iwdev;
+	struct i40iw_handler *hdl;
+
+	event_netdev = netdev_notifier_info_to_dev(ptr);
+
+	hdl = i40iw_find_netdev(event_netdev);
+	if (!hdl)
+		return NOTIFY_DONE;
+
+	iwdev = &hdl->device;
+	if (iwdev->init_state < RDMA_DEV_REGISTERED || iwdev->closing)
+		return NOTIFY_DONE;
+
+	netdev = iwdev->ldev->netdev;
+	if (netdev != event_netdev)
+		return NOTIFY_DONE;
+
+	iwdev->iw_status = 1;
+
+	switch (event) {
+	case NETDEV_DOWN:
+		iwdev->iw_status = 0;
+		/* Fall through */
+	case NETDEV_UP:
+		i40iw_port_ibevent(iwdev);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
 /**
  * i40iw_get_cqp_request - get cqp struct
  * @cqp: device cqp ptr

+ 2 - 3
drivers/infiniband/hw/i40iw/i40iw_verbs.c

@@ -412,6 +412,7 @@ void i40iw_free_qp_resources(struct i40iw_device *iwdev,
 {
 	struct i40iw_pbl *iwpbl = &iwqp->iwpbl;
 
+	i40iw_ieq_cleanup_qp(iwdev->vsi.ieq, &iwqp->sc_qp);
 	i40iw_dealloc_push_page(iwdev, &iwqp->sc_qp);
 	if (qp_num)
 		i40iw_free_resource(iwdev, iwdev->allocated_qps, qp_num);
@@ -1637,6 +1638,7 @@ static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd,
 		err_code = -EOVERFLOW;
 		goto err;
 	}
+	stag &= ~I40IW_CQPSQ_STAG_KEY_MASK;
 	iwmr->stag = stag;
 	iwmr->ibmr.rkey = stag;
 	iwmr->ibmr.lkey = stag;
@@ -2242,14 +2244,12 @@ static int i40iw_post_send(struct ib_qp *ibqp,
 				info.op.inline_rdma_write.len = ib_wr->sg_list[0].length;
 				info.op.inline_rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr;
 				info.op.inline_rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey;
-				info.op.inline_rdma_write.rem_addr.len = ib_wr->sg_list->length;
 				ret = ukqp->ops.iw_inline_rdma_write(ukqp, &info, false);
 			} else {
 				info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list;
 				info.op.rdma_write.num_lo_sges = ib_wr->num_sge;
 				info.op.rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr;
 				info.op.rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey;
-				info.op.rdma_write.rem_addr.len = ib_wr->sg_list->length;
 				ret = ukqp->ops.iw_rdma_write(ukqp, &info, false);
 			}
 
@@ -2271,7 +2271,6 @@ static int i40iw_post_send(struct ib_qp *ibqp,
 			info.op_type = I40IW_OP_TYPE_RDMA_READ;
 			info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr;
 			info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey;
-			info.op.rdma_read.rem_addr.len = ib_wr->sg_list->length;
 			info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr;
 			info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey;
 			info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length;
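
The new line in i40iw_alloc_mr(), stag &= ~I40IW_CQPSQ_STAG_KEY_MASK, strips the key portion of the STag before it is stored as lkey/rkey. In iWARP an STag is an index plus a consumer-owned key byte; clearing the key keeps only the index bits. A sketch of that split, assuming the mask covers the low byte (the driver header defines the authoritative value):

#include <stdint.h>

#define STAG_KEY_MASK 0xffu	/* assumed low-byte key; see the i40iw headers */

static uint32_t stag_index_only(uint32_t stag)
{
	return stag & ~STAG_KEY_MASK;	/* drop the key, keep the index */
}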

+ 2 - 2
drivers/infiniband/hw/mlx4/cq.c

@@ -170,7 +170,7 @@ err_buf:
 	return err;
 }
 
-#define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_FLAGS_TIMESTAMP_COMPLETION
+#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
 struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
 				const struct ib_cq_init_attr *attr,
 				struct ib_ucontext *context,
@@ -246,7 +246,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
 
 	err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
 			    cq->db.dma, &cq->mcq, vector, 0,
-			    !!(cq->create_flags & IB_CQ_FLAGS_TIMESTAMP_COMPLETION));
+			    !!(cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION));
 	if (err)
 		goto err_dbmap;
 

+ 11 - 8
drivers/infiniband/hw/mlx4/main.c

@@ -589,6 +589,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 		if (props->rss_caps.supported_qpts) {
 			resp.rss_caps.rx_hash_function =
 				MLX4_IB_RX_HASH_FUNC_TOEPLITZ;
+
 			resp.rss_caps.rx_hash_fields_mask =
 				MLX4_IB_RX_HASH_SRC_IPV4 |
 				MLX4_IB_RX_HASH_DST_IPV4 |
@@ -598,6 +599,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 				MLX4_IB_RX_HASH_DST_PORT_TCP |
 				MLX4_IB_RX_HASH_SRC_PORT_UDP |
 				MLX4_IB_RX_HASH_DST_PORT_UDP;
+
+			if (dev->dev->caps.tunnel_offload_mode ==
+			    MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+				resp.rss_caps.rx_hash_fields_mask |=
+					MLX4_IB_RX_HASH_INNER;
 		}
 	}
 
@@ -2995,9 +3001,8 @@ err_steer_free_bitmap:
 	kfree(ibdev->ib_uc_qpns_bitmap);
 
 err_steer_qp_release:
-	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
-		mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
-				      ibdev->steer_qpn_count);
+	mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+			      ibdev->steer_qpn_count);
 err_counter:
 	for (i = 0; i < ibdev->num_ports; ++i)
 		mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[i]);
@@ -3102,11 +3107,9 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 		ibdev->iboe.nb.notifier_call = NULL;
 	}
 
-	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) {
-		mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
-				      ibdev->steer_qpn_count);
-		kfree(ibdev->ib_uc_qpns_bitmap);
-	}
+	mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+			      ibdev->steer_qpn_count);
+	kfree(ibdev->ib_uc_qpns_bitmap);
 
 	iounmap(ibdev->uar_map);
 	for (p = 0; p < ibdev->num_ports; ++p)

+ 18 - 2
drivers/infiniband/hw/mlx4/qp.c

@@ -734,10 +734,24 @@ static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx,
 		return (-EOPNOTSUPP);
 	}
 
+	if (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_INNER) {
+		if (dev->dev->caps.tunnel_offload_mode ==
+		    MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+			/*
+			 * Hash according to inner headers if exist, otherwise
+			 * according to outer headers.
+			 */
+			rss_ctx->flags |= MLX4_RSS_BY_INNER_HEADERS_IPONLY;
+		} else {
+			pr_debug("RSS Hash for inner headers isn't supported\n");
+			return (-EOPNOTSUPP);
+		}
+	}
+
 	return 0;
 }
 
-static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_pd *ibpd,
+static int create_qp_rss(struct mlx4_ib_dev *dev,
 			 struct ib_qp_init_attr *init_attr,
 			 struct mlx4_ib_create_qp_rss *ucmd,
 			 struct mlx4_ib_qp *qp)
@@ -860,7 +874,7 @@ static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd,
 	qp->pri.vid = 0xFFFF;
 	qp->alt.vid = 0xFFFF;
 
-	err = create_qp_rss(to_mdev(pd->device), pd, init_attr, &ucmd, qp);
+	err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp);
 	if (err) {
 		kfree(qp);
 		return ERR_PTR(err);
@@ -1836,6 +1850,8 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev,
 			mlx4_ib_gid_index_to_real_index(dev, port,
 							grh->sgid_index);
 
+		if (real_sgid_index < 0)
+			return real_sgid_index;
 		if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
 			pr_err("sgid_index (%u) too large. max is %d\n",
 			       real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
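
The mlx4 hunks above advertise MLX4_IB_RX_HASH_INNER only when the device is in VXLAN tunnel-offload mode, and set_qp_rss() maps the request to MLX4_RSS_BY_INNER_HEADERS_IPONLY. The kernel-side validation reduces to a capability gate; a hedged standalone sketch (RX_HASH_INNER here is an illustrative bit, not the mlx4 uapi value):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

#define RX_HASH_INNER (1ULL << 31)	/* illustrative, not the uapi bit */

/* Inner-header hashing is honoured only with VXLAN tunnel offload;
 * otherwise the create request fails, as in set_qp_rss() above. */
static int check_inner_hash(uint64_t rx_hash_fields_mask, bool vxlan_offload)
{
	if (!(rx_hash_fields_mask & RX_HASH_INNER))
		return 0;
	return vxlan_offload ? 0 : -EOPNOTSUPP;
}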

+ 59 - 24
drivers/infiniband/hw/mlx5/cong.c

@@ -247,21 +247,30 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset,
 	}
 }
 
-static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var)
+static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u8 port_num,
+				 int offset, u32 *var)
 {
 	int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out);
 	void *out;
 	void *field;
 	int err;
 	enum mlx5_ib_cong_node_type node;
+	struct mlx5_core_dev *mdev;
+
+	/* Takes a 1-based port number */
+	mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL);
+	if (!mdev)
+		return -ENODEV;
 
 	out = kvzalloc(outlen, GFP_KERNEL);
-	if (!out)
-		return -ENOMEM;
+	if (!out) {
+		err = -ENOMEM;
+		goto alloc_err;
+	}
 
 	node = mlx5_ib_param_to_node(offset);
 
-	err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen);
+	err = mlx5_cmd_query_cong_params(mdev, node, out, outlen);
 	if (err)
 		goto free;
 
@@ -270,21 +279,32 @@ static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var)
 
 free:
 	kvfree(out);
+alloc_err:
+	mlx5_ib_put_native_port_mdev(dev, port_num + 1);
 	return err;
 }
 
-static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var)
+static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u8 port_num,
+				 int offset, u32 var)
 {
 	int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in);
 	void *in;
 	void *field;
 	enum mlx5_ib_cong_node_type node;
+	struct mlx5_core_dev *mdev;
 	u32 attr_mask = 0;
 	int err;
 
+	/* Takes a 1-based port number */
+	mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL);
+	if (!mdev)
+		return -ENODEV;
+
 	in = kvzalloc(inlen, GFP_KERNEL);
-	if (!in)
-		return -ENOMEM;
+	if (!in) {
+		err = -ENOMEM;
+		goto alloc_err;
+	}
 
 	MLX5_SET(modify_cong_params_in, in, opcode,
 		 MLX5_CMD_OP_MODIFY_CONG_PARAMS);
@@ -299,8 +319,10 @@ static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var)
 	MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp,
 		 attr_mask);
 
-	err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen);
+	err = mlx5_cmd_modify_cong_params(mdev, in, inlen);
 	kvfree(in);
+alloc_err:
+	mlx5_ib_put_native_port_mdev(dev, port_num + 1);
 	return err;
 }
 
@@ -324,7 +346,7 @@ static ssize_t set_param(struct file *filp, const char __user *buf,
 	if (kstrtou32(lbuf, 0, &var))
 		return -EINVAL;
 
-	ret = mlx5_ib_set_cc_params(param->dev, offset, var);
+	ret = mlx5_ib_set_cc_params(param->dev, param->port_num, offset, var);
 	return ret ? ret : count;
 }
 
@@ -340,7 +362,7 @@ static ssize_t get_param(struct file *filp, char __user *buf, size_t count,
 	if (*pos)
 		return 0;
 
-	ret = mlx5_ib_get_cc_params(param->dev, offset, &var);
+	ret = mlx5_ib_get_cc_params(param->dev, param->port_num, offset, &var);
 	if (ret)
 		return ret;
 
@@ -362,44 +384,51 @@ static const struct file_operations dbg_cc_fops = {
 	.read	= get_param,
 };
 
-void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev)
+void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
 {
 	if (!mlx5_debugfs_root ||
-	    !dev->dbg_cc_params ||
-	    !dev->dbg_cc_params->root)
+	    !dev->port[port_num].dbg_cc_params ||
+	    !dev->port[port_num].dbg_cc_params->root)
 		return;
 
-	debugfs_remove_recursive(dev->dbg_cc_params->root);
-	kfree(dev->dbg_cc_params);
-	dev->dbg_cc_params = NULL;
+	debugfs_remove_recursive(dev->port[port_num].dbg_cc_params->root);
+	kfree(dev->port[port_num].dbg_cc_params);
+	dev->port[port_num].dbg_cc_params = NULL;
 }
 
-int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev)
+int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
 {
 	struct mlx5_ib_dbg_cc_params *dbg_cc_params;
+	struct mlx5_core_dev *mdev;
 	int i;
 
 	if (!mlx5_debugfs_root)
 		goto out;
 
-	if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed) ||
-	    !MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
+	/* Takes a 1-based port number */
+	mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL);
+	if (!mdev)
 		goto out;
 
+	if (!MLX5_CAP_GEN(mdev, cc_query_allowed) ||
+	    !MLX5_CAP_GEN(mdev, cc_modify_allowed))
+		goto put_mdev;
+
 	dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL);
 	if (!dbg_cc_params)
-		goto out;
+		goto err;
 
-	dev->dbg_cc_params = dbg_cc_params;
+	dev->port[port_num].dbg_cc_params = dbg_cc_params;
 
 	dbg_cc_params->root = debugfs_create_dir("cc_params",
-						 dev->mdev->priv.dbg_root);
+						 mdev->priv.dbg_root);
 	if (!dbg_cc_params->root)
 		goto err;
 
 	for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) {
 		dbg_cc_params->params[i].offset = i;
 		dbg_cc_params->params[i].dev = dev;
+		dbg_cc_params->params[i].port_num = port_num;
 		dbg_cc_params->params[i].dentry =
 			debugfs_create_file(mlx5_ib_dbg_cc_name[i],
 					    0600, dbg_cc_params->root,
@@ -408,11 +437,17 @@ int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev)
 		if (!dbg_cc_params->params[i].dentry)
 			goto err;
 	}
-out:	return 0;
+
+put_mdev:
+	mlx5_ib_put_native_port_mdev(dev, port_num + 1);
+out:
+	return 0;
 
 err:
 	mlx5_ib_warn(dev, "cong debugfs failure\n");
-	mlx5_ib_cleanup_cong_debugfs(dev);
+	mlx5_ib_cleanup_cong_debugfs(dev, port_num);
+	mlx5_ib_put_native_port_mdev(dev, port_num + 1);
+
 	/*
 	 * We don't want to fail driver if debugfs failed to initialize,
 	 * so we are not forwarding error to the user.
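
Every cong.c change above follows the same shape introduced by the multiport work: resolve the port's owning mlx5_core_dev via mlx5_ib_get_native_port_mdev() (which refcounts slave devices so they cannot be unaffiliated mid-operation), do the firmware access, and drop the reference on every exit path. A schematic of that pattern in kernel-style C (the body is illustrative; only the get/put calls come from the diff):

/* Kernel-context sketch; assumes the mlx5_ib driver's own headers. */
static int do_port_op(struct mlx5_ib_dev *dev, u8 port_num)
{
	struct mlx5_core_dev *mdev;
	int err = 0;

	/* The debugfs code above uses 0-based port numbers, hence + 1. */
	mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL);
	if (!mdev)
		return -ENODEV;		/* port not affiliated yet */

	/* ... issue the command against mdev, setting err ... */

	mlx5_ib_put_native_port_mdev(dev, port_num + 1);	/* always paired */
	return err;
}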

+ 1 - 1
drivers/infiniband/hw/mlx5/cq.c

@@ -1010,7 +1010,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 	MLX5_SET(cqc, cqc, uar_page, index);
 	MLX5_SET(cqc, cqc, c_eqn, eqn);
 	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
-	if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN)
+	if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
 		MLX5_SET(cqc, cqc, oi, 1);
 
 	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen);

+ 15 - 8
drivers/infiniband/hw/mlx5/mad.c

@@ -197,10 +197,9 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt,
 			     vl_15_dropped);
 }
 
-static int process_pma_cmd(struct ib_device *ibdev, u8 port_num,
+static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num,
 			   const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	int err;
 	void *out_cnt;
 
@@ -222,7 +221,7 @@ static int process_pma_cmd(struct ib_device *ibdev, u8 port_num,
 		if (!out_cnt)
 			return IB_MAD_RESULT_FAILURE;
 
-		err = mlx5_core_query_vport_counter(dev->mdev, 0, 0,
+		err = mlx5_core_query_vport_counter(mdev, 0, 0,
 						    port_num, out_cnt, sz);
 		if (!err)
 			pma_cnt_ext_assign(pma_cnt_ext, out_cnt);
@@ -235,7 +234,7 @@ static int process_pma_cmd(struct ib_device *ibdev, u8 port_num,
 		if (!out_cnt)
 			return IB_MAD_RESULT_FAILURE;
 
-		err = mlx5_core_query_ib_ppcnt(dev->mdev, port_num,
+		err = mlx5_core_query_ib_ppcnt(mdev, port_num,
 					       out_cnt, sz);
 		if (!err)
 			pma_cnt_assign(pma_cnt, out_cnt);
@@ -255,9 +254,11 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			u16 *out_mad_pkey_index)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_core_dev *mdev = dev->mdev;
 	const struct ib_mad *in_mad = (const struct ib_mad *)in;
 	struct ib_mad *out_mad = (struct ib_mad *)out;
+	struct mlx5_core_dev *mdev;
+	u8 mdev_port_num;
+	int ret;
 
 	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
 			 *out_mad_size != sizeof(*out_mad)))
@@ -265,14 +266,20 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 
 	memset(out_mad->data, 0, sizeof(out_mad->data));
 
+	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
+	if (!mdev)
+		return IB_MAD_RESULT_FAILURE;
+
 	if (MLX5_CAP_GEN(mdev, vport_counters) &&
 	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
 	    in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) {
-		return process_pma_cmd(ibdev, port_num, in_mad, out_mad);
+		ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad);
 	} else {
-		return process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
+		ret =  process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
 				   in_mad, out_mad);
 	}
+	mlx5_ib_put_native_port_mdev(dev, port_num);
+	return ret;
 }
 
 int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port)
@@ -519,7 +526,7 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port,
 	int ext_active_speed;
 	int err = -ENOMEM;
 
-	if (port < 1 || port > MLX5_CAP_GEN(mdev, num_ports)) {
+	if (port < 1 || port > dev->num_ports) {
 		mlx5_ib_warn(dev, "invalid port number %d\n", port);
 		return -EINVAL;
 	}

+ 1115 - 296
drivers/infiniband/hw/mlx5/main.c

@@ -50,16 +50,14 @@
 #include <rdma/ib_cache.h>
 #include <linux/mlx5/port.h>
 #include <linux/mlx5/vport.h>
+#include <linux/mlx5/fs.h>
 #include <linux/list.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
 #include <linux/in.h>
 #include <linux/etherdevice.h>
-#include <linux/mlx5/fs.h>
-#include <linux/mlx5/vport.h>
 #include "mlx5_ib.h"
 #include "cmd.h"
-#include <linux/mlx5/vport.h>
 
 #define DRIVER_NAME "mlx5_ib"
 #define DRIVER_VERSION "5.0-0"
@@ -72,10 +70,36 @@ static char mlx5_version[] =
 	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
 	DRIVER_VERSION "\n";
 
+struct mlx5_ib_event_work {
+	struct work_struct	work;
+	struct mlx5_core_dev	*dev;
+	void			*context;
+	enum mlx5_dev_event	event;
+	unsigned long		param;
+};
+
 enum {
 	MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
 };
 
+static struct workqueue_struct *mlx5_ib_event_wq;
+static LIST_HEAD(mlx5_ib_unaffiliated_port_list);
+static LIST_HEAD(mlx5_ib_dev_list);
+/*
+ * This mutex should be held when accessing either of the above lists
+ */
+static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
+
+struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
+{
+	struct mlx5_ib_dev *dev;
+
+	mutex_lock(&mlx5_ib_multiport_mutex);
+	dev = mpi->ibdev;
+	mutex_unlock(&mlx5_ib_multiport_mutex);
+	return dev;
+}
+
 static enum rdma_link_layer
 mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
 {
@@ -115,24 +139,32 @@ static int get_port_state(struct ib_device *ibdev,
 static int mlx5_netdev_event(struct notifier_block *this,
 			     unsigned long event, void *ptr)
 {
+	struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
 	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
-	struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
-						 roce.nb);
+	u8 port_num = roce->native_port_num;
+	struct mlx5_core_dev *mdev;
+	struct mlx5_ib_dev *ibdev;
+
+	ibdev = roce->dev;
+	mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
+	if (!mdev)
+		return NOTIFY_DONE;
 
 	switch (event) {
 	case NETDEV_REGISTER:
 	case NETDEV_UNREGISTER:
-		write_lock(&ibdev->roce.netdev_lock);
-		if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
-			ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ?
-					     NULL : ndev;
-		write_unlock(&ibdev->roce.netdev_lock);
+		write_lock(&roce->netdev_lock);
+
+		if (ndev->dev.parent == &mdev->pdev->dev)
+			roce->netdev = (event == NETDEV_UNREGISTER) ?
+					NULL : ndev;
+		write_unlock(&roce->netdev_lock);
 		break;
 
 	case NETDEV_CHANGE:
 	case NETDEV_UP:
 	case NETDEV_DOWN: {
-		struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
+		struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev);
 		struct net_device *upper = NULL;
 
 		if (lag_ndev) {
@@ -140,27 +172,28 @@ static int mlx5_netdev_event(struct notifier_block *this,
 			dev_put(lag_ndev);
 		}
 
-		if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
+		if ((upper == ndev || (!upper && ndev == roce->netdev))
 		    && ibdev->ib_active) {
 			struct ib_event ibev = { };
 			enum ib_port_state port_state;
 
-			if (get_port_state(&ibdev->ib_dev, 1, &port_state))
-				return NOTIFY_DONE;
+			if (get_port_state(&ibdev->ib_dev, port_num,
+					   &port_state))
+				goto done;
 
-			if (ibdev->roce.last_port_state == port_state)
-				return NOTIFY_DONE;
+			if (roce->last_port_state == port_state)
+				goto done;
 
-			ibdev->roce.last_port_state = port_state;
+			roce->last_port_state = port_state;
 			ibev.device = &ibdev->ib_dev;
 			if (port_state == IB_PORT_DOWN)
 				ibev.event = IB_EVENT_PORT_ERR;
 			else if (port_state == IB_PORT_ACTIVE)
 				ibev.event = IB_EVENT_PORT_ACTIVE;
 			else
-				return NOTIFY_DONE;
+				goto done;
 
-			ibev.element.port_num = 1;
+			ibev.element.port_num = port_num;
 			ib_dispatch_event(&ibev);
 		}
 		break;
@@ -169,7 +202,8 @@ static int mlx5_netdev_event(struct notifier_block *this,
 	default:
 		break;
 	}
-
+done:
+	mlx5_ib_put_native_port_mdev(ibdev, port_num);
 	return NOTIFY_DONE;
 }
 
@@ -178,22 +212,88 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
 {
 	struct mlx5_ib_dev *ibdev = to_mdev(device);
 	struct net_device *ndev;
+	struct mlx5_core_dev *mdev;
+
+	mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
+	if (!mdev)
+		return NULL;
 
-	ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
+	ndev = mlx5_lag_get_roce_netdev(mdev);
 	if (ndev)
-		return ndev;
+		goto out;
 
 	/* Ensure ndev does not disappear before we invoke dev_hold()
 	 */
-	read_lock(&ibdev->roce.netdev_lock);
-	ndev = ibdev->roce.netdev;
+	read_lock(&ibdev->roce[port_num - 1].netdev_lock);
+	ndev = ibdev->roce[port_num - 1].netdev;
 	if (ndev)
 		dev_hold(ndev);
-	read_unlock(&ibdev->roce.netdev_lock);
+	read_unlock(&ibdev->roce[port_num - 1].netdev_lock);
 
+out:
+	mlx5_ib_put_native_port_mdev(ibdev, port_num);
 	return ndev;
 }
 
+struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
+						   u8 ib_port_num,
+						   u8 *native_port_num)
+{
+	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
+							  ib_port_num);
+	struct mlx5_core_dev *mdev = NULL;
+	struct mlx5_ib_multiport_info *mpi;
+	struct mlx5_ib_port *port;
+
+	if (native_port_num)
+		*native_port_num = 1;
+
+	if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
+		return ibdev->mdev;
+
+	port = &ibdev->port[ib_port_num - 1];
+	if (!port)
+		return NULL;
+
+	spin_lock(&port->mp.mpi_lock);
+	mpi = ibdev->port[ib_port_num - 1].mp.mpi;
+	if (mpi && !mpi->unaffiliate) {
+		mdev = mpi->mdev;
+		/* If it's the master no need to refcount, it'll exist
+		 * as long as the ib_dev exists.
+		 */
+		if (!mpi->is_master)
+			mpi->mdev_refcnt++;
+	}
+	spin_unlock(&port->mp.mpi_lock);
+
+	return mdev;
+}
+
+void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num)
+{
+	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
+							  port_num);
+	struct mlx5_ib_multiport_info *mpi;
+	struct mlx5_ib_port *port;
+
+	if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
+		return;
+
+	port = &ibdev->port[port_num - 1];
+
+	spin_lock(&port->mp.mpi_lock);
+	mpi = ibdev->port[port_num - 1].mp.mpi;
+	if (mpi->is_master)
+		goto out;
+
+	mpi->mdev_refcnt--;
+	if (mpi->unaffiliate)
+		complete(&mpi->unref_comp);
+out:
+	spin_unlock(&port->mp.mpi_lock);
+}
+
 static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
 				    u8 *active_width)
 {
@@ -256,19 +356,33 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 				struct ib_port_attr *props)
 {
 	struct mlx5_ib_dev *dev = to_mdev(device);
-	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_core_dev *mdev;
 	struct net_device *ndev, *upper;
 	enum ib_mtu ndev_ib_mtu;
+	bool put_mdev = true;
 	u16 qkey_viol_cntr;
 	u32 eth_prot_oper;
+	u8 mdev_port_num;
 	int err;
 
+	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
+	if (!mdev) {
+		/* This means the port isn't affiliated yet. Get the
+		 * info for the master port instead.
+		 */
+		put_mdev = false;
+		mdev = dev->mdev;
+		mdev_port_num = 1;
+		port_num = 1;
+	}
+
 	/* Possible bad flows are checked before filling out props so in case
 	 * of an error it will still be zeroed out.
 	 */
-	err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper, port_num);
+	err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper,
+					     mdev_port_num);
 	if (err)
-		return err;
+		goto out;
 
 	translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
 				 &props->active_width);
@@ -284,12 +398,16 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 	props->state            = IB_PORT_DOWN;
 	props->phys_state       = 3;
 
-	mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
+	mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
 	props->qkey_viol_cntr = qkey_viol_cntr;
 
+	/* If this is a stub query for an unaffiliated port stop here */
+	if (!put_mdev)
+		goto out;
+
 	ndev = mlx5_ib_get_netdev(device, port_num);
 	if (!ndev)
-		return 0;
+		goto out;
 
 	if (mlx5_lag_is_active(dev->mdev)) {
 		rcu_read_lock();
@@ -312,7 +430,10 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 	dev_put(ndev);
 
 	props->active_mtu	= min(props->max_mtu, ndev_ib_mtu);
-	return 0;
+out:
+	if (put_mdev)
+		mlx5_ib_put_native_port_mdev(dev, port_num);
+	return err;
 }
 
 static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
@@ -354,7 +475,7 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
 
 	return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
 				      roce_l3_type, gid->raw, mac, vlan,
-				      vlan_id);
+				      vlan_id, port_num);
 }
 
 static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
@@ -438,11 +559,11 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev)
 }
 
 static void get_atomic_caps(struct mlx5_ib_dev *dev,
+			    u8 atomic_size_qp,
 			    struct ib_device_attr *props)
 {
 	u8 tmp;
 	u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
-	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
 	u8 atomic_req_8B_endianness_mode =
 		MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode);
 
@@ -459,6 +580,29 @@ static void get_atomic_caps(struct mlx5_ib_dev *dev,
 	}
 }
 
+static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
+			       struct ib_device_attr *props)
+{
+	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
+
+	get_atomic_caps(dev, atomic_size_qp, props);
+}
+
+static void get_atomic_caps_dc(struct mlx5_ib_dev *dev,
+			       struct ib_device_attr *props)
+{
+	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
+
+	get_atomic_caps(dev, atomic_size_qp, props);
+}
+
+bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev)
+{
+	struct ib_device_attr props = {};
+
+	get_atomic_caps_dc(dev, &props);
+	return (props.atomic_cap == IB_ATOMIC_HCA) ? true : false;
+}
 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
 					__be64 *sys_image_guid)
 {
@@ -587,6 +731,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	int max_rq_sg;
 	int max_sq_sg;
 	u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
+	bool raw_support = !mlx5_core_mp_enabled(mdev);
 	struct mlx5_ib_query_device_resp resp = {};
 	size_t resp_len;
 	u64 max_tso;
@@ -650,7 +795,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	if (MLX5_CAP_GEN(mdev, block_lb_mc))
 		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 
-	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
+	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) {
 		if (MLX5_CAP_ETH(mdev, csum_cap)) {
 			/* Legacy bit to support old userspace libraries */
 			props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
@@ -682,7 +827,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 						MLX5_RX_HASH_SRC_PORT_TCP |
 						MLX5_RX_HASH_DST_PORT_TCP |
 						MLX5_RX_HASH_SRC_PORT_UDP |
-						MLX5_RX_HASH_DST_PORT_UDP;
+						MLX5_RX_HASH_DST_PORT_UDP |
+						MLX5_RX_HASH_INNER;
 			resp.response_length += sizeof(resp.rss_caps);
 		}
 	} else {
@@ -698,7 +844,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	}
 
 	if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
-	    MLX5_CAP_GEN(dev->mdev, general_notification_event))
+	    MLX5_CAP_GEN(dev->mdev, general_notification_event) &&
+	    raw_support)
 		props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
 
 	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
@@ -706,7 +853,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
 
 	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
-	    MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
+	    MLX5_CAP_ETH(dev->mdev, scatter_fcs) &&
+	    raw_support) {
 		/* Legacy bit to support old userspace libraries */
 		props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
 		props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
@@ -746,7 +894,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	props->max_srq_sge	   = max_rq_sg - 1;
 	props->max_fast_reg_page_list_len =
 		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
-	get_atomic_caps(dev, props);
+	get_atomic_caps_qp(dev, props);
 	props->masked_atomic_cap   = IB_ATOMIC_NONE;
 	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
 	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
@@ -770,7 +918,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
 
 	if (mlx5_ib_port_link_layer(ibdev, 1) ==
-	    IB_LINK_LAYER_ETHERNET) {
+	    IB_LINK_LAYER_ETHERNET && raw_support) {
 		props->rss_caps.max_rwq_indirection_tables =
 			1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
 		props->rss_caps.max_rwq_indirection_table_size =
@@ -807,7 +955,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		resp.response_length += sizeof(resp.cqe_comp_caps);
 	}
 
-	if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen)) {
+	if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) &&
+	    raw_support) {
 		if (MLX5_CAP_QOS(mdev, packet_pacing) &&
 		    MLX5_CAP_GEN(mdev, qos)) {
 			resp.packet_pacing_caps.qp_rate_limit_max =
@@ -866,7 +1015,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		}
 	}
 
-	if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen)) {
+	if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) &&
+	    raw_support) {
 		resp.response_length += sizeof(resp.striding_rq_caps);
 		if (MLX5_CAP_GEN(mdev, striding_rq)) {
 			resp.striding_rq_caps.min_single_stride_log_num_of_bytes =
@@ -1097,7 +1247,22 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
 	}
 
 	if (!ret && props) {
-		count = mlx5_core_reserved_gids_count(to_mdev(ibdev)->mdev);
+		struct mlx5_ib_dev *dev = to_mdev(ibdev);
+		struct mlx5_core_dev *mdev;
+		bool put_mdev = true;
+
+		mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
+		if (!mdev) {
+			/* If the port isn't affiliated yet, query the master.
+			 * The master and slave will have the same values.
+			 */
+			mdev = dev->mdev;
+			port = 1;
+			put_mdev = false;
+		}
+		count = mlx5_core_reserved_gids_count(mdev);
+		if (put_mdev)
+			mlx5_ib_put_native_port_mdev(dev, port);
 		props->gid_tbl_len -= count;
 	}
 	return ret;
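The get/put pair above is the calling convention this patch establishes for dual-port RoCE: the per-port mlx5_core_dev is borrowed through mlx5_ib_get_native_port_mdev(), which takes a reference, and the master device serves as a fallback while the port is still unaffiliated. A minimal sketch of a caller in that style (query_some_cap and its body are hypothetical; only the get/put discipline is taken from the diff):

	static int query_some_cap(struct mlx5_ib_dev *dev, u8 port, int *val)
	{
		struct mlx5_core_dev *mdev;
		bool put_mdev = true;

		mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
		if (!mdev) {
			/* Not affiliated yet; master and slave hold the same values. */
			mdev = dev->mdev;
			port = 1;
			put_mdev = false;
		}

		*val = MLX5_CAP_GEN(mdev, num_ports);	/* any per-mdev read */

		if (put_mdev)
			mlx5_ib_put_native_port_mdev(dev, port);
		return 0;
	}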
@@ -1122,20 +1287,43 @@ static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 
 }
 
-static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-			      u16 *pkey)
+static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port,
+				   u16 index, u16 *pkey)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_core_dev *mdev;
+	bool put_mdev = true;
+	u8 mdev_port_num;
+	int err;
+
+	mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
+	if (!mdev) {
+		/* The port isn't affiliated yet, get the PKey from the master
+		 * port. For RoCE the PKey tables will be the same.
+		 */
+		put_mdev = false;
+		mdev = dev->mdev;
+		mdev_port_num = 1;
+	}
+
+	err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0,
+					index, pkey);
+	if (put_mdev)
+		mlx5_ib_put_native_port_mdev(dev, port);
+
+	return err;
+}
 
+static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			      u16 *pkey)
+{
 	switch (mlx5_get_vport_access_method(ibdev)) {
 	case MLX5_VPORT_ACCESS_METHOD_MAD:
 		return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
 
 	case MLX5_VPORT_ACCESS_METHOD_HCA:
 	case MLX5_VPORT_ACCESS_METHOD_NIC:
-		return mlx5_query_hca_vport_pkey(mdev, 0, port,  0, index,
-						 pkey);
+		return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey);
 	default:
 		return -EINVAL;
 	}
@@ -1174,23 +1362,32 @@ static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
 				u32 value)
 {
 	struct mlx5_hca_vport_context ctx = {};
+	struct mlx5_core_dev *mdev;
+	u8 mdev_port_num;
 	int err;
 
-	err = mlx5_query_hca_vport_context(dev->mdev, 0,
-					   port_num, 0, &ctx);
+	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
+	if (!mdev)
+		return -ENODEV;
+
+	err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx);
 	if (err)
-		return err;
+		goto out;
 
 	if (~ctx.cap_mask1_perm & mask) {
 		mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
 			     mask, ctx.cap_mask1_perm);
-		return -EINVAL;
+		err = -EINVAL;
+		goto out;
 	}
 
 	ctx.cap_mask1 = value;
 	ctx.cap_mask1_perm = mask;
-	err = mlx5_core_modify_hca_vport_context(dev->mdev, 0,
-						 port_num, 0, &ctx);
+	err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num,
+						 0, &ctx);
+
+out:
+	mlx5_ib_put_native_port_mdev(dev, port_num);
 
 	return err;
 }
@@ -1241,9 +1438,18 @@ static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
 		    caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
 }
 
+static u16 calc_dynamic_bfregs(int uars_per_sys_page)
+{
+	/* Large page with non 4k uar support might limit the dynamic size */
+	if (uars_per_sys_page == 1  && PAGE_SIZE > 4096)
+		return MLX5_MIN_DYN_BFREGS;
+
+	return MLX5_MAX_DYN_BFREGS;
+}
+
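calc_dynamic_bfregs() above caps the dynamic pool at MLX5_MIN_DYN_BFREGS when only one UAR fits in a system page larger than 4K, and offers MLX5_MAX_DYN_BFREGS otherwise. The bookkeeping calc_total_bfregs() performs on top of it can be followed with made-up numbers (illustrative values, not hardware ones):

	/* Assume uars_per_sys_page = 4 and MLX5_NON_FP_BFREGS_PER_UAR = 2. */
	bfregs_per_sys_page = 4 * 2;				/*  8 bfregs per sys page */
	req->total_num_bfregs = ALIGN(10, 8);			/* 16 static bfregs       */
	bfregi->num_static_sys_pages = 16 / 8;			/*  2 static sys pages    */
	bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(4), 8);
	bfregi->total_num_bfregs = 16 + bfregi->num_dyn_bfregs;
	bfregi->num_sys_pages = bfregi->total_num_bfregs / 8;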
 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
 			     struct mlx5_ib_alloc_ucontext_req_v2 *req,
-			     u32 *num_sys_pages)
+			     struct mlx5_bfreg_info *bfregi)
 {
 	int uars_per_sys_page;
 	int bfregs_per_sys_page;
@@ -1260,16 +1466,21 @@ static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
 
 	uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
 	bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
+	/* This holds the required static allocation asked by the user */
 	req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
-	*num_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
-
 	if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
 		return -EINVAL;
 
-	mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, using %d sys pages\n",
+	bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
+	bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
+	bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
+	bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
+
+	mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
 		    MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
 		    lib_uar_4k ? "yes" : "no", ref_bfregs,
-		    req->total_num_bfregs, *num_sys_pages);
+		    req->total_num_bfregs, bfregi->total_num_bfregs,
+		    bfregi->num_sys_pages);
 
 	return 0;
 }
@@ -1281,13 +1492,17 @@ static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *conte
 	int i;
 
 	bfregi = &context->bfregi;
-	for (i = 0; i < bfregi->num_sys_pages; i++) {
+	for (i = 0; i < bfregi->num_static_sys_pages; i++) {
 		err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
 		if (err)
 			goto error;
 
 		mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
 	}
+
+	for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
+		bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
+
 	return 0;
 
 error:
@@ -1306,12 +1521,16 @@ static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *con
 
 	bfregi = &context->bfregi;
 	for (i = 0; i < bfregi->num_sys_pages; i++) {
-		err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
-		if (err) {
-			mlx5_ib_warn(dev, "failed to free uar %d\n", i);
-			return err;
+		if (i < bfregi->num_static_sys_pages ||
+		    bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) {
+			err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
+			if (err) {
+				mlx5_ib_warn(dev, "failed to free uar %d, err=%d\n", i, err);
+				return err;
+			}
 		}
 	}
+
 	return 0;
 }
 
@@ -1362,6 +1581,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
 	struct mlx5_ib_alloc_ucontext_resp resp = {};
+	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_ucontext *context;
 	struct mlx5_bfreg_info *bfregi;
 	int ver;
@@ -1422,13 +1642,13 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	bfregi = &context->bfregi;
 
 	/* updates req->total_num_bfregs */
-	err = calc_total_bfregs(dev, lib_uar_4k, &req, &bfregi->num_sys_pages);
+	err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
 	if (err)
 		goto out_ctx;
 
 	mutex_init(&bfregi->lock);
 	bfregi->lib_uar_4k = lib_uar_4k;
-	bfregi->count = kcalloc(req.total_num_bfregs, sizeof(*bfregi->count),
+	bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
 				GFP_KERNEL);
 	if (!bfregi->count) {
 		err = -ENOMEM;
@@ -1470,7 +1690,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	mutex_init(&context->db_page_mutex);
 
 	resp.tot_bfregs = req.total_num_bfregs;
-	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
+	resp.num_ports = dev->num_ports;
 
 	if (field_avail(typeof(resp), cqe_version, udata->outlen))
 		resp.response_length += sizeof(resp.cqe_version);
@@ -1489,6 +1709,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 		resp.response_length += sizeof(resp.eth_min_inline);
 	}
 
+	if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) {
+		if (mdev->clock_info)
+			resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
+		resp.response_length += sizeof(resp.clock_info_versions);
+	}
+
 	/*
 	 * We don't want to expose information from the PCI bar that is located
 	 * after 4096 bytes, so if the arch only supports larger pages, let's
@@ -1502,8 +1728,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 			resp.hca_core_clock_offset =
 				offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
 		}
-		resp.response_length += sizeof(resp.hca_core_clock_offset) +
-					sizeof(resp.reserved2);
+		resp.response_length += sizeof(resp.hca_core_clock_offset);
 	}
 
 	if (field_avail(typeof(resp), log_uar_size, udata->outlen))
@@ -1512,6 +1737,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
 		resp.response_length += sizeof(resp.num_uars_per_page);
 
+	if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) {
+		resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
+		resp.response_length += sizeof(resp.num_dyn_bfregs);
+	}
+
 	err = ib_copy_to_udata(udata, &resp, resp.response_length);
 	if (err)
 		goto out_td;
@@ -1566,15 +1796,13 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 }
 
 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
-				 struct mlx5_bfreg_info *bfregi,
-				 int idx)
+				 int uar_idx)
 {
 	int fw_uars_per_page;
 
 	fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
 
-	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) +
-			bfregi->sys_pages[idx] / fw_uars_per_page;
+	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
 }
 
 static int get_command(unsigned long offset)
@@ -1592,6 +1820,12 @@ static int get_index(unsigned long offset)
 	return get_arg(offset);
 }
 
+/* Index resides in an extra byte to enable larger values than 255 */
+static int get_extended_index(unsigned long offset)
+{
+	return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
+}
+
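get_extended_index() widens the index an mmap offset can carry: the low byte still comes from get_arg(), and bits 16-23 of the offset contribute a second byte, for 16 bits in total. The matching encode step, done by userspace when it builds vm_pgoff, would look roughly as follows, assuming the 8-bit command field at bits 8-15 that get_command() in this file decodes (pack_mmap_offset is a hypothetical helper):

	static unsigned long pack_mmap_offset(int cmd, u16 idx)
	{
		return ((unsigned long)((idx >> 8) & 0xff) << 16) |	/* extra index byte */
		       ((unsigned long)cmd << 8) |			/* command field    */
		       (idx & 0xff);					/* low index byte   */
	}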
 static void  mlx5_ib_vma_open(struct vm_area_struct *area)
 {
 	/* vma_open is called when a new VMA is created on top of our VMA.  This
@@ -1733,6 +1967,38 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
 	}
 }
 
+static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
+					struct vm_area_struct *vma,
+					struct mlx5_ib_ucontext *context)
+{
+	phys_addr_t pfn;
+	int err;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
+		return -EOPNOTSUPP;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	if (!dev->mdev->clock_info_page)
+		return -EOPNOTSUPP;
+
+	pfn = page_to_pfn(dev->mdev->clock_info_page);
+	err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE,
+			      vma->vm_page_prot);
+	if (err)
+		return err;
+
+	mlx5_ib_dbg(dev, "mapped clock info at 0x%lx, PA 0x%llx\n",
+		    vma->vm_start,
+		    (unsigned long long)pfn << PAGE_SHIFT);
+
+	return mlx5_ib_set_vma_data(vma, context);
+}
+
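The handler above maps one read-only page: writable mappings are refused with -EPERM, the index must name a supported clock-info layout (MLX5_IB_CLOCK_INFO_V1), and the PFN comes from the core driver's clock_info_page, letting userspace convert raw timestamps to wallclock time without a syscall. A hedged userspace sketch, reusing the hypothetical pack_mmap_offset() from earlier (the authoritative offset encoding and page layout live in the mlx5 uAPI headers, not here):

	long psz = sysconf(_SC_PAGESIZE);
	off_t off = pack_mmap_offset(MLX5_IB_MMAP_CLOCK_INFO, MLX5_IB_CLOCK_INFO_V1);
	void *p = mmap(NULL, psz, PROT_READ, MAP_SHARED, cmd_fd, off * psz);
	if (p == MAP_FAILED)
		;	/* older kernel, or clock info not supported */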
 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 		    struct vm_area_struct *vma,
 		    struct mlx5_ib_ucontext *context)
@@ -1742,21 +2008,29 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 	unsigned long idx;
 	phys_addr_t pfn, pa;
 	pgprot_t prot;
-	int uars_per_page;
+	u32 bfreg_dyn_idx = 0;
+	u32 uar_index;
+	int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
+	int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
+				bfregi->num_static_sys_pages;
 
 	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
 		return -EINVAL;
 
-	uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
-	idx = get_index(vma->vm_pgoff);
-	if (idx % uars_per_page ||
-	    idx * uars_per_page >= bfregi->num_sys_pages) {
-		mlx5_ib_warn(dev, "invalid uar index %lu\n", idx);
+	if (dyn_uar)
+		idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
+	else
+		idx = get_index(vma->vm_pgoff);
+
+	if (idx >= max_valid_idx) {
+		mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
+			     idx, max_valid_idx);
 		return -EINVAL;
 	}
 
 	switch (cmd) {
 	case MLX5_IB_MMAP_WC_PAGE:
+	case MLX5_IB_MMAP_ALLOC_WC:
 /* Some architectures don't support WC memory */
 #if defined(CONFIG_X86)
 		if (!pat_enabled())
@@ -1776,7 +2050,40 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 		return -EINVAL;
 	}
 
-	pfn = uar_index2pfn(dev, bfregi, idx);
+	if (dyn_uar) {
+		int uars_per_page;
+
+		uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
+		bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
+		if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
+			mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
+				     bfreg_dyn_idx, bfregi->total_num_bfregs);
+			return -EINVAL;
+		}
+
+		mutex_lock(&bfregi->lock);
+		/* Fail if uar already allocated, first bfreg index of each
+		 * page holds its count.
+		 */
+		if (bfregi->count[bfreg_dyn_idx]) {
+			mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
+			mutex_unlock(&bfregi->lock);
+			return -EINVAL;
+		}
+
+		bfregi->count[bfreg_dyn_idx]++;
+		mutex_unlock(&bfregi->lock);
+
+		err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
+		if (err) {
+			mlx5_ib_warn(dev, "UAR alloc failed\n");
+			goto free_bfreg;
+		}
+	} else {
+		uar_index = bfregi->sys_pages[idx];
+	}
+
+	pfn = uar_index2pfn(dev, uar_index);
 	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
 
 	vma->vm_page_prot = prot;
@@ -1785,14 +2092,32 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 	if (err) {
 		mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n",
 			    err, vma->vm_start, &pfn, mmap_cmd2str(cmd));
-		return -EAGAIN;
+		err = -EAGAIN;
+		goto err;
 	}
 
 	pa = pfn << PAGE_SHIFT;
 	mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
 		    vma->vm_start, &pa);
 
-	return mlx5_ib_set_vma_data(vma, context);
+	err = mlx5_ib_set_vma_data(vma, context);
+	if (err)
+		goto err;
+
+	if (dyn_uar)
+		bfregi->sys_pages[idx] = uar_index;
+	return 0;
+
+err:
+	if (!dyn_uar)
+		return err;
+
+	mlx5_cmd_free_uar(dev->mdev, idx);
+
+free_bfreg:
+	mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
+
+	return err;
 }
 
 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
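The dynamic-UAR branch of uar_mmap() above follows a reserve, allocate, publish order: the bfreg slot is claimed under bfregi->lock (the first bfreg index of each page holds its count), the UAR is requested from firmware outside the lock, and sys_pages[idx] is written only once everything has succeeded, with the error path unwinding in reverse. A self-contained sketch of that pattern, with hypothetical names rather than the driver's:

	struct slot_table {
		struct mutex lock;
		u32 *count;	/* per-slot claim count */
		u32 *pages;	/* published UAR index per page */
	};

	static int claim_slot(struct slot_table *t, u32 slot, u32 page,
			      int (*fw_alloc)(u32 *out))
	{
		u32 uar;
		int err;

		mutex_lock(&t->lock);
		if (t->count[slot]) {		/* already claimed by another mmap */
			mutex_unlock(&t->lock);
			return -EINVAL;
		}
		t->count[slot]++;		/* reserve before the FW call */
		mutex_unlock(&t->lock);

		err = fw_alloc(&uar);		/* may sleep, so done outside the lock */
		if (err) {
			mutex_lock(&t->lock);
			t->count[slot]--;	/* undo the reservation */
			mutex_unlock(&t->lock);
			return err;
		}
		t->pages[page] = uar;		/* publish only on success */
		return 0;
	}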
@@ -1807,6 +2132,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
 	case MLX5_IB_MMAP_WC_PAGE:
 	case MLX5_IB_MMAP_NC_PAGE:
 	case MLX5_IB_MMAP_REGULAR_PAGE:
+	case MLX5_IB_MMAP_ALLOC_WC:
 		return uar_mmap(dev, command, vma, context);
 
 	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
@@ -1835,6 +2161,8 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
 			    vma->vm_start,
 			    (unsigned long long)pfn << PAGE_SHIFT);
 		break;
+	case MLX5_IB_MMAP_CLOCK_INFO:
+		return mlx5_ib_mmap_clock_info_page(dev, vma, context);
 
 	default:
 		return -EINVAL;
@@ -2663,7 +2991,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
 		return ERR_PTR(-ENOMEM);
 
 	if (domain != IB_FLOW_DOMAIN_USER ||
-	    flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
+	    flow_attr->port > dev->num_ports ||
 	    (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP))
 		return ERR_PTR(-EINVAL);
 
@@ -2928,15 +3256,24 @@ static void delay_drop_handler(struct work_struct *work)
 	mutex_unlock(&delay_drop->lock);
 }
 
-static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
-			  enum mlx5_dev_event event, unsigned long param)
+static void mlx5_ib_handle_event(struct work_struct *_work)
 {
-	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
+	struct mlx5_ib_event_work *work =
+		container_of(_work, struct mlx5_ib_event_work, work);
+	struct mlx5_ib_dev *ibdev;
 	struct ib_event ibev;
 	bool fatal = false;
 	u8 port = 0;
 
-	switch (event) {
+	if (mlx5_core_is_mp_slave(work->dev)) {
+		ibdev = mlx5_ib_get_ibdev_from_mpi(work->context);
+		if (!ibdev)
+			goto out;
+	} else {
+		ibdev = work->context;
+	}
+
+	switch (work->event) {
 	case MLX5_DEV_EVENT_SYS_ERROR:
 		ibev.event = IB_EVENT_DEVICE_FATAL;
 		mlx5_ib_handle_internal_error(ibdev);
@@ -2946,39 +3283,39 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 	case MLX5_DEV_EVENT_PORT_UP:
 	case MLX5_DEV_EVENT_PORT_DOWN:
 	case MLX5_DEV_EVENT_PORT_INITIALIZED:
-		port = (u8)param;
+		port = (u8)work->param;
 
 		/* In RoCE, port up/down events are handled in
 		 * mlx5_netdev_event().
 		 */
 		if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
 			IB_LINK_LAYER_ETHERNET)
-			return;
+			goto out;
 
-		ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ?
+		ibev.event = (work->event == MLX5_DEV_EVENT_PORT_UP) ?
 			     IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
 		break;
 
 	case MLX5_DEV_EVENT_LID_CHANGE:
 		ibev.event = IB_EVENT_LID_CHANGE;
-		port = (u8)param;
+		port = (u8)work->param;
 		break;
 
 	case MLX5_DEV_EVENT_PKEY_CHANGE:
 		ibev.event = IB_EVENT_PKEY_CHANGE;
-		port = (u8)param;
+		port = (u8)work->param;
 
 		schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
 		break;
 
 	case MLX5_DEV_EVENT_GUID_CHANGE:
 		ibev.event = IB_EVENT_GID_CHANGE;
-		port = (u8)param;
+		port = (u8)work->param;
 		break;
 
 	case MLX5_DEV_EVENT_CLIENT_REREG:
 		ibev.event = IB_EVENT_CLIENT_REREGISTER;
-		port = (u8)param;
+		port = (u8)work->param;
 		break;
 	case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT:
 		schedule_work(&ibdev->delay_drop.delay_drop_work);
@@ -3000,9 +3337,26 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 
 	if (fatal)
 		ibdev->ib_active = false;
-
 out:
-	return;
+	kfree(work);
+}
+
+static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
+			  enum mlx5_dev_event event, unsigned long param)
+{
+	struct mlx5_ib_event_work *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work)
+		return;
+
+	INIT_WORK(&work->work, mlx5_ib_handle_event);
+	work->dev = dev;
+	work->param = param;
+	work->context = context;
+	work->event = event;
+
+	queue_work(mlx5_ib_event_wq, &work->work);
 }
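The event path is split in two because mlx5_core delivers events from atomic context: mlx5_ib_event() above only snapshots the event into a GFP_ATOMIC work item and queues it on mlx5_ib_event_wq, while mlx5_ib_handle_event() runs in process context, where it may sleep and, for a multiport slave, first resolve the owning ibdev. The work item carries exactly the fields the handler reads; its likely definition, reconstructed here for reference (the real one lives elsewhere in this patch):

	struct mlx5_ib_event_work {
		struct work_struct	work;
		struct mlx5_core_dev	*dev;
		void			*context;
		enum mlx5_dev_event	event;
		unsigned long		param;
	};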
 
 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
@@ -3011,7 +3365,7 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev)
 	int err;
 	int port;
 
-	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
+	for (port = 1; port <= dev->num_ports; port++) {
 		dev->mdev->port_caps[port - 1].has_smi = false;
 		if (MLX5_CAP_GEN(dev->mdev, port_type) ==
 		    MLX5_CAP_PORT_TYPE_IB) {
@@ -3038,16 +3392,15 @@ static void get_ext_port_caps(struct mlx5_ib_dev *dev)
 {
 	int port;
 
-	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
+	for (port = 1; port <= dev->num_ports; port++)
 		mlx5_query_ext_port_caps(dev, port);
 }
 
-static int get_port_caps(struct mlx5_ib_dev *dev)
+static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
 {
 	struct ib_device_attr *dprops = NULL;
 	struct ib_port_attr *pprops = NULL;
 	int err = -ENOMEM;
-	int port;
 	struct ib_udata uhw = {.inlen = 0, .outlen = 0};
 
 	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
@@ -3068,22 +3421,21 @@ static int get_port_caps(struct mlx5_ib_dev *dev)
 		goto out;
 	}
 
-	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
-		memset(pprops, 0, sizeof(*pprops));
-		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
-		if (err) {
-			mlx5_ib_warn(dev, "query_port %d failed %d\n",
-				     port, err);
-			break;
-		}
-		dev->mdev->port_caps[port - 1].pkey_table_len =
-						dprops->max_pkeys;
-		dev->mdev->port_caps[port - 1].gid_table_len =
-						pprops->gid_tbl_len;
-		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
-			    dprops->max_pkeys, pprops->gid_tbl_len);
+	memset(pprops, 0, sizeof(*pprops));
+	err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
+	if (err) {
+		mlx5_ib_warn(dev, "query_port %d failed %d\n",
+			     port, err);
+		goto out;
 	}
 
+	dev->mdev->port_caps[port - 1].pkey_table_len =
+					dprops->max_pkeys;
+	dev->mdev->port_caps[port - 1].gid_table_len =
+					pprops->gid_tbl_len;
+	mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n",
+		    port, dprops->max_pkeys, pprops->gid_tbl_len);
+
 out:
 	kfree(pprops);
 	kfree(dprops);
@@ -3373,12 +3725,14 @@ static u32 get_core_cap_flags(struct ib_device *ibdev)
 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
 	u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
 	u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
+	bool raw_support = !mlx5_core_mp_enabled(dev->mdev);
 	u32 ret = 0;
 
 	if (ll == IB_LINK_LAYER_INFINIBAND)
 		return RDMA_CORE_PORT_IBA_IB;
 
-	ret = RDMA_CORE_PORT_RAW_PACKET;
+	if (raw_support)
+		ret = RDMA_CORE_PORT_RAW_PACKET;
 
 	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
 		return ret;
@@ -3468,33 +3822,33 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
 	}
 }
 
-static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev)
+static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 {
 	int err;
 
-	dev->roce.nb.notifier_call = mlx5_netdev_event;
-	err = register_netdevice_notifier(&dev->roce.nb);
+	dev->roce[port_num].nb.notifier_call = mlx5_netdev_event;
+	err = register_netdevice_notifier(&dev->roce[port_num].nb);
 	if (err) {
-		dev->roce.nb.notifier_call = NULL;
+		dev->roce[port_num].nb.notifier_call = NULL;
 		return err;
 	}
 
 	return 0;
 }
 
-static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev)
+static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 {
-	if (dev->roce.nb.notifier_call) {
-		unregister_netdevice_notifier(&dev->roce.nb);
-		dev->roce.nb.notifier_call = NULL;
+	if (dev->roce[port_num].nb.notifier_call) {
+		unregister_netdevice_notifier(&dev->roce[port_num].nb);
+		dev->roce[port_num].nb.notifier_call = NULL;
 	}
 }
 
-static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
+static int mlx5_enable_eth(struct mlx5_ib_dev *dev, u8 port_num)
 {
 	int err;
 
-	err = mlx5_add_netdev_notifier(dev);
+	err = mlx5_add_netdev_notifier(dev, port_num);
 	if (err)
 		return err;
 
@@ -3515,7 +3869,7 @@ err_disable_roce:
 		mlx5_nic_vport_disable_roce(dev->mdev);
 
 err_unregister_netdevice_notifier:
-	mlx5_remove_netdev_notifier(dev);
+	mlx5_remove_netdev_notifier(dev, port_num);
 	return err;
 }
 
@@ -3577,11 +3931,12 @@ static const struct mlx5_ib_counter extended_err_cnts[] = {
 
 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
 {
-	unsigned int i;
+	int i;
 
 	for (i = 0; i < dev->num_ports; i++) {
-		mlx5_core_dealloc_q_counter(dev->mdev,
-					    dev->port[i].cnts.set_id);
+		if (dev->port[i].cnts.set_id)
+			mlx5_core_dealloc_q_counter(dev->mdev,
+						    dev->port[i].cnts.set_id);
 		kfree(dev->port[i].cnts.names);
 		kfree(dev->port[i].cnts.offsets);
 	}
@@ -3623,6 +3978,7 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
 
 err_names:
 	kfree(cnts->names);
+	cnts->names = NULL;
 	return -ENOMEM;
 }
 
@@ -3669,37 +4025,33 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
 
 static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
 {
+	int err = 0;
 	int i;
-	int ret;
 
 	for (i = 0; i < dev->num_ports; i++) {
-		struct mlx5_ib_port *port = &dev->port[i];
+		err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
+		if (err)
+			goto err_alloc;
 
-		ret = mlx5_core_alloc_q_counter(dev->mdev,
-						&port->cnts.set_id);
-		if (ret) {
+		mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
+				      dev->port[i].cnts.offsets);
+
+		err = mlx5_core_alloc_q_counter(dev->mdev,
+						&dev->port[i].cnts.set_id);
+		if (err) {
 			mlx5_ib_warn(dev,
 				     "couldn't allocate queue counter for port %d, err %d\n",
-				     i + 1, ret);
-			goto dealloc_counters;
+				     i + 1, err);
+			goto err_alloc;
 		}
-
-		ret = __mlx5_ib_alloc_counters(dev, &port->cnts);
-		if (ret)
-			goto dealloc_counters;
-
-		mlx5_ib_fill_counters(dev, port->cnts.names,
-				      port->cnts.offsets);
+		dev->port[i].cnts.set_id_valid = true;
 	}
 
 	return 0;
 
-dealloc_counters:
-	while (--i >= 0)
-		mlx5_core_dealloc_q_counter(dev->mdev,
-					    dev->port[i].cnts.set_id);
-
-	return ret;
+err_alloc:
+	mlx5_ib_dealloc_counters(dev);
+	return err;
 }
 
 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
@@ -3718,7 +4070,7 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
 					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
 }
 
-static int mlx5_ib_query_q_counters(struct mlx5_ib_dev *dev,
+static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
 				    struct mlx5_ib_port *port,
 				    struct rdma_hw_stats *stats)
 {
@@ -3731,7 +4083,7 @@ static int mlx5_ib_query_q_counters(struct mlx5_ib_dev *dev,
 	if (!out)
 		return -ENOMEM;
 
-	ret = mlx5_core_query_q_counter(dev->mdev,
+	ret = mlx5_core_query_q_counter(mdev,
 					port->cnts.set_id, 0,
 					out, outlen);
 	if (ret)
@@ -3753,28 +4105,43 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_port *port = &dev->port[port_num - 1];
+	struct mlx5_core_dev *mdev;
 	int ret, num_counters;
+	u8 mdev_port_num;
 
 	if (!stats)
 		return -EINVAL;
 
-	ret = mlx5_ib_query_q_counters(dev, port, stats);
+	num_counters = port->cnts.num_q_counters + port->cnts.num_cong_counters;
+
+	/* q_counters are per IB device, query the master mdev */
+	ret = mlx5_ib_query_q_counters(dev->mdev, port, stats);
 	if (ret)
 		return ret;
-	num_counters = port->cnts.num_q_counters;
 
 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+		mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
+						    &mdev_port_num);
+		if (!mdev) {
+			/* If port is not affiliated yet, it's in down state
+			 * which doesn't have any counters yet, so it would be
+			 * zero. So no need to read from the HCA.
+			 */
+			goto done;
+		}
 		ret = mlx5_lag_query_cong_counters(dev->mdev,
 						   stats->value +
 						   port->cnts.num_q_counters,
 						   port->cnts.num_cong_counters,
 						   port->cnts.offsets +
 						   port->cnts.num_q_counters);
+
+		mlx5_ib_put_native_port_mdev(dev, port_num);
 		if (ret)
 			return ret;
-		num_counters += port->cnts.num_cong_counters;
 	}
 
+done:
 	return num_counters;
 }
 
@@ -3936,67 +4303,306 @@ mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector)
 	return mlx5_get_vector_affinity(dev->mdev, comp_vector);
 }
 
-static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
+/* The mlx5_ib_multiport_mutex should be held when calling this function */
+static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
+				      struct mlx5_ib_multiport_info *mpi)
 {
-	struct mlx5_ib_dev *dev;
-	enum rdma_link_layer ll;
-	int port_type_cap;
-	const char *name;
+	u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
+	struct mlx5_ib_port *port = &ibdev->port[port_num];
+	int comps;
 	int err;
 	int i;
 
-	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
-	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+	mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
 
-	printk_once(KERN_INFO "%s", mlx5_version);
+	spin_lock(&port->mp.mpi_lock);
+	if (!mpi->ibdev) {
+		spin_unlock(&port->mp.mpi_lock);
+		return;
+	}
+	mpi->ibdev = NULL;
 
-	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
-	if (!dev)
-		return NULL;
+	spin_unlock(&port->mp.mpi_lock);
+	mlx5_remove_netdev_notifier(ibdev, port_num);
+	spin_lock(&port->mp.mpi_lock);
 
-	dev->mdev = mdev;
+	comps = mpi->mdev_refcnt;
+	if (comps) {
+		mpi->unaffiliate = true;
+		init_completion(&mpi->unref_comp);
+		spin_unlock(&port->mp.mpi_lock);
 
-	dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
-			    GFP_KERNEL);
-	if (!dev->port)
-		goto err_dealloc;
+		for (i = 0; i < comps; i++)
+			wait_for_completion(&mpi->unref_comp);
+
+		spin_lock(&port->mp.mpi_lock);
+		mpi->unaffiliate = false;
+	}
+
+	port->mp.mpi = NULL;
 
-	rwlock_init(&dev->roce.netdev_lock);
-	err = get_port_caps(dev);
+	list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
+
+	spin_unlock(&port->mp.mpi_lock);
+
+	err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
+
+	mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1);
+	/* Log an error, still needed to cleanup the pointers and add
+	 * it back to the list.
+	 */
 	if (err)
-		goto err_free_port;
+		mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
+			    port_num + 1);
 
-	if (mlx5_use_mad_ifc(dev))
-		get_ext_port_caps(dev);
+	ibdev->roce[port_num].last_port_state = IB_PORT_DOWN;
+}
 
-	if (!mlx5_lag_is_active(mdev))
-		name = "mlx5_%d";
-	else
-		name = "mlx5_bond_%d";
+/* The mlx5_ib_multiport_mutex should be held when calling this function */
+static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
+				    struct mlx5_ib_multiport_info *mpi)
+{
+	u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
+	int err;
 
-	strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX);
-	dev->ib_dev.owner		= THIS_MODULE;
-	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
-	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
-	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
-	dev->ib_dev.phys_port_cnt     = dev->num_ports;
-	dev->ib_dev.num_comp_vectors    =
-		dev->mdev->priv.eq_table.num_comp_vectors;
-	dev->ib_dev.dev.parent		= &mdev->pdev->dev;
+	spin_lock(&ibdev->port[port_num].mp.mpi_lock);
+	if (ibdev->port[port_num].mp.mpi) {
+		mlx5_ib_warn(ibdev, "port %d already affiliated.\n",
+			     port_num + 1);
+		spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
+		return false;
+	}
 
-	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
-	dev->ib_dev.uverbs_cmd_mask	=
-		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
-		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
-		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
-		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
-		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
-		(1ull << IB_USER_VERBS_CMD_CREATE_AH)		|
-		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)		|
-		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
-		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
-		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
-		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+	ibdev->port[port_num].mp.mpi = mpi;
+	mpi->ibdev = ibdev;
+	spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
+
+	err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
+	if (err)
+		goto unbind;
+
+	err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev));
+	if (err)
+		goto unbind;
+
+	err = mlx5_add_netdev_notifier(ibdev, port_num);
+	if (err) {
+		mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n",
+			    port_num + 1);
+		goto unbind;
+	}
+
+	err = mlx5_ib_init_cong_debugfs(ibdev, port_num);
+	if (err)
+		goto unbind;
+
+	return true;
+
+unbind:
+	mlx5_ib_unbind_slave_port(ibdev, mpi);
+	return false;
+}
+
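Unbinding is the delicate half of the pair: after clearing mpi->ibdev and removing the netdev notifier, mlx5_ib_unbind_slave_port() waits once per outstanding reference (comps) on unref_comp before letting the slave go. The put helper therefore has to signal the completion each time a holder drops its reference while unaffiliate is set; a hedged reconstruction of that side (the real helper is defined elsewhere in this patch):

	static void put_native_mdev_sketch(struct mlx5_ib_multiport_info *mpi,
					   spinlock_t *mpi_lock)
	{
		spin_lock(mpi_lock);
		mpi->mdev_refcnt--;
		if (mpi->unaffiliate)
			complete(&mpi->unref_comp);	/* one wakeup per holder */
		spin_unlock(mpi_lock);
	}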
+static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
+{
+	int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
+							  port_num + 1);
+	struct mlx5_ib_multiport_info *mpi;
+	int err;
+	int i;
+
+	if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
+		return 0;
+
+	err = mlx5_query_nic_vport_system_image_guid(dev->mdev,
+						     &dev->sys_image_guid);
+	if (err)
+		return err;
+
+	err = mlx5_nic_vport_enable_roce(dev->mdev);
+	if (err)
+		return err;
+
+	mutex_lock(&mlx5_ib_multiport_mutex);
+	for (i = 0; i < dev->num_ports; i++) {
+		bool bound = false;
+
+		/* build a stub multiport info struct for the native port. */
+		if (i == port_num) {
+			mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
+			if (!mpi) {
+				mutex_unlock(&mlx5_ib_multiport_mutex);
+				mlx5_nic_vport_disable_roce(dev->mdev);
+				return -ENOMEM;
+			}
+
+			mpi->is_master = true;
+			mpi->mdev = dev->mdev;
+			mpi->sys_image_guid = dev->sys_image_guid;
+			dev->port[i].mp.mpi = mpi;
+			mpi->ibdev = dev;
+			mpi = NULL;
+			continue;
+		}
+
+		list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
+				    list) {
+			if (dev->sys_image_guid == mpi->sys_image_guid &&
+			    (mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
+				bound = mlx5_ib_bind_slave_port(dev, mpi);
+			}
+
+			if (bound) {
+				dev_dbg(&mpi->mdev->pdev->dev, "removing port from unaffiliated list.\n");
+				mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
+				list_del(&mpi->list);
+				break;
+			}
+		}
+		if (!bound) {
+			get_port_caps(dev, i + 1);
+			mlx5_ib_dbg(dev, "no free port found for port %d\n",
+				    i + 1);
+		}
+	}
+
+	list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list);
+	mutex_unlock(&mlx5_ib_multiport_mutex);
+	return err;
+}
+
+static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
+{
+	int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
+							  port_num + 1);
+	int i;
+
+	if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
+		return;
+
+	mutex_lock(&mlx5_ib_multiport_mutex);
+	for (i = 0; i < dev->num_ports; i++) {
+		if (dev->port[i].mp.mpi) {
+			/* Destroy the native port stub */
+			if (i == port_num) {
+				kfree(dev->port[i].mp.mpi);
+				dev->port[i].mp.mpi = NULL;
+			} else {
+				mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1);
+				mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi);
+			}
+		}
+	}
+
+	mlx5_ib_dbg(dev, "removing from devlist\n");
+	list_del(&dev->ib_dev_list);
+	mutex_unlock(&mlx5_ib_multiport_mutex);
+
+	mlx5_nic_vport_disable_roce(dev->mdev);
+}
+
+static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
+{
+	mlx5_ib_cleanup_multiport_master(dev);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	cleanup_srcu_struct(&dev->mr_srcu);
+#endif
+	kfree(dev->port);
+}
+
+static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_core_dev *mdev = dev->mdev;
+	const char *name;
+	int err;
+	int i;
+
+	dev->port = kcalloc(dev->num_ports, sizeof(*dev->port),
+			    GFP_KERNEL);
+	if (!dev->port)
+		return -ENOMEM;
+
+	for (i = 0; i < dev->num_ports; i++) {
+		spin_lock_init(&dev->port[i].mp.mpi_lock);
+		rwlock_init(&dev->roce[i].netdev_lock);
+	}
+
+	err = mlx5_ib_init_multiport_master(dev);
+	if (err)
+		goto err_free_port;
+
+	if (!mlx5_core_mp_enabled(mdev)) {
+		int i;
+
+		for (i = 1; i <= dev->num_ports; i++) {
+			err = get_port_caps(dev, i);
+			if (err)
+				break;
+		}
+	} else {
+		err = get_port_caps(dev, mlx5_core_native_port_num(mdev));
+	}
+	if (err)
+		goto err_mp;
+
+	if (mlx5_use_mad_ifc(dev))
+		get_ext_port_caps(dev);
+
+	if (!mlx5_lag_is_active(mdev))
+		name = "mlx5_%d";
+	else
+		name = "mlx5_bond_%d";
+
+	strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX);
+	dev->ib_dev.owner		= THIS_MODULE;
+	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
+	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
+	dev->ib_dev.phys_port_cnt	= dev->num_ports;
+	dev->ib_dev.num_comp_vectors    =
+		dev->mdev->priv.eq_table.num_comp_vectors;
+	dev->ib_dev.dev.parent		= &mdev->pdev->dev;
+
+	mutex_init(&dev->flow_db.lock);
+	mutex_init(&dev->cap_mask_mutex);
+	INIT_LIST_HEAD(&dev->qp_list);
+	spin_lock_init(&dev->reset_flow_resource_lock);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	err = init_srcu_struct(&dev->mr_srcu);
+	if (err)
+		goto err_free_port;
+#endif
+
+	return 0;
+err_mp:
+	mlx5_ib_cleanup_multiport_master(dev);
+
+err_free_port:
+	kfree(dev->port);
+
+	return -ENOMEM;
+}
+
+static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_core_dev *mdev = dev->mdev;
+	int err;
+
+	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
+	dev->ib_dev.uverbs_cmd_mask	=
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_AH)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
 		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
 		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
 		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
@@ -4022,8 +4628,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	dev->ib_dev.query_device	= mlx5_ib_query_device;
 	dev->ib_dev.query_port		= mlx5_ib_query_port;
 	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
-	if (ll == IB_LINK_LAYER_ETHERNET)
-		dev->ib_dev.get_netdev	= mlx5_ib_get_netdev;
 	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
 	dev->ib_dev.add_gid		= mlx5_ib_add_gid;
 	dev->ib_dev.del_gid		= mlx5_ib_del_gid;
@@ -4080,8 +4684,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 
 	dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
 
-	mlx5_ib_internal_fill_odp_caps(dev);
-
 	dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
 
 	if (MLX5_CAP_GEN(mdev, imaicl)) {
@@ -4092,11 +4694,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
 	}
 
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
-		dev->ib_dev.get_hw_stats	= mlx5_ib_get_hw_stats;
-		dev->ib_dev.alloc_hw_stats	= mlx5_ib_alloc_hw_stats;
-	}
-
 	if (MLX5_CAP_GEN(mdev, xrc)) {
 		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
 		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@@ -4111,8 +4708,39 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
 
-	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
-	    IB_LINK_LAYER_ETHERNET) {
+	err = init_node_data(dev);
+	if (err)
+		return err;
+
+	if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
+	    (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
+	     MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
+		mutex_init(&dev->lb_mutex);
+
+	return 0;
+}
+
+static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_core_dev *mdev = dev->mdev;
+	enum rdma_link_layer ll;
+	int port_type_cap;
+	u8 port_num;
+	int err;
+	int i;
+
+	port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
+	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+
+	if (ll == IB_LINK_LAYER_ETHERNET) {
+		for (i = 0; i < dev->num_ports; i++) {
+			dev->roce[i].dev = dev;
+			dev->roce[i].native_port_num = i + 1;
+			dev->roce[i].last_port_state = IB_PORT_DOWN;
+		}
+
+		dev->ib_dev.get_netdev	= mlx5_ib_get_netdev;
 		dev->ib_dev.create_wq	 = mlx5_ib_create_wq;
 		dev->ib_dev.modify_wq	 = mlx5_ib_modify_wq;
 		dev->ib_dev.destroy_wq	 = mlx5_ib_destroy_wq;
@@ -4124,143 +4752,329 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
+		err = mlx5_enable_eth(dev, port_num);
+		if (err)
+			return err;
 	}
-	err = init_node_data(dev);
-	if (err)
-		goto err_free_port;
 
-	mutex_init(&dev->flow_db.lock);
-	mutex_init(&dev->cap_mask_mutex);
-	INIT_LIST_HEAD(&dev->qp_list);
-	spin_lock_init(&dev->reset_flow_resource_lock);
+	return 0;
+}
+
+static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_core_dev *mdev = dev->mdev;
+	enum rdma_link_layer ll;
+	int port_type_cap;
+	u8 port_num;
+
+	port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
+	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 
 	if (ll == IB_LINK_LAYER_ETHERNET) {
-		err = mlx5_enable_eth(dev);
-		if (err)
-			goto err_free_port;
-		dev->roce.last_port_state = IB_PORT_DOWN;
+		mlx5_disable_eth(dev);
+		mlx5_remove_netdev_notifier(dev, port_num);
 	}
+}
 
-	err = create_dev_resources(&dev->devr);
-	if (err)
-		goto err_disable_eth;
+static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
+{
+	return create_dev_resources(&dev->devr);
+}
 
-	err = mlx5_ib_odp_init_one(dev);
-	if (err)
-		goto err_rsrc;
+static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
+{
+	destroy_dev_resources(&dev->devr);
+}
+
+static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
+{
+	mlx5_ib_internal_fill_odp_caps(dev);
+
+	return mlx5_ib_odp_init_one(dev);
+}
 
+static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
+{
 	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
-		err = mlx5_ib_alloc_counters(dev);
-		if (err)
-			goto err_odp;
+		dev->ib_dev.get_hw_stats	= mlx5_ib_get_hw_stats;
+		dev->ib_dev.alloc_hw_stats	= mlx5_ib_alloc_hw_stats;
+
+		return mlx5_ib_alloc_counters(dev);
 	}
 
-	err = mlx5_ib_init_cong_debugfs(dev);
-	if (err)
-		goto err_cnt;
+	return 0;
+}
+
+static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
+{
+	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
+		mlx5_ib_dealloc_counters(dev);
+}
+
+static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
+{
+	return mlx5_ib_init_cong_debugfs(dev,
+					 mlx5_core_native_port_num(dev->mdev) - 1);
+}
+
+static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
+{
+	mlx5_ib_cleanup_cong_debugfs(dev,
+				     mlx5_core_native_port_num(dev->mdev) - 1);
+}
 
+static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
+{
 	dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
-	if (IS_ERR(dev->mdev->priv.uar))
-		goto err_cong;
+	if (!dev->mdev->priv.uar)
+		return -ENOMEM;
+	return 0;
+}
+
+static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
+{
+	mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
+}
+
+static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
+{
+	int err;
 
 	err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
 	if (err)
-		goto err_uar_page;
+		return err;
 
 	err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
 	if (err)
-		goto err_bfreg;
+		mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
 
-	err = ib_register_device(&dev->ib_dev, NULL);
-	if (err)
-		goto err_fp_bfreg;
+	return err;
+}
 
-	err = create_umr_res(dev);
-	if (err)
-		goto err_dev;
+static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
+{
+	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
+	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
+}
 
+static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
+{
+	return ib_register_device(&dev->ib_dev, NULL);
+}
+
+static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
+{
+	ib_unregister_device(&dev->ib_dev);
+}
+
+static int mlx5_ib_stage_umr_res_init(struct mlx5_ib_dev *dev)
+{
+	return create_umr_res(dev);
+}
+
+static void mlx5_ib_stage_umr_res_cleanup(struct mlx5_ib_dev *dev)
+{
+	destroy_umrc_res(dev);
+}
+
+static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
+{
 	init_delay_drop(dev);
 
+	return 0;
+}
+
+static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
+{
+	cancel_delay_drop(dev);
+}
+
+static int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev)
+{
+	int err;
+	int i;
+
 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
 		err = device_create_file(&dev->ib_dev.dev,
 					 mlx5_class_attributes[i]);
 		if (err)
-			goto err_delay_drop;
+			return err;
 	}
 
-	if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
-	    (MLX5_CAP_GEN(mdev, disable_local_lb_uc) ||
-	     MLX5_CAP_GEN(mdev, disable_local_lb_mc)))
-		mutex_init(&dev->lb_mutex);
+	return 0;
+}
+
+static void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
+			     const struct mlx5_ib_profile *profile,
+			     int stage)
+{
+	/* Number of stages to cleanup */
+	while (stage) {
+		stage--;
+		if (profile->stage[stage].cleanup)
+			profile->stage[stage].cleanup(dev);
+	}
+
+	ib_dealloc_device((struct ib_device *)dev);
+}
+
+static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num);
 
+static void *__mlx5_ib_add(struct mlx5_core_dev *mdev,
+			   const struct mlx5_ib_profile *profile)
+{
+	struct mlx5_ib_dev *dev;
+	int err;
+	int i;
+
+	printk_once(KERN_INFO "%s", mlx5_version);
+
+	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
+	if (!dev)
+		return NULL;
+
+	dev->mdev = mdev;
+	dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
+			     MLX5_CAP_GEN(mdev, num_vhca_ports));
+
+	for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
+		if (profile->stage[i].init) {
+			err = profile->stage[i].init(dev);
+			if (err)
+				goto err_out;
+		}
+	}
+
+	dev->profile = profile;
 	dev->ib_active = true;
 
 	return dev;
 
-err_delay_drop:
-	cancel_delay_drop(dev);
-	destroy_umrc_res(dev);
+err_out:
+	__mlx5_ib_remove(dev, profile, i);
 
-err_dev:
-	ib_unregister_device(&dev->ib_dev);
+	return NULL;
+}
 
-err_fp_bfreg:
-	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
+static const struct mlx5_ib_profile pf_profile = {
+	STAGE_CREATE(MLX5_IB_STAGE_INIT,
+		     mlx5_ib_stage_init_init,
+		     mlx5_ib_stage_init_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
+		     mlx5_ib_stage_caps_init,
+		     NULL),
+	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
+		     mlx5_ib_stage_roce_init,
+		     mlx5_ib_stage_roce_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
+		     mlx5_ib_stage_dev_res_init,
+		     mlx5_ib_stage_dev_res_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_ODP,
+		     mlx5_ib_stage_odp_init,
+		     NULL),
+	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
+		     mlx5_ib_stage_counters_init,
+		     mlx5_ib_stage_counters_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
+		     mlx5_ib_stage_cong_debugfs_init,
+		     mlx5_ib_stage_cong_debugfs_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_UAR,
+		     mlx5_ib_stage_uar_init,
+		     mlx5_ib_stage_uar_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_BFREG,
+		     mlx5_ib_stage_bfrag_init,
+		     mlx5_ib_stage_bfrag_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
+		     mlx5_ib_stage_ib_reg_init,
+		     mlx5_ib_stage_ib_reg_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_UMR_RESOURCES,
+		     mlx5_ib_stage_umr_res_init,
+		     mlx5_ib_stage_umr_res_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
+		     mlx5_ib_stage_delay_drop_init,
+		     mlx5_ib_stage_delay_drop_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR,
+		     mlx5_ib_stage_class_attr_init,
+		     NULL),
+};

-err_bfreg:
-	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
+static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num)
+{
+	struct mlx5_ib_multiport_info *mpi;
+	struct mlx5_ib_dev *dev;
+	bool bound = false;
+	int err;

-err_uar_page:
-	mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
+	mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
+	if (!mpi)
+		return NULL;

-err_cong:
-	mlx5_ib_cleanup_cong_debugfs(dev);
-err_cnt:
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
-		mlx5_ib_dealloc_counters(dev);
+	mpi->mdev = mdev;

-err_odp:
-	mlx5_ib_odp_remove_one(dev);
+	err = mlx5_query_nic_vport_system_image_guid(mdev,
+						     &mpi->sys_image_guid);
+	if (err) {
+		kfree(mpi);
+		return NULL;
+	}

-err_rsrc:
-	destroy_dev_resources(&dev->devr);
+	mutex_lock(&mlx5_ib_multiport_mutex);
+	list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
+		if (dev->sys_image_guid == mpi->sys_image_guid)
+			bound = mlx5_ib_bind_slave_port(dev, mpi);

-err_disable_eth:
-	if (ll == IB_LINK_LAYER_ETHERNET) {
-		mlx5_disable_eth(dev);
-		mlx5_remove_netdev_notifier(dev);
+		if (bound) {
+			rdma_roce_rescan_device(&dev->ib_dev);
+			break;
+		}
 	}

-err_free_port:
-	kfree(dev->port);
+	if (!bound) {
+		list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
+		dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n");
+	} else {
+		mlx5_ib_dbg(dev, "bound port %u\n", port_num + 1);
+	}
+	mutex_unlock(&mlx5_ib_multiport_mutex);

-err_dealloc:
-	ib_dealloc_device((struct ib_device *)dev);
+	return mpi;
+}

-	return NULL;
+static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
+{
+	enum rdma_link_layer ll;
+	int port_type_cap;
+
+	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
+	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
+
+	if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) {
+		u8 port_num = mlx5_core_native_port_num(mdev) - 1;
+
+		return mlx5_ib_add_slave_port(mdev, port_num);
+	}
+
+	return __mlx5_ib_add(mdev, &pf_profile);
 }

 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 {
-	struct mlx5_ib_dev *dev = context;
-	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
+	struct mlx5_ib_multiport_info *mpi;
+	struct mlx5_ib_dev *dev;

-	cancel_delay_drop(dev);
-	mlx5_remove_netdev_notifier(dev);
-	ib_unregister_device(&dev->ib_dev);
-	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
-	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
-	mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
-	mlx5_ib_cleanup_cong_debugfs(dev);
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
-		mlx5_ib_dealloc_counters(dev);
-	destroy_umrc_res(dev);
-	mlx5_ib_odp_remove_one(dev);
-	destroy_dev_resources(&dev->devr);
-	if (ll == IB_LINK_LAYER_ETHERNET)
-		mlx5_disable_eth(dev);
-	kfree(dev->port);
-	ib_dealloc_device(&dev->ib_dev);
+	if (mlx5_core_is_mp_slave(mdev)) {
+		mpi = context;
+		mutex_lock(&mlx5_ib_multiport_mutex);
+		if (mpi->ibdev)
+			mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
+		list_del(&mpi->list);
+		mutex_unlock(&mlx5_ib_multiport_mutex);
+		return;
+	}
+
+	dev = context;
+	__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
 }

 static struct mlx5_interface mlx5_ib_interface = {
@@ -4277,6 +5091,10 @@ static int __init mlx5_ib_init(void)
 {
 	int err;

+	mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
+	if (!mlx5_ib_event_wq)
+		return -ENOMEM;
+
 	mlx5_ib_odp_init();

 	err = mlx5_register_interface(&mlx5_ib_interface);
@@ -4287,6 +5105,7 @@ static int __init mlx5_ib_init(void)
 static void __exit mlx5_ib_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5_ib_interface);
+	destroy_workqueue(mlx5_ib_event_wq);
 }

 module_init(mlx5_ib_init);

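The pf_profile table and the __mlx5_ib_add()/__mlx5_ib_remove() pair above replace the old single-path probe with a staged-initialization pattern: init hooks run in array order, and cleanup hooks run in reverse, starting from the last stage that completed. Below is a minimal, self-contained userspace sketch of the same pattern; the stage contents and names are illustrative, not part of the driver.

#include <stdio.h>

struct dev { int a, b; };

struct stage {
	int  (*init)(struct dev *d);
	void (*cleanup)(struct dev *d);	/* NULL when there is nothing to undo */
};

static int  init_a(struct dev *d)    { d->a = 1; return 0; }
static void cleanup_a(struct dev *d) { d->a = 0; }
static int  init_b(struct dev *d)    { d->b = 1; return 0; }

static const struct stage profile[] = {
	{ init_a, cleanup_a },
	{ init_b, NULL },		/* stateless stage: no cleanup hook */
};

/* Mirrors __mlx5_ib_remove(): unwind the first 'stage' entries in reverse. */
static void remove_dev(struct dev *d, const struct stage *prof, int stage)
{
	while (stage) {
		stage--;
		if (prof[stage].cleanup)
			prof[stage].cleanup(d);
	}
}

/* Mirrors __mlx5_ib_add(): run init hooks in order, unwinding on failure. */
static int add_dev(struct dev *d, const struct stage *prof, int nstages)
{
	int i;

	for (i = 0; i < nstages; i++) {
		if (prof[i].init && prof[i].init(d)) {
			remove_dev(d, prof, i);	/* i, not i + 1 */
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	struct dev d = { 0, 0 };

	if (!add_dev(&d, profile, 2))
		remove_dev(&d, profile, 2);
	printf("a=%d b=%d\n", d.a, d.b);	/* 0 0: fully unwound */
	return 0;
}

Note that on failure __mlx5_ib_add() passes i (not i + 1) to __mlx5_ib_remove(): a failing stage is expected to have undone its own partial work, so only the stages before it are unwound.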
+ 91 - 20
drivers/infiniband/hw/mlx5/mlx5_ib.h

@@ -70,15 +70,6 @@ enum {
 	MLX5_IB_MMAP_CMD_MASK	= 0xff,
 };

-enum mlx5_ib_mmap_cmd {
-	MLX5_IB_MMAP_REGULAR_PAGE		= 0,
-	MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES	= 1,
-	MLX5_IB_MMAP_WC_PAGE			= 2,
-	MLX5_IB_MMAP_NC_PAGE			= 3,
-	/* 5 is chosen in order to be compatible with old versions of libmlx5 */
-	MLX5_IB_MMAP_CORE_CLOCK			= 5,
-};
-
 enum {
 	MLX5_RES_SCAT_DATA32_CQE	= 0x1,
 	MLX5_RES_SCAT_DATA64_CQE	= 0x2,
@@ -112,6 +103,11 @@ enum {
 	MLX5_TM_MAX_SGE			= 1,
 };

+enum {
+	MLX5_IB_INVALID_UAR_INDEX	= BIT(31),
+	MLX5_IB_INVALID_BFREG		= BIT(31),
+};
+
 struct mlx5_ib_vma_private_data {
 	struct list_head list;
 	struct vm_area_struct *vma;
@@ -200,6 +196,8 @@ struct mlx5_ib_flow_db {
  * creates the actual hardware QP.
  */
 #define MLX5_IB_QPT_HW_GSI	IB_QPT_RESERVED2
+#define MLX5_IB_QPT_DCI		IB_QPT_RESERVED3
+#define MLX5_IB_QPT_DCT		IB_QPT_RESERVED4
 #define MLX5_IB_WR_UMR		IB_WR_RESERVED1

 #define MLX5_IB_UMR_OCTOWORD	       16
@@ -360,12 +358,18 @@ struct mlx5_bf {
 	struct mlx5_sq_bfreg   *bfreg;
 };

+struct mlx5_ib_dct {
+	struct mlx5_core_dct    mdct;
+	u32                     *in;
+};
+
 struct mlx5_ib_qp {
 	struct ib_qp		ibqp;
 	union {
 		struct mlx5_ib_qp_trans trans_qp;
 		struct mlx5_ib_raw_packet_qp raw_packet_qp;
 		struct mlx5_ib_rss_qp rss_qp;
+		struct mlx5_ib_dct dct;
 	};
 	struct mlx5_buf		buf;

@@ -404,6 +408,8 @@ struct mlx5_ib_qp {
 	u32			rate_limit;
 	u32                     underlay_qpn;
 	bool			tunnel_offload_en;
+	/* storage for qp sub type when core qp type is IB_QPT_DRIVER */
+	enum ib_qp_type		qp_sub_type;
 };

 struct mlx5_ib_cq_buf {
@@ -636,10 +642,21 @@ struct mlx5_ib_counters {
 	u32 num_q_counters;
 	u32 num_cong_counters;
 	u16 set_id;
+	bool set_id_valid;
+};
+
+struct mlx5_ib_multiport_info;
+
+struct mlx5_ib_multiport {
+	struct mlx5_ib_multiport_info *mpi;
+	/* To be held when accessing the multiport info */
+	spinlock_t mpi_lock;
 };

 struct mlx5_ib_port {
 	struct mlx5_ib_counters cnts;
+	struct mlx5_ib_multiport mp;
+	struct mlx5_ib_dbg_cc_params	*dbg_cc_params;
 };

 struct mlx5_roce {
@@ -651,12 +668,15 @@ struct mlx5_roce {
 	struct notifier_block	nb;
 	atomic_t		next_port;
 	enum ib_port_state last_port_state;
+	struct mlx5_ib_dev	*dev;
+	u8			native_port_num;
 };

 struct mlx5_ib_dbg_param {
 	int			offset;
 	struct mlx5_ib_dev	*dev;
 	struct dentry		*dentry;
+	u8			port_num;
 };

 enum mlx5_ib_dbg_cc_types {
@@ -709,10 +729,50 @@ struct mlx5_ib_delay_drop {
 	struct mlx5_ib_dbg_delay_drop *dbg;
 };

+enum mlx5_ib_stages {
+	MLX5_IB_STAGE_INIT,
+	MLX5_IB_STAGE_CAPS,
+	MLX5_IB_STAGE_ROCE,
+	MLX5_IB_STAGE_DEVICE_RESOURCES,
+	MLX5_IB_STAGE_ODP,
+	MLX5_IB_STAGE_COUNTERS,
+	MLX5_IB_STAGE_CONG_DEBUGFS,
+	MLX5_IB_STAGE_UAR,
+	MLX5_IB_STAGE_BFREG,
+	MLX5_IB_STAGE_IB_REG,
+	MLX5_IB_STAGE_UMR_RESOURCES,
+	MLX5_IB_STAGE_DELAY_DROP,
+	MLX5_IB_STAGE_CLASS_ATTR,
+	MLX5_IB_STAGE_MAX,
+};
+
+struct mlx5_ib_stage {
+	int (*init)(struct mlx5_ib_dev *dev);
+	void (*cleanup)(struct mlx5_ib_dev *dev);
+};
+
+#define STAGE_CREATE(_stage, _init, _cleanup) \
+	.stage[_stage] = {.init = _init, .cleanup = _cleanup}
+
+struct mlx5_ib_profile {
+	struct mlx5_ib_stage stage[MLX5_IB_STAGE_MAX];
+};
+
+struct mlx5_ib_multiport_info {
+	struct list_head list;
+	struct mlx5_ib_dev *ibdev;
+	struct mlx5_core_dev *mdev;
+	struct completion unref_comp;
+	u64 sys_image_guid;
+	u32 mdev_refcnt;
+	bool is_master;
+	bool unaffiliate;
+};
+
 struct mlx5_ib_dev {
 	struct ib_device		ib_dev;
 	struct mlx5_core_dev		*mdev;
-	struct mlx5_roce		roce;
+	struct mlx5_roce		roce[MLX5_MAX_PORTS];
 	int				num_ports;
 	/* serialize update of capability mask
 	 */
@@ -746,12 +806,14 @@ struct mlx5_ib_dev {
 	struct mlx5_sq_bfreg	bfreg;
 	struct mlx5_sq_bfreg	fp_bfreg;
 	struct mlx5_ib_delay_drop	delay_drop;
-	struct mlx5_ib_dbg_cc_params	*dbg_cc_params;
+	const struct mlx5_ib_profile	*profile;

 	/* protect the user_td */
 	struct mutex		lb_mutex;
 	u32			user_td;
 	u8			umr_fence;
+	struct list_head	ib_dev_list;
+	u64			sys_image_guid;
 };

 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -956,13 +1018,14 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
 						      struct ib_rwq_ind_table_init_attr *init_attr,
 						      struct ib_udata *udata);
 int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
+bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev);
+

 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
 void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
 		    struct mlx5_pagefault *pfault);
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
-void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
@@ -977,7 +1040,6 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 }

 static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
-static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)	    {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
 static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
@@ -1001,8 +1063,8 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
 int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
 			   int index, enum ib_gid_type *gid_type);

-void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev);
-int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev);
+void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
+int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);

 /* GSI QP helper functions */
 struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd,
@@ -1021,6 +1083,15 @@ void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi);

 int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc);

+void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi,
+			int bfregn);
+struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi);
+struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev,
+						   u8 ib_port_num,
+						   u8 *native_port_num);
+void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev,
+				  u8 port_num);
+
 static inline void init_query_mad(struct ib_smp *mad)
 {
 	mad->base_version  = 1;
@@ -1052,8 +1123,8 @@ static inline u32 check_cq_create_flags(u32 flags)
 	 * It returns non-zero value for unsupported CQ
 	 * create flags, otherwise it returns zero.
 	 */
-	return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN |
-			  IB_CQ_FLAGS_TIMESTAMP_COMPLETION));
+	return (flags & ~(IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN |
+			  IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION));
 }

 static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx,
@@ -1113,10 +1184,10 @@ static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_suppor
 				MLX5_UARS_IN_PAGE : 1;
 }

-static inline int get_num_uars(struct mlx5_ib_dev *dev,
-			       struct mlx5_bfreg_info *bfregi)
+static inline int get_num_static_uars(struct mlx5_ib_dev *dev,
+				      struct mlx5_bfreg_info *bfregi)
 {
-	return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_sys_pages;
+	return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages;
 }

 #endif /* MLX5_IB_H */

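STAGE_CREATE() in the header above builds the profile table with C99 designated initializers, so each entry is keyed by its enum mlx5_ib_stages value and any stage a profile leaves out is zero-initialized (both hooks NULL), which the profile walker then skips. A compilable sketch of the expansion, with illustrative names in place of the driver's:

/* Sketch only; mirrors the STAGE_CREATE() shape with made-up names. */
enum { STAGE_FIRST, STAGE_SECOND, STAGE_MAX };

struct hooks {
	int  (*init)(void);
	void (*cleanup)(void);
};

struct profile {
	struct hooks stage[STAGE_MAX];
};

#define MAKE_STAGE(_stage, _init, _cleanup) \
	.stage[_stage] = { .init = _init, .cleanup = _cleanup }

static int  first_init(void)    { return 0; }
static void first_cleanup(void) { }

static const struct profile pf = {
	MAKE_STAGE(STAGE_FIRST, first_init, first_cleanup),
	/* STAGE_SECOND is omitted on purpose: its slot stays all-NULL. */
};

Keying entries by enum value rather than position means new stages can be inserted into the enum without re-ordering every profile that uses the macro.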
+ 3 - 0
drivers/infiniband/hw/mlx5/mr.c

@@ -1206,6 +1206,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	int err;
 	bool use_umr = true;

+	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+		return ERR_PTR(-EINVAL);
+
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);


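The mlx5_ib_reg_user_mr() hunk above rejects user MR registration outright when CONFIG_INFINIBAND_USER_MEM is compiled out; because IS_ENABLED() collapses to a compile-time 0 or 1, the guard and the code it dominates can be discarded entirely by the optimizer. A simplified sketch of that style of gating (the kernel's real macro also understands the '=m' tristate, which is omitted here):

/* Simplified IS_ENABLED-style gating; illustrative, not the kernel macro. */
#include <stdio.h>

#define CONFIG_FEATURE 1		/* flip to 0 to compile the path out */
#define IS_ENABLED(x) (x)

static int register_user_memory(void)
{
	if (!IS_ENABLED(CONFIG_FEATURE))
		return -1;		/* dead code when the option is off */

	return 0;			/* real registration work would go here */
}

int main(void)
{
	printf("%d\n", register_user_memory());
	return 0;
}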
+ 0 - 9
drivers/infiniband/hw/mlx5/odp.c

@@ -1207,10 +1207,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
 	int ret;

-	ret = init_srcu_struct(&dev->mr_srcu);
-	if (ret)
-		return ret;
-
 	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
 		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
 		if (ret) {
@@ -1222,11 +1218,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 	return 0;
 }

-void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev)
-{
-	cleanup_srcu_struct(&dev->mr_srcu);
-}
-
 int mlx5_ib_odp_init(void)
 {
 	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -

+ 405 - 27
drivers/infiniband/hw/mlx5/qp.c

@@ -493,7 +493,7 @@ enum {

 static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi)
 {
-	return get_num_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR;
+	return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR;
 }

 static int num_med_bfreg(struct mlx5_ib_dev *dev,
@@ -581,7 +581,7 @@ static int alloc_bfreg(struct mlx5_ib_dev *dev,
 	return bfregn;
 }

-static void free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn)
+void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn)
 {
 	mutex_lock(&bfregi->lock);
 	bfregi->count[bfregn]--;
@@ -613,6 +613,7 @@ static int to_mlx5_st(enum ib_qp_type type)
 	case IB_QPT_XRC_TGT:		return MLX5_QP_ST_XRC;
 	case IB_QPT_SMI:		return MLX5_QP_ST_QP0;
 	case MLX5_IB_QPT_HW_GSI:	return MLX5_QP_ST_QP1;
+	case MLX5_IB_QPT_DCI:		return MLX5_QP_ST_DCI;
 	case IB_QPT_RAW_IPV6:		return MLX5_QP_ST_RAW_IPV6;
 	case IB_QPT_RAW_PACKET:
 	case IB_QPT_RAW_ETHERTYPE:	return MLX5_QP_ST_RAW_ETHERTYPE;
@@ -627,7 +628,8 @@ static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq,
 			       struct mlx5_ib_cq *recv_cq);

 static int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
-			       struct mlx5_bfreg_info *bfregi, int bfregn)
+			       struct mlx5_bfreg_info *bfregi, int bfregn,
+			       bool dyn_bfreg)
 {
 	int bfregs_per_sys_page;
 	int index_of_sys_page;
@@ -637,8 +639,16 @@ static int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 				MLX5_NON_FP_BFREGS_PER_UAR;
 	index_of_sys_page = bfregn / bfregs_per_sys_page;

-	offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR;
+	if (dyn_bfreg) {
+		index_of_sys_page += bfregi->num_static_sys_pages;
+		if (bfregn > bfregi->num_dyn_bfregs ||
+		    bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) {
+			mlx5_ib_dbg(dev, "Invalid dynamic uar index\n");
+			return -EINVAL;
+		}
+	}

+	offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR;
 	return bfregi->sys_pages[index_of_sys_page] + offset;
 }

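The dynamic-bfreg branch above only offsets index_of_sys_page past the statically mapped pages; the index arithmetic itself is unchanged. A standalone worked example of that arithmetic, with made-up sizes standing in for the driver's MLX5_* constants:

/* Sketch of the bfregn -> UAR index math; constants are illustrative. */
#include <stdio.h>

#define UARS_PER_SYS_PAGE	4
#define NON_FP_BFREGS_PER_UAR	4

static int bfregn_to_uar_index(const int *sys_pages, int bfregn)
{
	int bfregs_per_sys_page = UARS_PER_SYS_PAGE * NON_FP_BFREGS_PER_UAR;
	int index_of_sys_page = bfregn / bfregs_per_sys_page;
	/* Which UAR inside that system page the bfreg falls into: */
	int offset = bfregn % bfregs_per_sys_page / NON_FP_BFREGS_PER_UAR;

	return sys_pages[index_of_sys_page] + offset;
}

int main(void)
{
	int sys_pages[] = { 100, 200 };	/* hardware UAR index of each page */

	/* bfregn 21: page 21/16 = 1, UAR (21%16)/4 = 1 -> 200 + 1 = 201 */
	printf("%d\n", bfregn_to_uar_index(sys_pages, 21));
	return 0;
}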
@@ -764,7 +774,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	struct mlx5_ib_create_qp ucmd;
 	struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer;
 	int page_shift = 0;
-	int uar_index;
+	int uar_index = 0;
 	int npages;
 	u32 offset = 0;
 	int bfregn;
@@ -780,12 +790,20 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	}

 	context = to_mucontext(pd->uobject->context);
-	/*
-	 * TBD: should come from the verbs when we have the API
-	 */
-	if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+	if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
+		uar_index = bfregn_to_uar_index(dev, &context->bfregi,
+						ucmd.bfreg_index, true);
+		if (uar_index < 0)
+			return uar_index;
+
+		bfregn = MLX5_IB_INVALID_BFREG;
+	} else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) {
+		/*
+		 * TBD: should come from the verbs when we have the API
+		 */
 		/* In CROSS_CHANNEL CQ and QP must use the same UAR */
 		bfregn = MLX5_CROSS_CHANNEL_BFREG;
+	}
 	else {
 		bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH);
 		if (bfregn < 0) {
@@ -804,8 +822,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		}
 	}

-	uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn);
 	mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index);
+	if (bfregn != MLX5_IB_INVALID_BFREG)
+		uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn,
+						false);

 	qp->rq.offset = 0;
 	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
@@ -845,7 +865,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	MLX5_SET(qpc, qpc, page_offset, offset);

 	MLX5_SET(qpc, qpc, uar_page, uar_index);
-	resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn);
+	if (bfregn != MLX5_IB_INVALID_BFREG)
+		resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn);
+	else
+		resp->bfreg_index = MLX5_IB_INVALID_BFREG;
 	qp->bfregn = bfregn;

 	err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
@@ -874,7 +897,8 @@ err_umem:
 		ib_umem_release(ubuffer->umem);

 err_bfreg:
-	free_bfreg(dev, &context->bfregi, bfregn);
+	if (bfregn != MLX5_IB_INVALID_BFREG)
+		mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn);
 	return err;
 }

@@ -887,7 +911,13 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	mlx5_ib_db_unmap_user(context, &qp->db);
 	if (base->ubuffer.umem)
 		ib_umem_release(base->ubuffer.umem);
-	free_bfreg(dev, &context->bfregi, qp->bfregn);
+
+	/*
+	 * Free only the BFREGs which are handled by the kernel.
+	 * BFREGs of UARs allocated dynamically are handled by user.
+	 */
+	if (qp->bfregn != MLX5_IB_INVALID_BFREG)
+		mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn);
 }

 static int create_kernel_qp(struct mlx5_ib_dev *dev,
@@ -1015,6 +1045,7 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
 {
 	if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) ||
+	    (attr->qp_type == MLX5_IB_QPT_DCI) ||
 	    (attr->qp_type == IB_QPT_XRC_INI))
 		return MLX5_SRQ_RQ;
 	else if (!qp->has_rq)
@@ -2086,20 +2117,108 @@ static const char *ib_qp_type_str(enum ib_qp_type type)
 		return "IB_QPT_RAW_PACKET";
 	case MLX5_IB_QPT_REG_UMR:
 		return "MLX5_IB_QPT_REG_UMR";
+	case IB_QPT_DRIVER:
+		return "IB_QPT_DRIVER";
 	case IB_QPT_MAX:
 	default:
 		return "Invalid QP type";
 	}
 }

+static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
+					struct ib_qp_init_attr *attr,
+					struct mlx5_ib_create_qp *ucmd)
+{
+	struct mlx5_ib_dev *dev;
+	struct mlx5_ib_qp *qp;
+	int err = 0;
+	u32 uidx = MLX5_IB_DEFAULT_UIDX;
+	void *dctc;
+
+	if (!attr->srq || !attr->recv_cq)
+		return ERR_PTR(-EINVAL);
+
+	dev = to_mdev(pd->device);
+
+	err = get_qp_user_index(to_mucontext(pd->uobject->context),
+				ucmd, sizeof(*ucmd), &uidx);
+	if (err)
+		return ERR_PTR(err);
+
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp)
+		return ERR_PTR(-ENOMEM);
+
+	qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL);
+	if (!qp->dct.in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry);
+	qp->qp_sub_type = MLX5_IB_QPT_DCT;
+	MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn);
+	MLX5_SET(dctc, dctc, srqn_xrqn, to_msrq(attr->srq)->msrq.srqn);
+	MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn);
+	MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key);
+	MLX5_SET(dctc, dctc, user_index, uidx);
+
+	qp->state = IB_QPS_RESET;
+
+	return &qp->ibqp;
+err_free:
+	kfree(qp);
+	return ERR_PTR(err);
+}
+
+static int set_mlx_qp_type(struct mlx5_ib_dev *dev,
+			   struct ib_qp_init_attr *init_attr,
+			   struct mlx5_ib_create_qp *ucmd,
+			   struct ib_udata *udata)
+{
+	enum { MLX_QP_FLAGS = MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI };
+	int err;
+
+	if (!udata)
+		return -EINVAL;
+
+	if (udata->inlen < sizeof(*ucmd)) {
+		mlx5_ib_dbg(dev, "create_qp user command is smaller than expected\n");
+		return -EINVAL;
+	}
+	err = ib_copy_from_udata(ucmd, udata, sizeof(*ucmd));
+	if (err)
+		return err;
+
+	if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCI) {
+		init_attr->qp_type = MLX5_IB_QPT_DCI;
+	} else {
+		if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCT) {
+			init_attr->qp_type = MLX5_IB_QPT_DCT;
+		} else {
+			mlx5_ib_dbg(dev, "Invalid QP flags\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!MLX5_CAP_GEN(dev->mdev, dct)) {
+		mlx5_ib_dbg(dev, "DC transport is not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
-				struct ib_qp_init_attr *init_attr,
+				struct ib_qp_init_attr *verbs_init_attr,
 				struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev;
 	struct mlx5_ib_qp *qp;
 	u16 xrcdn = 0;
 	int err;
+	struct ib_qp_init_attr mlx_init_attr;
+	struct ib_qp_init_attr *init_attr = verbs_init_attr;

 	if (pd) {
 		dev = to_mdev(pd->device);
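set_mlx_qp_type() in the hunk above treats MLX5_QP_FLAG_TYPE_DCT and MLX5_QP_FLAG_TYPE_DCI as mutually exclusive: it masks the user's command flags down to those two bits and requires the result to equal exactly one of them, so "both set" and "neither set" are each rejected. A minimal sketch of that decode, with illustrative flag values:

/* Sketch of the mutually-exclusive flag decode; bit values illustrative. */
#include <stdio.h>

#define FLAG_TYPE_DCI 0x1
#define FLAG_TYPE_DCT 0x2

static int decode_sub_type(unsigned int flags)
{
	unsigned int type = flags & (FLAG_TYPE_DCI | FLAG_TYPE_DCT);

	if (type == FLAG_TYPE_DCI)
		return 1;	/* DCI */
	if (type == FLAG_TYPE_DCT)
		return 2;	/* DCT */
	return -1;		/* neither, or both: invalid */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       decode_sub_type(FLAG_TYPE_DCI),			/* 1 */
	       decode_sub_type(FLAG_TYPE_DCT),			/* 2 */
	       decode_sub_type(0),				/* -1 */
	       decode_sub_type(FLAG_TYPE_DCI | FLAG_TYPE_DCT));	/* -1 */
	return 0;
}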
@@ -2124,6 +2243,26 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 		dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
 	}

+	if (init_attr->qp_type == IB_QPT_DRIVER) {
+		struct mlx5_ib_create_qp ucmd;
+
+		init_attr = &mlx_init_attr;
+		memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr));
+		err = set_mlx_qp_type(dev, init_attr, &ucmd, udata);
+		if (err)
+			return ERR_PTR(err);
+
+		if (init_attr->qp_type == MLX5_IB_QPT_DCI) {
+			if (init_attr->cap.max_recv_wr ||
+			    init_attr->cap.max_recv_sge) {
+				mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n");
+				return ERR_PTR(-EINVAL);
+			}
+		} else {
+			return mlx5_ib_create_dct(pd, init_attr, &ucmd);
+		}
+	}
+
 	switch (init_attr->qp_type) {
 	case IB_QPT_XRC_TGT:
 	case IB_QPT_XRC_INI:
@@ -2145,6 +2284,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 	case IB_QPT_SMI:
 	case MLX5_IB_QPT_HW_GSI:
 	case MLX5_IB_QPT_REG_UMR:
+	case MLX5_IB_QPT_DCI:
 		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
@@ -2185,9 +2325,31 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 		return ERR_PTR(-EINVAL);
 	}

+	if (verbs_init_attr->qp_type == IB_QPT_DRIVER)
+		qp->qp_sub_type = init_attr->qp_type;
+
 	return &qp->ibqp;
 }

+static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device);
+
+	if (mqp->state == IB_QPS_RTR) {
+		int err;
+
+		err = mlx5_core_destroy_dct(dev->mdev, &mqp->dct.mdct);
+		if (err) {
+			mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err);
+			return err;
+		}
+	}
+
+	kfree(mqp->dct.in);
+	kfree(mqp);
+	return 0;
+}
+
 int mlx5_ib_destroy_qp(struct ib_qp *qp)
 {
 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
@@ -2196,6 +2358,9 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp)
 	if (unlikely(qp->qp_type == IB_QPT_GSI))
 		return mlx5_ib_gsi_destroy_qp(qp);

+	if (mqp->qp_sub_type == MLX5_IB_QPT_DCT)
+		return mlx5_ib_destroy_dct(mqp);
+
 	destroy_qp_common(dev, mqp);

 	kfree(mqp);
@@ -2763,7 +2928,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (!context)
 		return -ENOMEM;

-	err = to_mlx5_st(ibqp->qp_type);
+	err = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ?
+			 qp->qp_sub_type : ibqp->qp_type);
 	if (err < 0) {
 		mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type);
 		goto out;
@@ -2796,8 +2962,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 		    (ibqp->qp_type == IB_QPT_XRC_INI) ||
 		    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
 			if (mlx5_lag_is_active(dev->mdev)) {
+				u8 p = mlx5_core_native_port_num(dev->mdev);
 				tx_affinity = (unsigned int)atomic_add_return(1,
-						&dev->roce.next_port) %
+						&dev->roce[p].next_port) %
 						MLX5_MAX_PORTS + 1;
 				context->flags |= cpu_to_be32(tx_affinity << 24);
 			}
@@ -2922,7 +3089,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,

 	mlx5_cur = to_mlx5_state(cur_state);
 	mlx5_new = to_mlx5_state(new_state);
-	mlx5_st = to_mlx5_st(ibqp->qp_type);
+	mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ?
+			     qp->qp_sub_type : ibqp->qp_type);
 	if (mlx5_st < 0)
 		goto out;

@@ -2994,6 +3162,139 @@ out:
 	return err;
 }

+static inline bool is_valid_mask(int mask, int req, int opt)
+{
+	if ((mask & req) != req)
+		return false;
+
+	if (mask & ~(req | opt))
+		return false;
+
+	return true;
+}
+
+/* check valid transition for driver QP types
+ * for now the only QP type that this function supports is DCI
+ */
+static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new_state,
+				enum ib_qp_attr_mask attr_mask)
+{
+	int req = IB_QP_STATE;
+	int opt = 0;
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		req |= IB_QP_PKEY_INDEX | IB_QP_PORT;
+		return is_valid_mask(attr_mask, req, opt);
+	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) {
+		opt = IB_QP_PKEY_INDEX | IB_QP_PORT;
+		return is_valid_mask(attr_mask, req, opt);
+	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+		req |= IB_QP_PATH_MTU;
+		opt = IB_QP_PKEY_INDEX;
+		return is_valid_mask(attr_mask, req, opt);
+	} else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) {
+		req |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
+		       IB_QP_MAX_QP_RD_ATOMIC | IB_QP_SQ_PSN;
+		opt = IB_QP_MIN_RNR_TIMER;
+		return is_valid_mask(attr_mask, req, opt);
+	} else if (cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) {
+		opt = IB_QP_MIN_RNR_TIMER;
+		return is_valid_mask(attr_mask, req, opt);
+	} else if (cur_state != IB_QPS_RESET && new_state == IB_QPS_ERR) {
+		return is_valid_mask(attr_mask, req, opt);
+	}
+	return false;
+}
+
+/* mlx5_ib_modify_dct: modify a DCT QP
+ * valid transitions are:
+ * RESET to INIT: must set access_flags, pkey_index and port
+ * INIT  to RTR : must set min_rnr_timer, tclass, flow_label,
+ *			   mtu, gid_index and hop_limit
+ * Other transitions and attributes are illegal
+ */
+static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+			      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	enum ib_qp_state cur_state, new_state;
+	int err = 0;
+	int required = IB_QP_STATE;
+	void *dctc;
+
+	if (!(attr_mask & IB_QP_STATE))
+		return -EINVAL;
+
+	cur_state = qp->state;
+	new_state = attr->qp_state;
+
+	dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry);
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		required |= IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
+		if (!is_valid_mask(attr_mask, required, 0))
+			return -EINVAL;
+
+		if (attr->port_num == 0 ||
+		    attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports)) {
+			mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n",
+				    attr->port_num, dev->num_ports);
+			return -EINVAL;
+		}
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
+			MLX5_SET(dctc, dctc, rre, 1);
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
+			MLX5_SET(dctc, dctc, rwe, 1);
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) {
+			if (!mlx5_ib_dc_atomic_is_supported(dev))
+				return -EOPNOTSUPP;
+			MLX5_SET(dctc, dctc, rae, 1);
+			MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX);
+		}
+		MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index);
+		MLX5_SET(dctc, dctc, port, attr->port_num);
+		MLX5_SET(dctc, dctc, counter_set_id, dev->port[attr->port_num - 1].cnts.set_id);
+
+	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+		struct mlx5_ib_modify_qp_resp resp = {};
+		u32 min_resp_len = offsetof(typeof(resp), dctn) +
+				   sizeof(resp.dctn);
+
+		if (udata->outlen < min_resp_len)
+			return -EINVAL;
+		resp.response_length = min_resp_len;
+
+		required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU;
+		if (!is_valid_mask(attr_mask, required, 0))
+			return -EINVAL;
+		MLX5_SET(dctc, dctc, min_rnr_nak, attr->min_rnr_timer);
+		MLX5_SET(dctc, dctc, tclass, attr->ah_attr.grh.traffic_class);
+		MLX5_SET(dctc, dctc, flow_label, attr->ah_attr.grh.flow_label);
+		MLX5_SET(dctc, dctc, mtu, attr->path_mtu);
+		MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index);
+		MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
+
+		err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in,
+					   MLX5_ST_SZ_BYTES(create_dct_in));
+		if (err)
+			return err;
+		resp.dctn = qp->dct.mdct.mqp.qpn;
+		err = ib_copy_to_udata(udata, &resp, resp.response_length);
+		if (err) {
+			mlx5_core_destroy_dct(dev->mdev, &qp->dct.mdct);
+			return err;
+		}
+	} else {
+		mlx5_ib_warn(dev, "Modify DCT: Invalid transition from %d to %d\n", cur_state, new_state);
+		return -EINVAL;
+	}
+	if (err)
+		qp->state = IB_QPS_ERR;
+	else
+		qp->state = new_state;
+	return err;
+}
+
 int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		      int attr_mask, struct ib_udata *udata)
 {
@@ -3011,8 +3312,14 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	if (unlikely(ibqp->qp_type == IB_QPT_GSI))
 		return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask);

-	qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ?
-		IB_QPT_GSI : ibqp->qp_type;
+	if (ibqp->qp_type == IB_QPT_DRIVER)
+		qp_type = qp->qp_sub_type;
+	else
+		qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ?
+			IB_QPT_GSI : ibqp->qp_type;
+
+	if (qp_type == MLX5_IB_QPT_DCT)
+		return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata);

 	mutex_lock(&qp->mutex);

@@ -3031,15 +3338,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 			goto out;
 		}
 	} else if (qp_type != MLX5_IB_QPT_REG_UMR &&
-	    !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) {
+		   qp_type != MLX5_IB_QPT_DCI &&
+		   !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) {
 		mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n",
 			    cur_state, new_state, ibqp->qp_type, attr_mask);
 		goto out;
+	} else if (qp_type == MLX5_IB_QPT_DCI &&
+		   !modify_dci_qp_is_ok(cur_state, new_state, attr_mask)) {
+		mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n",
+			    cur_state, new_state, qp_type, attr_mask);
+		goto out;
 	}

 	if ((attr_mask & IB_QP_PORT) &&
 	    (attr->port_num == 0 ||
-	     attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) {
+	     attr->port_num > dev->num_ports)) {
 		mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n",
 			    attr->port_num, dev->num_ports);
 		goto out;
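is_valid_mask() and modify_dci_qp_is_ok() in the hunks above encode each legal DCI transition as a required mask plus an optional mask: every required bit must be present, and no bit outside required|optional may appear. A standalone sketch of that check, with illustrative attribute bits in place of the IB_QP_* flags:

/* Sketch of the required/optional mask check; bit names illustrative. */
#include <stdbool.h>
#include <stdio.h>

#define ATTR_STATE	0x1
#define ATTR_PKEY	0x2
#define ATTR_PORT	0x4
#define ATTR_TIMEOUT	0x8

static bool is_valid_mask(int mask, int req, int opt)
{
	if ((mask & req) != req)	/* a required attribute is missing */
		return false;
	if (mask & ~(req | opt))	/* an attribute outside the whitelist */
		return false;
	return true;
}

int main(void)
{
	/* RESET->INIT style rule: STATE|PKEY|PORT required, nothing optional */
	int req = ATTR_STATE | ATTR_PKEY | ATTR_PORT;

	printf("%d\n", is_valid_mask(req, req, 0));			/* 1 */
	printf("%d\n", is_valid_mask(ATTR_STATE, req, 0));		/* 0: missing */
	printf("%d\n", is_valid_mask(req | ATTR_TIMEOUT, req, 0));	/* 0: extra */
	return 0;
}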
@@ -4358,11 +4671,10 @@ static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev,
 			    struct rdma_ah_attr *ah_attr,
 			    struct mlx5_qp_path *path)
 {
-	struct mlx5_core_dev *dev = ibdev->mdev;

 	memset(ah_attr, 0, sizeof(*ah_attr));

-	if (!path->port || path->port > MLX5_CAP_GEN(dev, num_ports))
+	if (!path->port || path->port > ibdev->num_ports)
 		return;

 	ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port);
@@ -4577,6 +4889,71 @@ out:
 	return err;
 }

+static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp,
+				struct ib_qp_attr *qp_attr, int qp_attr_mask,
+				struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mlx5_core_dct	*dct = &mqp->dct.mdct;
+	u32 *out;
+	u32 access_flags = 0;
+	int outlen = MLX5_ST_SZ_BYTES(query_dct_out);
+	void *dctc;
+	int err;
+	int supported_mask = IB_QP_STATE |
+			     IB_QP_ACCESS_FLAGS |
+			     IB_QP_PORT |
+			     IB_QP_MIN_RNR_TIMER |
+			     IB_QP_AV |
+			     IB_QP_PATH_MTU |
+			     IB_QP_PKEY_INDEX;
+
+	if (qp_attr_mask & ~supported_mask)
+		return -EINVAL;
+	if (mqp->state != IB_QPS_RTR)
+		return -EINVAL;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_core_dct_query(dev->mdev, dct, out, outlen);
+	if (err)
+		goto out;
+
+	dctc = MLX5_ADDR_OF(query_dct_out, out, dct_context_entry);
+
+	if (qp_attr_mask & IB_QP_STATE)
+		qp_attr->qp_state = IB_QPS_RTR;
+
+	if (qp_attr_mask & IB_QP_ACCESS_FLAGS) {
+		if (MLX5_GET(dctc, dctc, rre))
+			access_flags |= IB_ACCESS_REMOTE_READ;
+		if (MLX5_GET(dctc, dctc, rwe))
+			access_flags |= IB_ACCESS_REMOTE_WRITE;
+		if (MLX5_GET(dctc, dctc, rae))
+			access_flags |= IB_ACCESS_REMOTE_ATOMIC;
+		qp_attr->qp_access_flags = access_flags;
+	}
+
+	if (qp_attr_mask & IB_QP_PORT)
+		qp_attr->port_num = MLX5_GET(dctc, dctc, port);
+	if (qp_attr_mask & IB_QP_MIN_RNR_TIMER)
+		qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak);
+	if (qp_attr_mask & IB_QP_AV) {
+		qp_attr->ah_attr.grh.traffic_class = MLX5_GET(dctc, dctc, tclass);
+		qp_attr->ah_attr.grh.flow_label = MLX5_GET(dctc, dctc, flow_label);
+		qp_attr->ah_attr.grh.sgid_index = MLX5_GET(dctc, dctc, my_addr_index);
+		qp_attr->ah_attr.grh.hop_limit = MLX5_GET(dctc, dctc, hop_limit);
+	}
+	if (qp_attr_mask & IB_QP_PATH_MTU)
+		qp_attr->path_mtu = MLX5_GET(dctc, dctc, mtu);
+	if (qp_attr_mask & IB_QP_PKEY_INDEX)
+		qp_attr->pkey_index = MLX5_GET(dctc, dctc, pkey_index);
+out:
+	kfree(out);
+	return err;
+}
+
 int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 		     int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
 {
@@ -4596,6 +4973,10 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
 	memset(qp_attr, 0, sizeof(*qp_attr));

+	if (unlikely(qp->qp_sub_type == MLX5_IB_QPT_DCT))
+		return mlx5_ib_dct_query_qp(dev, qp, qp_attr,
+					    qp_attr_mask, qp_init_attr);
+
 	mutex_lock(&qp->mutex);

 	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
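For DCT QPs the hardware object is created lazily: mlx5_ib_create_dct() only fills in qp->dct.in and leaves the QP in RESET, the RESET-to-INIT transition merely records access flags and port into the mailbox, and only INIT-to-RTR issues mlx5_core_create_dct(). That is why mlx5_ib_dct_query_qp() above refuses any state but RTR, and why destroy only calls mlx5_core_destroy_dct() when the state reached RTR. A toy sketch of that deferred-create lifecycle (names illustrative, not the driver API):

/* Toy model of the DCT lifecycle; illustrative only. */
#include <stdio.h>

enum state { RESET, INIT, RTR };

struct dct { enum state st; int hw_created; };

static int modify(struct dct *d, enum state next)
{
	if (d->st == RESET && next == INIT) {
		/* parameters are only recorded here, no hardware call */
	} else if (d->st == INIT && next == RTR) {
		d->hw_created = 1;	/* firmware object created only now */
	} else {
		return -1;		/* any other transition is rejected */
	}
	d->st = next;
	return 0;
}

static void destroy(struct dct *d)
{
	if (d->st == RTR)
		d->hw_created = 0;	/* only then is there anything to tear down */
}

int main(void)
{
	struct dct d = { RESET, 0 };

	modify(&d, INIT);
	modify(&d, RTR);
	printf("hw_created=%d\n", d.hw_created);	/* 1 */
	destroy(&d);
	return 0;
}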
@@ -4685,13 +5066,10 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
 	int err;

 	err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn);
-	if (err) {
+	if (err)
 		mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
-		return err;
-	}

 	kfree(xrcd);
-
 	return 0;
 }


+ 3 - 4
drivers/infiniband/hw/mthca/mthca_memfree.c

@@ -472,7 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
 		goto out;
 	}

-	ret = get_user_pages(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages, NULL);
+	ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages);
 	if (ret < 0)
 		goto out;

@@ -623,13 +623,12 @@ int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
 	page = dev->db_tab->page + end;

 alloc:
-	page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
-					  &page->mapping, GFP_KERNEL);
+	page->db_rec = dma_zalloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+					   &page->mapping, GFP_KERNEL);
 	if (!page->db_rec) {
 		ret = -ENOMEM;
 		goto out;
 	}
-	memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE);

 	ret = mthca_MAP_ICM_page(dev, page->mapping,
 				 mthca_uarc_virt(dev, &dev->driver_uar, i));

+ 0 - 112
drivers/infiniband/hw/mthca/mthca_user.h

@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef MTHCA_USER_H
-#define MTHCA_USER_H
-
-#include <linux/types.h>
-
-/*
- * Increment this value if any changes that break userspace ABI
- * compatibility are made.
- */
-#define MTHCA_UVERBS_ABI_VERSION	1
-
-/*
- * Make sure that all structs defined in this file remain laid out so
- * that they pack the same way on 32-bit and 64-bit architectures (to
- * avoid incompatibility between 32-bit userspace and 64-bit kernels).
- * In particular do not use pointer types -- pass pointers in __u64
- * instead.
- */
-
-struct mthca_alloc_ucontext_resp {
-	__u32 qp_tab_size;
-	__u32 uarc_size;
-};
-
-struct mthca_alloc_pd_resp {
-	__u32 pdn;
-	__u32 reserved;
-};
-
-struct mthca_reg_mr {
-/*
- * Mark the memory region with a DMA attribute that causes
- * in-flight DMA to be flushed when the region is written to:
- */
-#define MTHCA_MR_DMASYNC	0x1
-	__u32 mr_attrs;
-	__u32 reserved;
-};
-
-struct mthca_create_cq {
-	__u32 lkey;
-	__u32 pdn;
-	__u64 arm_db_page;
-	__u64 set_db_page;
-	__u32 arm_db_index;
-	__u32 set_db_index;
-};
-
-struct mthca_create_cq_resp {
-	__u32 cqn;
-	__u32 reserved;
-};
-
-struct mthca_resize_cq {
-	__u32 lkey;
-	__u32 reserved;
-};
-
-struct mthca_create_srq {
-	__u32 lkey;
-	__u32 db_index;
-	__u64 db_page;
-};
-
-struct mthca_create_srq_resp {
-	__u32 srqn;
-	__u32 reserved;
-};
-
-struct mthca_create_qp {
-	__u32 lkey;
-	__u32 reserved;
-	__u64 sq_db_page;
-	__u64 rq_db_page;
-	__u32 sq_db_index;
-	__u32 rq_db_index;
-};
-
-#endif /* MTHCA_USER_H */

Some files were not shown because too many files changed in this diff