Prechádzať zdrojové kódy

Merge tag 'rdma-next-2017-08-10' of git://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma into rdma-netlink

RDMA netlink infrastructure v2

Signed-off-by: Doug Ledford <dledford@redhat.com>
Doug Ledford pred 8 rokmi
rodič
commit
db14dff174
100 zmenených súborov: 6005 pridaní a 2208 odobraní
  1. 2 1
      drivers/infiniband/core/Makefile
  2. 5 7
      drivers/infiniband/core/addr.c
  3. 18 15
      drivers/infiniband/core/cma.c
  4. 21 5
      drivers/infiniband/core/core_priv.h
  5. 83 36
      drivers/infiniband/core/device.c
  6. 4 8
      drivers/infiniband/core/iwcm.c
  7. 4 16
      drivers/infiniband/core/iwpm_msg.c
  8. 2 13
      drivers/infiniband/core/iwpm_util.c
  9. 163 146
      drivers/infiniband/core/netlink.c
  10. 322 0
      drivers/infiniband/core/nldev.c
  11. 2 0
      drivers/infiniband/core/roce_gid_mgmt.c
  12. 8 10
      drivers/infiniband/core/sa_query.c
  13. 2 2
      drivers/infiniband/core/sysfs.c
  14. 14 3
      drivers/infiniband/core/uverbs_cmd.c
  15. 111 6
      drivers/infiniband/core/verbs.c
  16. 25 63
      drivers/infiniband/hw/bnxt_re/ib_verbs.c
  17. 0 3
      drivers/infiniband/hw/bnxt_re/ib_verbs.h
  18. 49 9
      drivers/infiniband/hw/bnxt_re/main.c
  19. 380 85
      drivers/infiniband/hw/bnxt_re/qplib_fp.c
  20. 24 1
      drivers/infiniband/hw/bnxt_re/qplib_fp.h
  21. 25 1
      drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
  22. 9 1
      drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
  23. 10 0
      drivers/infiniband/hw/bnxt_re/qplib_res.c
  24. 2 0
      drivers/infiniband/hw/bnxt_re/qplib_res.h
  25. 61 16
      drivers/infiniband/hw/bnxt_re/qplib_sp.c
  26. 2 0
      drivers/infiniband/hw/bnxt_re/qplib_sp.h
  27. 2 2
      drivers/infiniband/hw/bnxt_re/roce_hsi.h
  28. 0 1
      drivers/infiniband/hw/cxgb3/iwch.c
  29. 2 3
      drivers/infiniband/hw/cxgb3/iwch_provider.c
  30. 0 1
      drivers/infiniband/hw/cxgb4/device.c
  31. 2 3
      drivers/infiniband/hw/cxgb4/provider.c
  32. 1 1
      drivers/infiniband/hw/hfi1/Makefile
  33. 9 9
      drivers/infiniband/hw/hfi1/affinity.c
  34. 8 6
      drivers/infiniband/hw/hfi1/affinity.h
  35. 3 3
      drivers/infiniband/hw/hfi1/aspm.h
  36. 370 290
      drivers/infiniband/hw/hfi1/chip.c
  37. 15 9
      drivers/infiniband/hw/hfi1/chip.h
  38. 1 0
      drivers/infiniband/hw/hfi1/common.h
  39. 128 63
      drivers/infiniband/hw/hfi1/driver.c
  40. 5 6
      drivers/infiniband/hw/hfi1/eprom.c
  41. 114 0
      drivers/infiniband/hw/hfi1/exp_rcv.c
  42. 190 0
      drivers/infiniband/hw/hfi1/exp_rcv.h
  43. 91 61
      drivers/infiniband/hw/hfi1/file_ops.c
  44. 16 0
      drivers/infiniband/hw/hfi1/firmware.c
  45. 61 95
      drivers/infiniband/hw/hfi1/hfi.h
  46. 90 21
      drivers/infiniband/hw/hfi1/init.c
  47. 2 1
      drivers/infiniband/hw/hfi1/intr.c
  48. 68 2
      drivers/infiniband/hw/hfi1/iowait.h
  49. 478 212
      drivers/infiniband/hw/hfi1/mad.c
  50. 3 2
      drivers/infiniband/hw/hfi1/mad.h
  51. 10 4
      drivers/infiniband/hw/hfi1/mmu_rb.c
  52. 3 2
      drivers/infiniband/hw/hfi1/mmu_rb.h
  53. 261 123
      drivers/infiniband/hw/hfi1/pcie.c
  54. 11 4
      drivers/infiniband/hw/hfi1/pio.c
  55. 18 26
      drivers/infiniband/hw/hfi1/platform.c
  56. 11 10
      drivers/infiniband/hw/hfi1/qp.c
  57. 15 27
      drivers/infiniband/hw/hfi1/rc.c
  58. 50 50
      drivers/infiniband/hw/hfi1/ruc.c
  59. 27 15
      drivers/infiniband/hw/hfi1/sdma.c
  60. 2 1
      drivers/infiniband/hw/hfi1/sdma.h
  61. 50 8
      drivers/infiniband/hw/hfi1/trace.c
  62. 215 107
      drivers/infiniband/hw/hfi1/trace_ibhdrs.h
  63. 20 0
      drivers/infiniband/hw/hfi1/trace_misc.h
  64. 58 34
      drivers/infiniband/hw/hfi1/trace_rx.h
  65. 4 14
      drivers/infiniband/hw/hfi1/uc.c
  66. 23 41
      drivers/infiniband/hw/hfi1/ud.c
  67. 3 125
      drivers/infiniband/hw/hfi1/user_exp_rcv.c
  68. 3 23
      drivers/infiniband/hw/hfi1/user_exp_rcv.h
  69. 102 128
      drivers/infiniband/hw/hfi1/user_sdma.c
  70. 3 3
      drivers/infiniband/hw/hfi1/user_sdma.h
  71. 59 63
      drivers/infiniband/hw/hfi1/verbs.c
  72. 5 6
      drivers/infiniband/hw/hfi1/verbs.h
  73. 1 0
      drivers/infiniband/hw/hfi1/vnic.h
  74. 10 7
      drivers/infiniband/hw/hfi1/vnic_main.c
  75. 11 3
      drivers/infiniband/hw/hfi1/vnic_sdma.c
  76. 1 1
      drivers/infiniband/hw/hns/Kconfig
  77. 1 0
      drivers/infiniband/hw/hns/hns_roce_alloc.c
  78. 1 0
      drivers/infiniband/hw/hns/hns_roce_device.h
  79. 2 1
      drivers/infiniband/hw/hns/hns_roce_eq.c
  80. 1 2
      drivers/infiniband/hw/hns/hns_roce_hw_v1.c
  81. 1 0
      drivers/infiniband/hw/hns/hns_roce_mr.c
  82. 1 1
      drivers/infiniband/hw/hns/hns_roce_qp.c
  83. 0 1
      drivers/infiniband/hw/i40iw/i40iw_main.c
  84. 3 4
      drivers/infiniband/hw/i40iw/i40iw_verbs.c
  85. 2 0
      drivers/infiniband/hw/mlx4/cq.c
  86. 48 6
      drivers/infiniband/hw/mlx4/main.c
  87. 41 1
      drivers/infiniband/hw/mlx4/mlx4_ib.h
  88. 963 68
      drivers/infiniband/hw/mlx4/qp.c
  89. 1 1
      drivers/infiniband/hw/mlx5/Makefile
  90. 20 0
      drivers/infiniband/hw/mlx5/cmd.c
  91. 4 0
      drivers/infiniband/hw/mlx5/cmd.h
  92. 421 0
      drivers/infiniband/hw/mlx5/cong.c
  93. 9 0
      drivers/infiniband/hw/mlx5/ib_virt.c
  94. 307 29
      drivers/infiniband/hw/mlx5/main.c
  95. 76 3
      drivers/infiniband/hw/mlx5/mlx5_ib.h
  96. 13 5
      drivers/infiniband/hw/mlx5/mr.c
  97. 1 1
      drivers/infiniband/hw/mlx5/odp.c
  98. 104 18
      drivers/infiniband/hw/mlx5/qp.c
  99. 0 1
      drivers/infiniband/hw/mthca/mthca_main.c
  100. 2 3
      drivers/infiniband/hw/mthca/mthca_provider.c

+ 2 - 1
drivers/infiniband/core/Makefile

@@ -11,7 +11,8 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
-				security.o
+				security.o nldev.o
+
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o

+ 5 - 7
drivers/infiniband/core/addr.c

@@ -129,13 +129,11 @@ static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
 }
 }
 
 
 int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
 int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
-			     struct netlink_callback *cb)
+			     struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack)
 {
 {
-	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
-
 	if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
 	if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
-	    !(NETLINK_CB(skb).sk) ||
-	    !netlink_capable(skb, CAP_NET_ADMIN))
+	    !(NETLINK_CB(skb).sk))
 		return -EPERM;
 		return -EPERM;
 
 
 	if (ib_nl_is_good_ip_resp(nlh))
 	if (ib_nl_is_good_ip_resp(nlh))
@@ -185,7 +183,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
 
 
 	/* Repair the nlmsg header length */
 	/* Repair the nlmsg header length */
 	nlmsg_end(skb, nlh);
 	nlmsg_end(skb, nlh);
-	ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+	rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL);
 
 
 	/* Make the request retry, so when we get the response from userspace
 	/* Make the request retry, so when we get the response from userspace
 	 * we will have something.
 	 * we will have something.
@@ -326,7 +324,7 @@ static void queue_req(struct addr_req *req)
 static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
 static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
 			  const void *daddr, u32 seq, u16 family)
 			  const void *daddr, u32 seq, u16 family)
 {
 {
-	if (ibnl_chk_listeners(RDMA_NL_GROUP_LS))
+	if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
 		return -EADDRNOTAVAIL;
 		return -EADDRNOTAVAIL;
 
 
 	/* We fill in what we can, the response will fill the rest */
 	/* We fill in what we can, the response will fill the rest */

+ 18 - 15
drivers/infiniband/core/cma.c

@@ -72,6 +72,7 @@ MODULE_LICENSE("Dual BSD/GPL");
 #define CMA_MAX_CM_RETRIES 15
 #define CMA_MAX_CM_RETRIES 15
 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
 #define CMA_IBOE_PACKET_LIFETIME 18
 #define CMA_IBOE_PACKET_LIFETIME 18
+#define CMA_PREFERRED_ROCE_GID_TYPE (1 << IB_GID_TYPE_ROCE_UDP_ENCAP)
 
 
 static const char * const cma_events[] = {
 static const char * const cma_events[] = {
 	[RDMA_CM_EVENT_ADDR_RESOLVED]	 = "address resolved",
 	[RDMA_CM_EVENT_ADDR_RESOLVED]	 = "address resolved",
@@ -3998,7 +3999,8 @@ static void iboe_mcast_work_handler(struct work_struct *work)
 	kfree(mw);
 	kfree(mw);
 }
 }
 
 
-static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
+			      enum ib_gid_type gid_type)
 {
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
@@ -4008,8 +4010,8 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
 	} else if (addr->sa_family == AF_INET6) {
 	} else if (addr->sa_family == AF_INET6) {
 		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
 		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
 	} else {
 	} else {
-		mgid->raw[0] = 0xff;
-		mgid->raw[1] = 0x0e;
+		mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0;
+		mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0;
 		mgid->raw[2] = 0;
 		mgid->raw[2] = 0;
 		mgid->raw[3] = 0;
 		mgid->raw[3] = 0;
 		mgid->raw[4] = 0;
 		mgid->raw[4] = 0;
@@ -4050,7 +4052,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
 		goto out1;
 		goto out1;
 	}
 	}
 
 
-	cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid);
+	gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+		   rdma_start_port(id_priv->cma_dev->device)];
+	cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type);
 
 
 	mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
 	mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
 	if (id_priv->id.ps == RDMA_PS_UDP)
 	if (id_priv->id.ps == RDMA_PS_UDP)
@@ -4066,8 +4070,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
 	mc->multicast.ib->rec.hop_limit = 1;
 	mc->multicast.ib->rec.hop_limit = 1;
 	mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
 	mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
 
 
-	gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
-		   rdma_start_port(id_priv->cma_dev->device)];
 	if (addr->sa_family == AF_INET) {
 	if (addr->sa_family == AF_INET) {
 		if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
 		if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
 			mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
 			mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
@@ -4280,8 +4282,12 @@ static void cma_add_one(struct ib_device *device)
 	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
 	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
 		supported_gids = roce_gid_type_mask_support(device, i);
 		supported_gids = roce_gid_type_mask_support(device, i);
 		WARN_ON(!supported_gids);
 		WARN_ON(!supported_gids);
-		cma_dev->default_gid_type[i - rdma_start_port(device)] =
-			find_first_bit(&supported_gids, BITS_PER_LONG);
+		if (supported_gids & CMA_PREFERRED_ROCE_GID_TYPE)
+			cma_dev->default_gid_type[i - rdma_start_port(device)] =
+				CMA_PREFERRED_ROCE_GID_TYPE;
+		else
+			cma_dev->default_gid_type[i - rdma_start_port(device)] =
+				find_first_bit(&supported_gids, BITS_PER_LONG);
 		cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0;
 		cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0;
 	}
 	}
 
 
@@ -4452,9 +4458,8 @@ out:
 	return skb->len;
 	return skb->len;
 }
 }
 
 
-static const struct ibnl_client_cbs cma_cb_table[] = {
-	[RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats,
-				       .module = THIS_MODULE },
+static const struct rdma_nl_cbs cma_cb_table[] = {
+	[RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats},
 };
 };
 
 
 static int cma_init_net(struct net *net)
 static int cma_init_net(struct net *net)
@@ -4506,9 +4511,7 @@ static int __init cma_init(void)
 	if (ret)
 	if (ret)
 		goto err;
 		goto err;
 
 
-	if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table),
-			    cma_cb_table))
-		pr_warn("RDMA CMA: failed to add netlink callback\n");
+	rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table);
 	cma_configfs_init();
 	cma_configfs_init();
 
 
 	return 0;
 	return 0;
@@ -4525,7 +4528,7 @@ err_wq:
 static void __exit cma_cleanup(void)
 static void __exit cma_cleanup(void)
 {
 {
 	cma_configfs_exit();
 	cma_configfs_exit();
-	ibnl_remove_client(RDMA_NL_RDMA_CM);
+	rdma_nl_unregister(RDMA_NL_RDMA_CM);
 	ib_unregister_client(&cma_client);
 	ib_unregister_client(&cma_client);
 	unregister_netdevice_notifier(&cma_nb);
 	unregister_netdevice_notifier(&cma_nb);
 	rdma_addr_unregister_client(&addr_client);
 	rdma_addr_unregister_client(&addr_client);

+ 21 - 5
drivers/infiniband/core/core_priv.h

@@ -102,6 +102,14 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
 			      roce_netdev_callback cb,
 			      roce_netdev_callback cb,
 			      void *cookie);
 			      void *cookie);
 
 
+typedef int (*nldev_callback)(struct ib_device *device,
+			      struct sk_buff *skb,
+			      struct netlink_callback *cb,
+			      unsigned int idx);
+
+int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
+		     struct netlink_callback *cb);
+
 enum ib_cache_gid_default_mode {
 enum ib_cache_gid_default_mode {
 	IB_CACHE_GID_DEFAULT_MODE_SET,
 	IB_CACHE_GID_DEFAULT_MODE_SET,
 	IB_CACHE_GID_DEFAULT_MODE_DELETE
 	IB_CACHE_GID_DEFAULT_MODE_DELETE
@@ -179,8 +187,8 @@ void ib_mad_cleanup(void);
 int ib_sa_init(void);
 int ib_sa_init(void);
 void ib_sa_cleanup(void);
 void ib_sa_cleanup(void);
 
 
-int ibnl_init(void);
-void ibnl_cleanup(void);
+int rdma_nl_init(void);
+void rdma_nl_exit(void);
 
 
 /**
 /**
  * Check if there are any listeners to the netlink group
  * Check if there are any listeners to the netlink group
@@ -190,11 +198,14 @@ void ibnl_cleanup(void);
 int ibnl_chk_listeners(unsigned int group);
 int ibnl_chk_listeners(unsigned int group);
 
 
 int ib_nl_handle_resolve_resp(struct sk_buff *skb,
 int ib_nl_handle_resolve_resp(struct sk_buff *skb,
-			      struct netlink_callback *cb);
+			      struct nlmsghdr *nlh,
+			      struct netlink_ext_ack *extack);
 int ib_nl_handle_set_timeout(struct sk_buff *skb,
 int ib_nl_handle_set_timeout(struct sk_buff *skb,
-			     struct netlink_callback *cb);
+			     struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack);
 int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
 int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
-			     struct netlink_callback *cb);
+			     struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack);
 
 
 int ib_get_cached_subnet_prefix(struct ib_device *device,
 int ib_get_cached_subnet_prefix(struct ib_device *device,
 				u8                port_num,
 				u8                port_num,
@@ -301,4 +312,9 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
 	return 0;
 	return 0;
 }
 }
 #endif
 #endif
+
+struct ib_device *__ib_device_get_by_index(u32 ifindex);
+/* RDMA device netlink */
+void nldev_init(void);
+void nldev_exit(void);
 #endif /* _CORE_PRIV_H */
 #endif /* _CORE_PRIV_H */

+ 83 - 36
drivers/infiniband/core/device.c

@@ -134,6 +134,17 @@ static int ib_device_check_mandatory(struct ib_device *device)
 	return 0;
 	return 0;
 }
 }
 
 
+struct ib_device *__ib_device_get_by_index(u32 index)
+{
+	struct ib_device *device;
+
+	list_for_each_entry(device, &device_list, core_list)
+		if (device->index == index)
+			return device;
+
+	return NULL;
+}
+
 static struct ib_device *__ib_device_get_by_name(const char *name)
 static struct ib_device *__ib_device_get_by_name(const char *name)
 {
 {
 	struct ib_device *device;
 	struct ib_device *device;
@@ -145,7 +156,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name)
 	return NULL;
 	return NULL;
 }
 }
 
 
-
 static int alloc_name(char *name)
 static int alloc_name(char *name)
 {
 {
 	unsigned long *inuse;
 	unsigned long *inuse;
@@ -326,10 +336,10 @@ static int read_port_immutable(struct ib_device *device)
 	return 0;
 	return 0;
 }
 }
 
 
-void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len)
+void ib_get_device_fw_str(struct ib_device *dev, char *str)
 {
 {
 	if (dev->get_dev_fw_str)
 	if (dev->get_dev_fw_str)
-		dev->get_dev_fw_str(dev, str, str_len);
+		dev->get_dev_fw_str(dev, str);
 	else
 	else
 		str[0] = '\0';
 		str[0] = '\0';
 }
 }
@@ -394,6 +404,30 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
 	return NOTIFY_OK;
 	return NOTIFY_OK;
 }
 }
 
 
+/**
+ *	__dev_new_index	-	allocate a device index
+ *
+ *	Returns a suitable unique value for a new device interface
+ *	number.  It assumes that fewer than 2^32-1 ib devices
+ *	will be present in the system.
+ */
+static u32 __dev_new_index(void)
+{
+	/*
+	 * The device index to allow stable naming.
+	 * Similar to struct net -> ifindex.
+	 */
+	static u32 index;
+
+	for (;;) {
+		if (!(++index))
+			index = 1;
+
+		if (!__ib_device_get_by_index(index))
+			return index;
+	}
+}
+
 /**
 /**
  * ib_register_device - Register an IB device with IB core
  * ib_register_device - Register an IB device with IB core
  * @device:Device to register
  * @device:Device to register
@@ -492,6 +526,7 @@ int ib_register_device(struct ib_device *device,
 		if (client->add && !add_client_context(device, client))
 		if (client->add && !add_client_context(device, client))
 			client->add(device);
 			client->add(device);
 
 
+	device->index = __dev_new_index();
 	down_write(&lists_rwsem);
 	down_write(&lists_rwsem);
 	list_add_tail(&device->core_list, &device_list);
 	list_add_tail(&device->core_list, &device_list);
 	up_write(&lists_rwsem);
 	up_write(&lists_rwsem);
@@ -892,6 +927,31 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
 	up_read(&lists_rwsem);
 	up_read(&lists_rwsem);
 }
 }
 
 
+/**
+ * ib_enum_all_devs - enumerate all ib_devices
+ * @nldev_cb: Callback to call for each found ib_device
+ *
+ * Enumerates all ib_devices and calls callback() on each device.
+ */
+int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
+		     struct netlink_callback *cb)
+{
+	struct ib_device *dev;
+	unsigned int idx = 0;
+	int ret = 0;
+
+	down_read(&lists_rwsem);
+	list_for_each_entry(dev, &device_list, core_list) {
+		ret = nldev_cb(dev, skb, cb, idx);
+		if (ret)
+			break;
+		idx++;
+	}
+
+	up_read(&lists_rwsem);
+	return ret;
+}
+
 /**
 /**
  * ib_query_pkey - Get P_Key table entry
  * ib_query_pkey - Get P_Key table entry
  * @device:Device to query
  * @device:Device to query
@@ -1086,29 +1146,21 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
 }
 }
 EXPORT_SYMBOL(ib_get_net_dev_by_params);
 EXPORT_SYMBOL(ib_get_net_dev_by_params);
 
 
-static struct ibnl_client_cbs ibnl_ls_cb_table[] = {
+static const struct rdma_nl_cbs ibnl_ls_cb_table[] = {
 	[RDMA_NL_LS_OP_RESOLVE] = {
 	[RDMA_NL_LS_OP_RESOLVE] = {
-		.dump = ib_nl_handle_resolve_resp,
-		.module = THIS_MODULE },
+		.doit = ib_nl_handle_resolve_resp,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
 	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
-		.dump = ib_nl_handle_set_timeout,
-		.module = THIS_MODULE },
+		.doit = ib_nl_handle_set_timeout,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 	[RDMA_NL_LS_OP_IP_RESOLVE] = {
 	[RDMA_NL_LS_OP_IP_RESOLVE] = {
-		.dump = ib_nl_handle_ip_res_resp,
-		.module = THIS_MODULE },
+		.doit = ib_nl_handle_ip_res_resp,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 };
 };
 
 
-static int ib_add_ibnl_clients(void)
-{
-	return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table),
-			       ibnl_ls_cb_table);
-}
-
-static void ib_remove_ibnl_clients(void)
-{
-	ibnl_remove_client(RDMA_NL_LS);
-}
-
 static int __init ib_core_init(void)
 static int __init ib_core_init(void)
 {
 {
 	int ret;
 	int ret;
@@ -1130,9 +1182,9 @@ static int __init ib_core_init(void)
 		goto err_comp;
 		goto err_comp;
 	}
 	}
 
 
-	ret = ibnl_init();
+	ret = rdma_nl_init();
 	if (ret) {
 	if (ret) {
-		pr_warn("Couldn't init IB netlink interface\n");
+		pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
 		goto err_sysfs;
 		goto err_sysfs;
 	}
 	}
 
 
@@ -1154,24 +1206,18 @@ static int __init ib_core_init(void)
 		goto err_mad;
 		goto err_mad;
 	}
 	}
 
 
-	ret = ib_add_ibnl_clients();
-	if (ret) {
-		pr_warn("Couldn't register ibnl clients\n");
-		goto err_sa;
-	}
-
 	ret = register_lsm_notifier(&ibdev_lsm_nb);
 	ret = register_lsm_notifier(&ibdev_lsm_nb);
 	if (ret) {
 	if (ret) {
 		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
 		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
-		goto err_ibnl_clients;
+		goto err_sa;
 	}
 	}
 
 
+	nldev_init();
+	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
 	ib_cache_setup();
 	ib_cache_setup();
 
 
 	return 0;
 	return 0;
 
 
-err_ibnl_clients:
-	ib_remove_ibnl_clients();
 err_sa:
 err_sa:
 	ib_sa_cleanup();
 	ib_sa_cleanup();
 err_mad:
 err_mad:
@@ -1179,7 +1225,7 @@ err_mad:
 err_addr:
 err_addr:
 	addr_cleanup();
 	addr_cleanup();
 err_ibnl:
 err_ibnl:
-	ibnl_cleanup();
+	rdma_nl_exit();
 err_sysfs:
 err_sysfs:
 	class_unregister(&ib_class);
 	class_unregister(&ib_class);
 err_comp:
 err_comp:
@@ -1191,13 +1237,14 @@ err:
 
 
 static void __exit ib_core_cleanup(void)
 static void __exit ib_core_cleanup(void)
 {
 {
-	unregister_lsm_notifier(&ibdev_lsm_nb);
 	ib_cache_cleanup();
 	ib_cache_cleanup();
-	ib_remove_ibnl_clients();
+	nldev_exit();
+	rdma_nl_unregister(RDMA_NL_LS);
+	unregister_lsm_notifier(&ibdev_lsm_nb);
 	ib_sa_cleanup();
 	ib_sa_cleanup();
 	ib_mad_cleanup();
 	ib_mad_cleanup();
 	addr_cleanup();
 	addr_cleanup();
-	ibnl_cleanup();
+	rdma_nl_exit();
 	class_unregister(&ib_class);
 	class_unregister(&ib_class);
 	destroy_workqueue(ib_comp_wq);
 	destroy_workqueue(ib_comp_wq);
 	/* Make sure that any pending umem accounting work is done. */
 	/* Make sure that any pending umem accounting work is done. */

+ 4 - 8
drivers/infiniband/core/iwcm.c

@@ -80,7 +80,7 @@ const char *__attribute_const__ iwcm_reject_msg(int reason)
 }
 }
 EXPORT_SYMBOL(iwcm_reject_msg);
 EXPORT_SYMBOL(iwcm_reject_msg);
 
 
-static struct ibnl_client_cbs iwcm_nl_cb_table[] = {
+static struct rdma_nl_cbs iwcm_nl_cb_table[] = {
 	[RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
 	[RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
 	[RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
 	[RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
 	[RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
 	[RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
@@ -1175,12 +1175,8 @@ static int __init iw_cm_init(void)
 	ret = iwpm_init(RDMA_NL_IWCM);
 	ret = iwpm_init(RDMA_NL_IWCM);
 	if (ret)
 	if (ret)
 		pr_err("iw_cm: couldn't init iwpm\n");
 		pr_err("iw_cm: couldn't init iwpm\n");
-
-	ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table),
-			      iwcm_nl_cb_table);
-	if (ret)
-		pr_err("iw_cm: couldn't register netlink callbacks\n");
-
+	else
+		rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table);
 	iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM);
 	iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM);
 	if (!iwcm_wq)
 	if (!iwcm_wq)
 		return -ENOMEM;
 		return -ENOMEM;
@@ -1200,7 +1196,7 @@ static void __exit iw_cm_cleanup(void)
 {
 {
 	unregister_net_sysctl_table(iwcm_ctl_table_hdr);
 	unregister_net_sysctl_table(iwcm_ctl_table_hdr);
 	destroy_workqueue(iwcm_wq);
 	destroy_workqueue(iwcm_wq);
-	ibnl_remove_client(RDMA_NL_IWCM);
+	rdma_nl_unregister(RDMA_NL_IWCM);
 	iwpm_exit(RDMA_NL_IWCM);
 	iwpm_exit(RDMA_NL_IWCM);
 }
 }
 
 

+ 4 - 16
drivers/infiniband/core/iwpm_msg.c

@@ -42,7 +42,6 @@ int iwpm_valid_pid(void)
 {
 {
 	return iwpm_user_pid > 0;
 	return iwpm_user_pid > 0;
 }
 }
-EXPORT_SYMBOL(iwpm_valid_pid);
 
 
 /*
 /*
  * iwpm_register_pid - Send a netlink query to user space
  * iwpm_register_pid - Send a netlink query to user space
@@ -104,7 +103,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
 	pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
 	pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
 		__func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
 		__func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
 
 
-	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
+	ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
 	if (ret) {
 	if (ret) {
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		iwpm_user_pid = IWPM_PID_UNAVAILABLE;
 		iwpm_user_pid = IWPM_PID_UNAVAILABLE;
@@ -122,7 +121,6 @@ pid_query_error:
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_register_pid);
 
 
 /*
 /*
  * iwpm_add_mapping - Send a netlink add mapping message
  * iwpm_add_mapping - Send a netlink add mapping message
@@ -174,7 +172,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 		goto add_mapping_error;
 		goto add_mapping_error;
 	nlmsg_request->req_buffer = pm_msg;
 	nlmsg_request->req_buffer = pm_msg;
 
 
-	ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+	ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
 	if (ret) {
 	if (ret) {
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		iwpm_user_pid = IWPM_PID_UNDEFINED;
 		iwpm_user_pid = IWPM_PID_UNDEFINED;
@@ -191,7 +189,6 @@ add_mapping_error:
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_add_mapping);
 
 
 /*
 /*
  * iwpm_add_and_query_mapping - Send a netlink add and query
  * iwpm_add_and_query_mapping - Send a netlink add and query
@@ -251,7 +248,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
 		goto query_mapping_error;
 		goto query_mapping_error;
 	nlmsg_request->req_buffer = pm_msg;
 	nlmsg_request->req_buffer = pm_msg;
 
 
-	ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+	ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
 	if (ret) {
 	if (ret) {
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		err_str = "Unable to send a nlmsg";
 		err_str = "Unable to send a nlmsg";
@@ -267,7 +264,6 @@ query_mapping_error:
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 		iwpm_free_nlmsg_request(&nlmsg_request->kref);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_add_and_query_mapping);
 
 
 /*
 /*
  * iwpm_remove_mapping - Send a netlink remove mapping message
  * iwpm_remove_mapping - Send a netlink remove mapping message
@@ -312,7 +308,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
 	if (ret)
 	if (ret)
 		goto remove_mapping_error;
 		goto remove_mapping_error;
 
 
-	ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+	ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
 	if (ret) {
 	if (ret) {
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		skb = NULL; /* skb is freed in the netlink send-op handling */
 		iwpm_user_pid = IWPM_PID_UNDEFINED;
 		iwpm_user_pid = IWPM_PID_UNDEFINED;
@@ -328,7 +324,6 @@ remove_mapping_error:
 		dev_kfree_skb_any(skb);
 		dev_kfree_skb_any(skb);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_remove_mapping);
 
 
 /* netlink attribute policy for the received response to register pid request */
 /* netlink attribute policy for the received response to register pid request */
 static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
 static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
@@ -397,7 +392,6 @@ register_pid_response_exit:
 	up(&nlmsg_request->sem);
 	up(&nlmsg_request->sem);
 	return 0;
 	return 0;
 }
 }
-EXPORT_SYMBOL(iwpm_register_pid_cb);
 
 
 /* netlink attribute policy for the received response to add mapping request */
 /* netlink attribute policy for the received response to add mapping request */
 static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
 static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
@@ -466,7 +460,6 @@ add_mapping_response_exit:
 	up(&nlmsg_request->sem);
 	up(&nlmsg_request->sem);
 	return 0;
 	return 0;
 }
 }
-EXPORT_SYMBOL(iwpm_add_mapping_cb);
 
 
 /* netlink attribute policy for the response to add and query mapping request
 /* netlink attribute policy for the response to add and query mapping request
  * and response with remote address info */
  * and response with remote address info */
@@ -558,7 +551,6 @@ query_mapping_response_exit:
 	up(&nlmsg_request->sem);
 	up(&nlmsg_request->sem);
 	return 0;
 	return 0;
 }
 }
-EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb);
 
 
 /*
 /*
  * iwpm_remote_info_cb - Process a port mapper message, containing
  * iwpm_remote_info_cb - Process a port mapper message, containing
@@ -627,7 +619,6 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 			"remote_info: Mapped remote sockaddr:");
 			"remote_info: Mapped remote sockaddr:");
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_remote_info_cb);
 
 
 /* netlink attribute policy for the received request for mapping info */
 /* netlink attribute policy for the received request for mapping info */
 static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
 static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
@@ -677,7 +668,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid);
 	ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_mapping_info_cb);
 
 
 /* netlink attribute policy for the received mapping info ack */
 /* netlink attribute policy for the received mapping info ack */
 static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
 static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
@@ -707,7 +697,6 @@ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
 	return 0;
 	return 0;
 }
 }
-EXPORT_SYMBOL(iwpm_ack_mapping_info_cb);
 
 
 /* netlink attribute policy for the received port mapper error message */
 /* netlink attribute policy for the received port mapper error message */
 static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
 static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
@@ -751,4 +740,3 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
 	up(&nlmsg_request->sem);
 	up(&nlmsg_request->sem);
 	return 0;
 	return 0;
 }
 }
-EXPORT_SYMBOL(iwpm_mapping_error_cb);

+ 2 - 13
drivers/infiniband/core/iwpm_util.c

@@ -54,8 +54,6 @@ static struct iwpm_admin_data iwpm_admin;
 int iwpm_init(u8 nl_client)
 int iwpm_init(u8 nl_client)
 {
 {
 	int ret = 0;
 	int ret = 0;
-	if (iwpm_valid_client(nl_client))
-		return -EINVAL;
 	mutex_lock(&iwpm_admin_lock);
 	mutex_lock(&iwpm_admin_lock);
 	if (atomic_read(&iwpm_admin.refcount) == 0) {
 	if (atomic_read(&iwpm_admin.refcount) == 0) {
 		iwpm_hash_bucket = kzalloc(IWPM_MAPINFO_HASH_SIZE *
 		iwpm_hash_bucket = kzalloc(IWPM_MAPINFO_HASH_SIZE *
@@ -83,7 +81,6 @@ init_exit:
 	}
 	}
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_init);
 
 
 static void free_hash_bucket(void);
 static void free_hash_bucket(void);
 static void free_reminfo_bucket(void);
 static void free_reminfo_bucket(void);
@@ -109,7 +106,6 @@ int iwpm_exit(u8 nl_client)
 	iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
 	iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
 	return 0;
 	return 0;
 }
 }
-EXPORT_SYMBOL(iwpm_exit);
 
 
 static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
 static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
 					       struct sockaddr_storage *);
 					       struct sockaddr_storage *);
@@ -148,7 +144,6 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
 	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
 	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_create_mapinfo);
 
 
 int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
 int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
 			struct sockaddr_storage *mapped_local_addr)
 			struct sockaddr_storage *mapped_local_addr)
@@ -184,7 +179,6 @@ remove_mapinfo_exit:
 	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
 	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_remove_mapinfo);
 
 
 static void free_hash_bucket(void)
 static void free_hash_bucket(void)
 {
 {
@@ -297,7 +291,6 @@ get_remote_info_exit:
 	spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
 	spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL(iwpm_get_remote_info);
 
 
 struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
 struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
 					u8 nl_client, gfp_t gfp)
 					u8 nl_client, gfp_t gfp)
@@ -383,15 +376,11 @@ int iwpm_get_nlmsg_seq(void)
 
 
 int iwpm_valid_client(u8 nl_client)
 int iwpm_valid_client(u8 nl_client)
 {
 {
-	if (nl_client >= RDMA_NL_NUM_CLIENTS)
-		return 0;
 	return iwpm_admin.client_list[nl_client];
 	return iwpm_admin.client_list[nl_client];
 }
 }
 
 
 void iwpm_set_valid(u8 nl_client, int valid)
 void iwpm_set_valid(u8 nl_client, int valid)
 {
 {
-	if (nl_client >= RDMA_NL_NUM_CLIENTS)
-		return;
 	iwpm_admin.client_list[nl_client] = valid;
 	iwpm_admin.client_list[nl_client] = valid;
 }
 }
 
 
@@ -608,7 +597,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid)
 				&mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
 				&mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
 	if (ret)
 	if (ret)
 		goto mapinfo_num_error;
 		goto mapinfo_num_error;
-	ret = ibnl_unicast(skb, nlh, iwpm_pid);
+	ret = rdma_nl_unicast(skb, iwpm_pid);
 	if (ret) {
 	if (ret) {
 		skb = NULL;
 		skb = NULL;
 		err_str = "Unable to send a nlmsg";
 		err_str = "Unable to send a nlmsg";
@@ -637,7 +626,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
 	nlh->nlmsg_type = NLMSG_DONE;
 	nlh->nlmsg_type = NLMSG_DONE;
-	ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid);
+	ret = rdma_nl_unicast(skb, iwpm_pid);
 	if (ret)
 	if (ret)
 		pr_warn("%s Unable to send a nlmsg\n", __func__);
 		pr_warn("%s Unable to send a nlmsg\n", __func__);
 	return ret;
 	return ret;

+ 163 - 146
drivers/infiniband/core/netlink.c

@@ -1,4 +1,5 @@
 /*
 /*
+ * Copyright (c) 2017 Mellanox Technologies Inc.  All rights reserved.
  * Copyright (c) 2010 Voltaire Inc.  All rights reserved.
  * Copyright (c) 2010 Voltaire Inc.  All rights reserved.
  *
  *
  * This software is available to you under a choice of one of two
  * This software is available to you under a choice of one of two
@@ -39,237 +40,253 @@
 #include <rdma/rdma_netlink.h>
 #include <rdma/rdma_netlink.h>
 #include "core_priv.h"
 #include "core_priv.h"
 
 
-struct ibnl_client {
-	struct list_head		list;
-	int				index;
-	int				nops;
-	const struct ibnl_client_cbs   *cb_table;
-};
+#include "core_priv.h"
 
 
-static DEFINE_MUTEX(ibnl_mutex);
+static DEFINE_MUTEX(rdma_nl_mutex);
 static struct sock *nls;
 static struct sock *nls;
-static LIST_HEAD(client_list);
+static struct {
+	const struct rdma_nl_cbs   *cb_table;
+} rdma_nl_types[RDMA_NL_NUM_CLIENTS];
 
 
-int ibnl_chk_listeners(unsigned int group)
+int rdma_nl_chk_listeners(unsigned int group)
 {
 {
-	if (netlink_has_listeners(nls, group) == 0)
-		return -1;
-	return 0;
+	return (netlink_has_listeners(nls, group)) ? 0 : -1;
 }
 }
+EXPORT_SYMBOL(rdma_nl_chk_listeners);
 
 
-int ibnl_add_client(int index, int nops,
-		    const struct ibnl_client_cbs cb_table[])
+static bool is_nl_msg_valid(unsigned int type, unsigned int op)
 {
 {
-	struct ibnl_client *cur;
-	struct ibnl_client *nl_client;
-
-	nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL);
-	if (!nl_client)
-		return -ENOMEM;
+	static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS - 1] = {
+				  RDMA_NL_RDMA_CM_NUM_OPS,
+				  RDMA_NL_IWPM_NUM_OPS,
+				  0,
+				  RDMA_NL_LS_NUM_OPS,
+				  RDMA_NLDEV_NUM_OPS };
 
 
-	nl_client->index	= index;
-	nl_client->nops		= nops;
-	nl_client->cb_table	= cb_table;
+	/*
+	 * This BUILD_BUG_ON is intended to catch addition of new
+	 * RDMA netlink protocol without updating the array above.
+	 */
+	BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6);
 
 
-	mutex_lock(&ibnl_mutex);
+	if (type > RDMA_NL_NUM_CLIENTS - 1)
+		return false;
 
 
-	list_for_each_entry(cur, &client_list, list) {
-		if (cur->index == index) {
-			pr_warn("Client for %d already exists\n", index);
-			mutex_unlock(&ibnl_mutex);
-			kfree(nl_client);
-			return -EINVAL;
-		}
-	}
+	return (op < max_num_ops[type - 1]) ? true : false;
+}
 
 
-	list_add_tail(&nl_client->list, &client_list);
+static bool is_nl_valid(unsigned int type, unsigned int op)
+{
+	const struct rdma_nl_cbs *cb_table;
 
 
-	mutex_unlock(&ibnl_mutex);
+	if (!is_nl_msg_valid(type, op))
+		return false;
 
 
-	return 0;
+	cb_table = rdma_nl_types[type].cb_table;
+	if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit))
+		return false;
+	return true;
 }
 }
-EXPORT_SYMBOL(ibnl_add_client);
 
 
-int ibnl_remove_client(int index)
+void rdma_nl_register(unsigned int index,
+		      const struct rdma_nl_cbs cb_table[])
 {
 {
-	struct ibnl_client *cur, *next;
-
-	mutex_lock(&ibnl_mutex);
-	list_for_each_entry_safe(cur, next, &client_list, list) {
-		if (cur->index == index) {
-			list_del(&(cur->list));
-			mutex_unlock(&ibnl_mutex);
-			kfree(cur);
-			return 0;
-		}
+	mutex_lock(&rdma_nl_mutex);
+	if (!is_nl_msg_valid(index, 0)) {
+		/*
+		 * None of the clients is interested in the success or
+		 * failure of this call. They want to see a message in the
+		 * error log and continue their initialization. Print a
+		 * warning for them, because getting here is a programmer's
+		 * error.
+		 */
+		mutex_unlock(&rdma_nl_mutex);
+		WARN(true,
+		     "The not-valid %u index was supplied to RDMA netlink\n",
+		     index);
+		return;
 	}
 	}
-	pr_warn("Can't remove callback for client idx %d. Not found\n", index);
-	mutex_unlock(&ibnl_mutex);
 
 
-	return -EINVAL;
+	if (rdma_nl_types[index].cb_table) {
+		mutex_unlock(&rdma_nl_mutex);
+		WARN(true,
+		     "The %u index is already registered in RDMA netlink\n",
+		     index);
+		return;
+	}
+
+	rdma_nl_types[index].cb_table = cb_table;
+	mutex_unlock(&rdma_nl_mutex);
+}
+EXPORT_SYMBOL(rdma_nl_register);
+
+void rdma_nl_unregister(unsigned int index)
+{
+	mutex_lock(&rdma_nl_mutex);
+	rdma_nl_types[index].cb_table = NULL;
+	mutex_unlock(&rdma_nl_mutex);
 }
 }
-EXPORT_SYMBOL(ibnl_remove_client);
+EXPORT_SYMBOL(rdma_nl_unregister);
 
 
 void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
 void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
 		   int len, int client, int op, int flags)
 		   int len, int client, int op, int flags)
 {
 {
-	unsigned char *prev_tail;
-
-	prev_tail = skb_tail_pointer(skb);
-	*nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op),
-			 len, flags);
+	*nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags);
 	if (!*nlh)
 	if (!*nlh)
-		goto out_nlmsg_trim;
-	(*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail;
+		return NULL;
 	return nlmsg_data(*nlh);
 	return nlmsg_data(*nlh);
-
-out_nlmsg_trim:
-	nlmsg_trim(skb, prev_tail);
-	return NULL;
 }
 }
 EXPORT_SYMBOL(ibnl_put_msg);
 EXPORT_SYMBOL(ibnl_put_msg);
 
 
 int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
 int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
 		  int len, void *data, int type)
 		  int len, void *data, int type)
 {
 {
-	unsigned char *prev_tail;
-
-	prev_tail = skb_tail_pointer(skb);
-	if (nla_put(skb, type, len, data))
-		goto nla_put_failure;
-	nlh->nlmsg_len += skb_tail_pointer(skb) - prev_tail;
+	if (nla_put(skb, type, len, data)) {
+		nlmsg_cancel(skb, nlh);
+		return -EMSGSIZE;
+	}
 	return 0;
 	return 0;
-
-nla_put_failure:
-	nlmsg_trim(skb, prev_tail - nlh->nlmsg_len);
-	return -EMSGSIZE;
 }
 }
 EXPORT_SYMBOL(ibnl_put_attr);
 EXPORT_SYMBOL(ibnl_put_attr);
 
 
-static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
-			struct netlink_ext_ack *extack)
+static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
 {
 {
-	struct ibnl_client *client;
 	int type = nlh->nlmsg_type;
 	int type = nlh->nlmsg_type;
-	int index = RDMA_NL_GET_CLIENT(type);
+	unsigned int index = RDMA_NL_GET_CLIENT(type);
 	unsigned int op = RDMA_NL_GET_OP(type);
 	unsigned int op = RDMA_NL_GET_OP(type);
+	const struct rdma_nl_cbs *cb_table;
+
+	if (!is_nl_valid(index, op))
+		return -EINVAL;
+
+	cb_table = rdma_nl_types[index].cb_table;
 
 
-	list_for_each_entry(client, &client_list, list) {
-		if (client->index == index) {
-			if (op >= client->nops || !client->cb_table[op].dump)
-				return -EINVAL;
-
-			/*
-			 * For response or local service set_timeout request,
-			 * there is no need to use netlink_dump_start.
-			 */
-			if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
-			    (index == RDMA_NL_LS &&
-			     op == RDMA_NL_LS_OP_SET_TIMEOUT)) {
-				struct netlink_callback cb = {
-					.skb = skb,
-					.nlh = nlh,
-					.dump = client->cb_table[op].dump,
-					.module = client->cb_table[op].module,
-				};
-
-				return cb.dump(skb, &cb);
-			}
-
-			{
-				struct netlink_dump_control c = {
-					.dump = client->cb_table[op].dump,
-					.module = client->cb_table[op].module,
-				};
-				return netlink_dump_start(nls, skb, nlh, &c);
-			}
-		}
+	if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) &&
+	    !netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* FIXME: Convert IWCM to properly handle doit callbacks */
+	if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM ||
+	    index == RDMA_NL_IWCM) {
+		struct netlink_dump_control c = {
+			.dump = cb_table[op].dump,
+		};
+		return netlink_dump_start(nls, skb, nlh, &c);
 	}
 	}
 
 
-	pr_info("Index %d wasn't found in client list\n", index);
-	return -EINVAL;
+	if (cb_table[op].doit)
+		return cb_table[op].doit(skb, nlh, extack);
+
+	return 0;
 }
 }
 
 
-static void ibnl_rcv_reply_skb(struct sk_buff *skb)
+/*
+ * This function is similar to netlink_rcv_skb with one exception:
+ * It calls to the callback for the netlink messages without NLM_F_REQUEST
+ * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed
+ * for that consumer only.
+ */
+static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
+						   struct nlmsghdr *,
+						   struct netlink_ext_ack *))
 {
 {
+	struct netlink_ext_ack extack = {};
 	struct nlmsghdr *nlh;
 	struct nlmsghdr *nlh;
-	int msglen;
+	int err;
 
 
-	/*
-	 * Process responses until there is no more message or the first
-	 * request. Generally speaking, it is not recommended to mix responses
-	 * with requests.
-	 */
 	while (skb->len >= nlmsg_total_size(0)) {
 	while (skb->len >= nlmsg_total_size(0)) {
+		int msglen;
+
 		nlh = nlmsg_hdr(skb);
 		nlh = nlmsg_hdr(skb);
+		err = 0;
 
 
 		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
 		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
-			return;
-
-		/* Handle response only */
-		if (nlh->nlmsg_flags & NLM_F_REQUEST)
-			return;
-
-		ibnl_rcv_msg(skb, nlh, NULL);
+			return 0;
 
 
+		/*
+		 * Generally speaking, only requests are handled by the
+		 * kernel, but RDMA_NL_LS is different, because it runs a
+		 * backward netlink scheme: the kernel initiates messages
+		 * and waits for a reply with data to keep the pathrecord
+		 * cache in sync.
+		 */
+		if (!(nlh->nlmsg_flags & NLM_F_REQUEST) &&
+		    (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS))
+			goto ack;
+
+		/* Skip control messages */
+		if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
+			goto ack;
+
+		err = cb(skb, nlh, &extack);
+		if (err == -EINTR)
+			goto skip;
+
+ack:
+		if (nlh->nlmsg_flags & NLM_F_ACK || err)
+			netlink_ack(skb, nlh, err, &extack);
+
+skip:
 		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
 		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
 		if (msglen > skb->len)
 		if (msglen > skb->len)
 			msglen = skb->len;
 			msglen = skb->len;
 		skb_pull(skb, msglen);
 		skb_pull(skb, msglen);
 	}
 	}
+
+	return 0;
 }
 }
 
 
-static void ibnl_rcv(struct sk_buff *skb)
+static void rdma_nl_rcv(struct sk_buff *skb)
 {
 {
-	mutex_lock(&ibnl_mutex);
-	ibnl_rcv_reply_skb(skb);
-	netlink_rcv_skb(skb, &ibnl_rcv_msg);
-	mutex_unlock(&ibnl_mutex);
+	mutex_lock(&rdma_nl_mutex);
+	rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg);
+	mutex_unlock(&rdma_nl_mutex);
 }
 }
 
 
-int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
-			__u32 pid)
+int rdma_nl_unicast(struct sk_buff *skb, u32 pid)
+{
+	int err;
+
+	err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT);
+	return (err < 0) ? err : 0;
+}
+EXPORT_SYMBOL(rdma_nl_unicast);
+
+int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid)
 {
 {
 	int err;
 	int err;
 
 
 	err = netlink_unicast(nls, skb, pid, 0);
 	err = netlink_unicast(nls, skb, pid, 0);
 	return (err < 0) ? err : 0;
 	return (err < 0) ? err : 0;
 }
 }
-EXPORT_SYMBOL(ibnl_unicast);
+EXPORT_SYMBOL(rdma_nl_unicast_wait);
 
 
-int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh,
-			unsigned int group, gfp_t flags)
+int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags)
 {
 {
 	return nlmsg_multicast(nls, skb, 0, group, flags);
 	return nlmsg_multicast(nls, skb, 0, group, flags);
 }
 }
-EXPORT_SYMBOL(ibnl_multicast);
+EXPORT_SYMBOL(rdma_nl_multicast);
 
 
-int __init ibnl_init(void)
+int __init rdma_nl_init(void)
 {
 {
 	struct netlink_kernel_cfg cfg = {
 	struct netlink_kernel_cfg cfg = {
-		.input	= ibnl_rcv,
+		.input	= rdma_nl_rcv,
 	};
 	};
 
 
 	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg);
 	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg);
-	if (!nls) {
-		pr_warn("Failed to create netlink socket\n");
+	if (!nls)
 		return -ENOMEM;
 		return -ENOMEM;
-	}
 
 
 	nls->sk_sndtimeo = 10 * HZ;
 	nls->sk_sndtimeo = 10 * HZ;
 	return 0;
 	return 0;
 }
 }
 
 
-void ibnl_cleanup(void)
+void rdma_nl_exit(void)
 {
 {
-	struct ibnl_client *cur, *next;
+	int idx;
 
 
-	mutex_lock(&ibnl_mutex);
-	list_for_each_entry_safe(cur, next, &client_list, list) {
-		list_del(&(cur->list));
-		kfree(cur);
-	}
-	mutex_unlock(&ibnl_mutex);
+	for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
+		rdma_nl_unregister(idx);
 
 
 	netlink_kernel_release(nls);
 	netlink_kernel_release(nls);
 }
 }

+ 322 - 0
drivers/infiniband/core/nldev.c

@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <net/netlink.h>
+#include <rdma/rdma_netlink.h>
+
+#include "core_priv.h"
+
+/*
+ * Attribute policy handed to nlmsg_parse() by the nldev request handlers in
+ * this file. Entries are indexed by RDMA_NLDEV_ATTR_* and declare the netlink
+ * type expected for each attribute, plus a .len bound for the two string
+ * attributes.
+ * NOTE(review): RDMA_NLDEV_ATTR_CAP_FLAGS is emitted by the fill helpers but
+ * has no entry here -- confirm that this is intentional.
+ */
+static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
+	[RDMA_NLDEV_ATTR_DEV_INDEX]     = { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_DEV_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = IB_DEVICE_NAME_MAX - 1},
+	[RDMA_NLDEV_ATTR_PORT_INDEX]	= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_FW_VERSION]	= { .type = NLA_NUL_STRING,
+					    .len = IB_FW_VERSION_NAME_MAX - 1},
+	[RDMA_NLDEV_ATTR_NODE_GUID]	= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_SUBNET_PREFIX]	= { .type = NLA_U64 },
+	[RDMA_NLDEV_ATTR_LID]		= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_SM_LID]	= { .type = NLA_U32 },
+	[RDMA_NLDEV_ATTR_LMC]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_PORT_STATE]	= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 },
+};
+
+/*
+ * fill_dev_info - append the device-level attributes of @device to @msg.
+ * @msg:    netlink message under construction
+ * @device: device whose index, name, capability flags, FW version, GUIDs
+ *          and node type are reported
+ *
+ * RDMA_NLDEV_ATTR_PORT_INDEX carries rdma_end_port(@device) here, not the
+ * index of one particular port.
+ *
+ * Returns 0 on success or -EMSGSIZE as soon as one nla_put*() call fails.
+ */
+static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
+{
+	char fw[IB_FW_VERSION_NAME_MAX];
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name))
+		return -EMSGSIZE;
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device)))
+		return -EMSGSIZE;
+
+	BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64));
+	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
+			      device->attrs.device_cap_flags, 0))
+		return -EMSGSIZE;
+
+	ib_get_device_fw_str(device, fw);
+	/* Emit the FW version only when the string is non-empty (strlen(fw) != 0) */
+	if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw))
+		return -EMSGSIZE;
+
+	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID,
+			      be64_to_cpu(device->node_guid), 0))
+		return -EMSGSIZE;
+	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID,
+			      be64_to_cpu(device->attrs.sys_image_guid), 0))
+		return -EMSGSIZE;
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
+		return -EMSGSIZE;
+	return 0;
+}
+
+/*
+ * fill_port_info - append the attributes of one port of @device to @msg.
+ * @msg:    netlink message under construction
+ * @device: device owning the port
+ * @port:   port number, passed unchanged to ib_query_port()
+ *
+ * The device index, device name and port index are written before the port
+ * is queried. Subnet prefix, LID, SM LID and LMC are only added when
+ * rdma_protocol_ib() is true for the port; port state and physical state are
+ * always added.
+ *
+ * Returns 0 on success, -EMSGSIZE when an nla_put*() call fails, or the
+ * error returned by ib_query_port().
+ */
+static int fill_port_info(struct sk_buff *msg,
+			  struct ib_device *device, u32 port)
+{
+	struct ib_port_attr attr;
+	int ret;
+
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name))
+		return -EMSGSIZE;
+	if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port))
+		return -EMSGSIZE;
+
+	ret = ib_query_port(device, port, &attr);
+	if (ret)
+		return ret;
+
+	BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64));
+	if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
+			      (u64)attr.port_cap_flags, 0))
+		return -EMSGSIZE;
+	if (rdma_protocol_ib(device, port) &&
+	    nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX,
+			      attr.subnet_prefix, 0))
+		return -EMSGSIZE;
+	if (rdma_protocol_ib(device, port)) {
+		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid))
+			return -EMSGSIZE;
+		if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid))
+			return -EMSGSIZE;
+		if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc))
+			return -EMSGSIZE;
+	}
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state))
+		return -EMSGSIZE;
+	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state))
+		return -EMSGSIZE;
+	return 0;
+}
+
+/*
+ * nldev_get_doit - answer a non-dump RDMA_NLDEV_CMD_GET request.
+ * @skb:    request buffer; the reply is unicast to its sender's portid
+ * @nlh:    request header, parsed against nldev_policy
+ * @extack: extended ack, forwarded to nlmsg_parse()
+ *
+ * The request must carry RDMA_NLDEV_ATTR_DEV_INDEX. The matching device is
+ * described with fill_dev_info() in a freshly allocated message.
+ *
+ * Returns -EINVAL on a parse error, a missing device index or an unknown
+ * device, -ENOMEM if the reply cannot be allocated, the fill_dev_info()
+ * error, or otherwise the result of rdma_nl_unicast().
+ */
+static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	struct sk_buff *msg;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+
+	device = __ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	/* From here on nlh refers to the reply header, not the request */
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+			0, 0);
+
+	err = fill_dev_info(msg, device);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	nlmsg_end(msg, nlh);
+
+	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+}
+
+/*
+ * _nldev_get_dumpit - per-device step of the RDMA_NLDEV_CMD_GET dump.
+ * @device: device to describe
+ * @skb:    dump buffer being filled
+ * @cb:     dump state; cb->args[0] holds the index to resume from
+ * @idx:    position of @device in the enumeration
+ *
+ * Passed to ib_enum_all_devs() by nldev_get_dumpit(). Devices whose @idx is
+ * below cb->args[0] are skipped and 0 is returned. Otherwise one NLM_F_MULTI
+ * message is added for the device; if fill_dev_info() fails the message is
+ * cancelled and cb->args[0] keeps @idx, else it is advanced to @idx + 1.
+ * In both of those cases skb->len is returned.
+ */
+static int _nldev_get_dumpit(struct ib_device *device,
+			     struct sk_buff *skb,
+			     struct netlink_callback *cb,
+			     unsigned int idx)
+{
+	int start = cb->args[0];
+	struct nlmsghdr *nlh;
+
+	if (idx < start)
+		return 0;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+			0, NLM_F_MULTI);
+
+	if (fill_dev_info(skb, device)) {
+		nlmsg_cancel(skb, nlh);
+		goto out;
+	}
+
+	nlmsg_end(skb, nlh);
+
+	idx++;
+
+out:	cb->args[0] = idx;
+	return skb->len;
+}
+
+/*
+ * nldev_get_dumpit - dump handler for RDMA_NLDEV_CMD_GET.
+ *
+ * Delegates the iteration to ib_enum_all_devs(), which is given
+ * _nldev_get_dumpit() as the per-device callback, and returns its result.
+ */
+static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	/*
+	 * There is no need to take lock, because
+	 * we are relying on ib_core's lists_rwsem
+	 */
+	return ib_enum_all_devs(_nldev_get_dumpit, skb, cb);
+}
+
+/*
+ * nldev_port_get_doit - answer a non-dump RDMA_NLDEV_CMD_PORT_GET request.
+ * @skb:    request buffer; the reply is unicast to its sender's portid
+ * @nlh:    request header, parsed against nldev_policy
+ * @extack: extended ack, forwarded to nlmsg_parse()
+ *
+ * The request must carry both RDMA_NLDEV_ATTR_DEV_INDEX and
+ * RDMA_NLDEV_ATTR_PORT_INDEX. The selected port is described with
+ * fill_port_info() in a freshly allocated message.
+ *
+ * Returns -EINVAL on a parse error, a missing attribute, an unknown device
+ * or an invalid port, -ENOMEM if the reply cannot be allocated, the
+ * fill_port_info() error, or otherwise the result of rdma_nl_unicast().
+ */
+static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	struct sk_buff *msg;
+	u32 index;
+	u32 port;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, extack);
+	/*
+	 * Both attributes are read below, so both must be present. Checking
+	 * only the port index would let a request without a device index
+	 * hand a NULL attribute to nla_get_u32().
+	 */
+	if (err ||
+	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+	    !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = __ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+	if (!rdma_is_port_valid(device, port))
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	/*
+	 * From here on nlh refers to the reply header, not the request.
+	 * NOTE(review): the reply is typed RDMA_NLDEV_CMD_GET while the dump
+	 * variant uses RDMA_NLDEV_CMD_PORT_GET -- left as is because user
+	 * space may depend on it; confirm which one is intended.
+	 */
+	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+			0, 0);
+
+	err = fill_port_info(msg, device, port);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	nlmsg_end(msg, nlh);
+
+	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+}
+
+/*
+ * nldev_port_get_dumpit - dump handler for RDMA_NLDEV_CMD_PORT_GET.
+ * @skb: dump buffer being filled
+ * @cb:  dump state; cb->nlh is the original request and cb->args[0] the
+ *       number of ports to skip before emitting
+ *
+ * The request must carry RDMA_NLDEV_ATTR_DEV_INDEX. One NLM_F_MULTI message
+ * is added per port, from rdma_start_port() to rdma_end_port() of the device.
+ * When fill_port_info() fails the current message is cancelled and the loop
+ * stops; cb->args[0] is then left at that port's position.
+ *
+ * Returns -EINVAL on a parse error, a missing device index or an unknown
+ * device, otherwise skb->len.
+ */
+static int nldev_port_get_dumpit(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	int start = cb->args[0];
+	struct nlmsghdr *nlh;
+	u32 idx = 0;
+	u32 ifindex;
+	int err;
+	u32 p;
+
+	err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+			  nldev_policy, NULL);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = __ib_device_get_by_index(ifindex);
+	if (!device)
+		return -EINVAL;
+
+	for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
+		/*
+		 * The dumpit function returns all information from specific
+		 * index. This specific index is taken from the netlink
+		 * messages request sent by user and it is available
+		 * in cb->args[0].
+		 *
+		 * Usually, the user doesn't fill this field and it causes
+		 * to return everything.
+		 *
+		 */
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+
+		nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				cb->nlh->nlmsg_seq,
+				RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+						 RDMA_NLDEV_CMD_PORT_GET),
+				0, NLM_F_MULTI);
+
+		if (fill_port_info(skb, device, p)) {
+			nlmsg_cancel(skb, nlh);
+			goto out;
+		}
+		idx++;
+		nlmsg_end(skb, nlh);
+	}
+
+out:	cb->args[0] = idx;
+	return skb->len;
+}
+
+/*
+ * Callback table registered for RDMA_NL_NLDEV by nldev_init(). It is indexed
+ * by RDMA_NLDEV_CMD_*; each command provides a doit handler and a dump
+ * handler.
+ */
+static const struct rdma_nl_cbs nldev_cb_table[] = {
+	[RDMA_NLDEV_CMD_GET] = {
+		.doit = nldev_get_doit,
+		.dump = nldev_get_dumpit,
+	},
+	[RDMA_NLDEV_CMD_PORT_GET] = {
+		.doit = nldev_port_get_doit,
+		.dump = nldev_port_get_dumpit,
+	},
+};
+
+/* Register nldev_cb_table as the RDMA netlink handlers for RDMA_NL_NLDEV. */
+void __init nldev_init(void)
+{
+	rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
+}
+
+/* Drop the RDMA_NL_NLDEV handlers registered by nldev_init(). */
+void __exit nldev_exit(void)
+{
+	rdma_nl_unregister(RDMA_NL_NLDEV);
+}

+ 2 - 0
drivers/infiniband/core/roce_gid_mgmt.c

@@ -44,6 +44,8 @@
 
 
 static struct workqueue_struct *gid_cache_wq;
 static struct workqueue_struct *gid_cache_wq;
 
 
+static struct workqueue_struct *gid_cache_wq;
+
 enum gid_op_type {
 enum gid_op_type {
 	GID_DEL = 0,
 	GID_DEL = 0,
 	GID_ADD
 	GID_ADD

+ 8 - 10
drivers/infiniband/core/sa_query.c

@@ -861,7 +861,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
 	/* Repair the nlmsg header length */
 	/* Repair the nlmsg header length */
 	nlmsg_end(skb, nlh);
 	nlmsg_end(skb, nlh);
 
 
-	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask);
+	ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask);
 	if (!ret)
 	if (!ret)
 		ret = len;
 		ret = len;
 	else
 	else
@@ -1021,9 +1021,9 @@ static void ib_nl_request_timeout(struct work_struct *work)
 }
 }
 
 
 int ib_nl_handle_set_timeout(struct sk_buff *skb,
 int ib_nl_handle_set_timeout(struct sk_buff *skb,
-			     struct netlink_callback *cb)
+			     struct nlmsghdr *nlh,
+			     struct netlink_ext_ack *extack)
 {
 {
-	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
 	int timeout, delta, abs_delta;
 	int timeout, delta, abs_delta;
 	const struct nlattr *attr;
 	const struct nlattr *attr;
 	unsigned long flags;
 	unsigned long flags;
@@ -1033,8 +1033,7 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,
 	int ret;
 	int ret;
 
 
 	if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
 	if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
-	    !(NETLINK_CB(skb).sk) ||
-	    !netlink_capable(skb, CAP_NET_ADMIN))
+	    !(NETLINK_CB(skb).sk))
 		return -EPERM;
 		return -EPERM;
 
 
 	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
 	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
@@ -1098,9 +1097,9 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
 }
 }
 
 
 int ib_nl_handle_resolve_resp(struct sk_buff *skb,
 int ib_nl_handle_resolve_resp(struct sk_buff *skb,
-			      struct netlink_callback *cb)
+			      struct nlmsghdr *nlh,
+			      struct netlink_ext_ack *extack)
 {
 {
-	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
 	unsigned long flags;
 	unsigned long flags;
 	struct ib_sa_query *query;
 	struct ib_sa_query *query;
 	struct ib_mad_send_buf *send_buf;
 	struct ib_mad_send_buf *send_buf;
@@ -1109,8 +1108,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb,
 	int ret;
 	int ret;
 
 
 	if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
 	if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
-	    !(NETLINK_CB(skb).sk) ||
-	    !netlink_capable(skb, CAP_NET_ADMIN))
+	    !(NETLINK_CB(skb).sk))
 		return -EPERM;
 		return -EPERM;
 
 
 	spin_lock_irqsave(&ib_nl_request_lock, flags);
 	spin_lock_irqsave(&ib_nl_request_lock, flags);
@@ -1420,7 +1418,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
 
 
 	if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) &&
 	if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) &&
 	    (!(query->flags & IB_SA_QUERY_OPA))) {
 	    (!(query->flags & IB_SA_QUERY_OPA))) {
-		if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
+		if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) {
 			if (!ib_nl_make_request(query, gfp_mask))
 			if (!ib_nl_make_request(query, gfp_mask))
 				return id;
 				return id;
 		}
 		}

+ 2 - 2
drivers/infiniband/core/sysfs.c

@@ -1210,8 +1210,8 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
 {
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
 
 
-	ib_get_device_fw_str(dev, buf, PAGE_SIZE);
-	strlcat(buf, "\n", PAGE_SIZE);
+	ib_get_device_fw_str(dev, buf);
+	strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX);
 	return strlen(buf);
 	return strlen(buf);
 }
 }
 
 

+ 14 - 3
drivers/infiniband/core/uverbs_cmd.c

@@ -1383,8 +1383,9 @@ static int create_qp(struct ib_uverbs_file *file,
 		attr.rwq_ind_tbl = ind_tbl;
 		attr.rwq_ind_tbl = ind_tbl;
 	}
 	}
 
 
-	if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) +
-		       sizeof(cmd->reserved1)) && cmd->reserved1) {
+	if (cmd_sz > sizeof(*cmd) &&
+	    !ib_is_udata_cleared(ucore, sizeof(*cmd),
+				 cmd_sz - sizeof(*cmd))) {
 		ret = -EOPNOTSUPP;
 		ret = -EOPNOTSUPP;
 		goto err_put;
 		goto err_put;
 	}
 	}
@@ -1482,11 +1483,21 @@ static int create_qp(struct ib_uverbs_file *file,
 				IB_QP_CREATE_MANAGED_SEND |
 				IB_QP_CREATE_MANAGED_SEND |
 				IB_QP_CREATE_MANAGED_RECV |
 				IB_QP_CREATE_MANAGED_RECV |
 				IB_QP_CREATE_SCATTER_FCS |
 				IB_QP_CREATE_SCATTER_FCS |
-				IB_QP_CREATE_CVLAN_STRIPPING)) {
+				IB_QP_CREATE_CVLAN_STRIPPING |
+				IB_QP_CREATE_SOURCE_QPN)) {
 		ret = -EINVAL;
 		ret = -EINVAL;
 		goto err_put;
 		goto err_put;
 	}
 	}
 
 
+	if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) {
+		if (!capable(CAP_NET_RAW)) {
+			ret = -EPERM;
+			goto err_put;
+		}
+
+		attr.source_qpn = cmd->source_qpn;
+	}
+
 	buf = (void *)cmd + sizeof(*cmd);
 	buf = (void *)cmd + sizeof(*cmd);
 	if (cmd_sz > sizeof(*cmd))
 	if (cmd_sz > sizeof(*cmd))
 		if (!(buf[0] == 0 && !memcmp(buf, buf + 1,
 		if (!(buf[0] == 0 && !memcmp(buf, buf + 1,

+ 111 - 6
drivers/infiniband/core/verbs.c

@@ -1244,6 +1244,18 @@ int ib_resolve_eth_dmac(struct ib_device *device,
 	if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) {
 	if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) {
 		rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
 		rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
 				ah_attr->roce.dmac);
 				ah_attr->roce.dmac);
+		return 0;
+	}
+	if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) {
+		if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) {
+			__be32 addr = 0;
+
+			memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4);
+			ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac);
+		} else {
+			ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw,
+					(char *)ah_attr->roce.dmac);
+		}
 	} else {
 	} else {
 		union ib_gid		sgid;
 		union ib_gid		sgid;
 		struct ib_gid_attr	sgid_attr;
 		struct ib_gid_attr	sgid_attr;
@@ -1302,6 +1314,61 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr,
 }
 }
 EXPORT_SYMBOL(ib_modify_qp_with_udata);
 EXPORT_SYMBOL(ib_modify_qp_with_udata);
 
 
+/**
+ * ib_get_eth_speed - Translate an Ethernet port's link speed to IB attributes
+ * @dev: RDMA device that owns the port
+ * @port_num: port to query
+ * @speed: output, set to an IB_SPEED_* value
+ * @width: output, set to an IB_WIDTH_* value
+ *
+ * Queries the net_device behind @port_num through ethtool and maps its speed
+ * to the closest IB speed/width pair. If the query fails, or succeeds but
+ * reports SPEED_UNKNOWN, SPEED_1000 is assumed.
+ *
+ * Return: 0 on success, -EINVAL if the port's link layer is not Ethernet,
+ * -EOPNOTSUPP if the device has no get_netdev callback, -ENODEV if the
+ * callback returns no net_device.
+ */
+int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width)
+{
+	int rc;
+	u32 netdev_speed;
+	struct net_device *netdev;
+	struct ethtool_link_ksettings lksettings;
+
+	if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
+		return -EINVAL;
+
+	if (!dev->get_netdev)
+		return -EOPNOTSUPP;
+
+	netdev = dev->get_netdev(dev, port_num);
+	if (!netdev)
+		return -ENODEV;
+
+	rtnl_lock();
+	rc = __ethtool_get_link_ksettings(netdev, &lksettings);
+	rtnl_unlock();
+
+	/* SPEED_UNKNOWN is -1; as a u32 it would otherwise land in the
+	 * fastest bucket below, so treat it like a failed query.
+	 */
+	if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) {
+		netdev_speed = lksettings.base.speed;
+	} else {
+		netdev_speed = SPEED_1000;
+		if (rc)
+			pr_warn("%s speed is unknown, defaulting to %d\n",
+				netdev->name, netdev_speed);
+	}
+
+	/* Release the net_device only after its last use (netdev->name is
+	 * read by the warning above).
+	 */
+	dev_put(netdev);
+
+	if (netdev_speed <= SPEED_1000) {
+		*width = IB_WIDTH_1X;
+		*speed = IB_SPEED_SDR;
+	} else if (netdev_speed <= SPEED_10000) {
+		*width = IB_WIDTH_1X;
+		*speed = IB_SPEED_FDR10;
+	} else if (netdev_speed <= SPEED_20000) {
+		*width = IB_WIDTH_4X;
+		*speed = IB_SPEED_DDR;
+	} else if (netdev_speed <= SPEED_25000) {
+		*width = IB_WIDTH_1X;
+		*speed = IB_SPEED_EDR;
+	} else if (netdev_speed <= SPEED_40000) {
+		*width = IB_WIDTH_4X;
+		*speed = IB_SPEED_FDR10;
+	} else {
+		*width = IB_WIDTH_4X;
+		*speed = IB_SPEED_EDR;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_get_eth_speed);
+
 int ib_modify_qp(struct ib_qp *qp,
 int ib_modify_qp(struct ib_qp *qp,
 		 struct ib_qp_attr *qp_attr,
 		 struct ib_qp_attr *qp_attr,
 		 int qp_attr_mask)
 		 int qp_attr_mask)
@@ -1569,15 +1636,53 @@ EXPORT_SYMBOL(ib_dealloc_fmr);
 
 
 /* Multicast groups */
 /* Multicast groups */
 
 
+/*
+ * Check whether @lid may be used as a multicast LID for @qp.
+ *
+ * The LID is only validated when the QP is known to sit on an InfiniBand
+ * port, or when every port of the device is InfiniBand. In all other cases
+ * the LID is accepted as is.
+ *
+ * Returns true if the LID is acceptable, false otherwise.
+ */
+static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid)
+{
+	struct ib_qp_init_attr init_attr = {};
+	struct ib_qp_attr attr = {};
+	int num_eth_ports = 0;
+	int port;
+
+	/* If QP state >= init, it is assigned to a port and we can check this
+	 * port only.
+	 */
+	if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) {
+		if (attr.qp_state >= IB_QPS_INIT) {
+			/* rdma_port_get_link_layer() copes with drivers that
+			 * do not implement the get_link_layer callback.
+			 */
+			if (rdma_port_get_link_layer(qp->device, attr.port_num) !=
+			    IB_LINK_LAYER_INFINIBAND)
+				return true;
+			goto lid_check;
+		}
+	}
+
+	/* Can't get a quick answer, iterate over all ports. Port numbers do
+	 * not start at 0 on every device, so use the device's own range.
+	 */
+	for (port = rdma_start_port(qp->device);
+	     port <= rdma_end_port(qp->device); port++)
+		if (rdma_port_get_link_layer(qp->device, port) !=
+		    IB_LINK_LAYER_INFINIBAND)
+			num_eth_ports++;
+
+	/* If we have at least one Ethernet port, RoCE annex declares that
+	 * multicast LID should be ignored. We can't tell at this step if the
+	 * QP belongs to an IB or Ethernet port.
+	 */
+	if (num_eth_ports)
+		return true;
+
+	/* If all the ports are IB, we can check according to IB spec. */
+lid_check:
+	return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
+		 lid == be16_to_cpu(IB_LID_PERMISSIVE));
+}
+
 int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
 int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
 {
 {
 	int ret;
 	int ret;
 
 
 	if (!qp->device->attach_mcast)
 	if (!qp->device->attach_mcast)
 		return -ENOSYS;
 		return -ENOSYS;
-	if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD ||
-	    lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
-	    lid == be16_to_cpu(IB_LID_PERMISSIVE))
+
+	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
+	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
 		return -EINVAL;
 		return -EINVAL;
 
 
 	ret = qp->device->attach_mcast(qp, gid, lid);
 	ret = qp->device->attach_mcast(qp, gid, lid);
@@ -1593,9 +1698,9 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
 
 
 	if (!qp->device->detach_mcast)
 	if (!qp->device->detach_mcast)
 		return -ENOSYS;
 		return -ENOSYS;
-	if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD ||
-	    lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
-	    lid == be16_to_cpu(IB_LID_PERMISSIVE))
+
+	if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
+	    qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
 		return -EINVAL;
 		return -EINVAL;
 
 
 	ret = qp->device->detach_mcast(qp, gid, lid);
 	ret = qp->device->detach_mcast(qp, gid, lid);

+ 25 - 63
drivers/infiniband/hw/bnxt_re/ib_verbs.c

@@ -223,50 +223,6 @@ int bnxt_re_modify_device(struct ib_device *ibdev,
 	return 0;
 	return 0;
 }
 }
 
 
-static void __to_ib_speed_width(struct net_device *netdev, u8 *speed, u8 *width)
-{
-	struct ethtool_link_ksettings lksettings;
-	u32 espeed;
-
-	if (netdev->ethtool_ops && netdev->ethtool_ops->get_link_ksettings) {
-		memset(&lksettings, 0, sizeof(lksettings));
-		rtnl_lock();
-		netdev->ethtool_ops->get_link_ksettings(netdev, &lksettings);
-		rtnl_unlock();
-		espeed = lksettings.base.speed;
-	} else {
-		espeed = SPEED_UNKNOWN;
-	}
-	switch (espeed) {
-	case SPEED_1000:
-		*speed = IB_SPEED_SDR;
-		*width = IB_WIDTH_1X;
-		break;
-	case SPEED_10000:
-		*speed = IB_SPEED_QDR;
-		*width = IB_WIDTH_1X;
-		break;
-	case SPEED_20000:
-		*speed = IB_SPEED_DDR;
-		*width = IB_WIDTH_4X;
-		break;
-	case SPEED_25000:
-		*speed = IB_SPEED_EDR;
-		*width = IB_WIDTH_1X;
-		break;
-	case SPEED_40000:
-		*speed = IB_SPEED_QDR;
-		*width = IB_WIDTH_4X;
-		break;
-	case SPEED_50000:
-		break;
-	default:
-		*speed = IB_SPEED_SDR;
-		*width = IB_WIDTH_1X;
-		break;
-	}
-}
-
 /* Port */
 /* Port */
 int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 		       struct ib_port_attr *port_attr)
 		       struct ib_port_attr *port_attr)
@@ -308,25 +264,9 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 	 * IB stack to avoid race in the NETDEV_UNREG path
 	 * IB stack to avoid race in the NETDEV_UNREG path
 	 */
 	 */
 	if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
 	if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
-		__to_ib_speed_width(rdev->netdev, &port_attr->active_speed,
-				    &port_attr->active_width);
-	return 0;
-}
-
-int bnxt_re_modify_port(struct ib_device *ibdev, u8 port_num,
-			int port_modify_mask,
-			struct ib_port_modify *port_modify)
-{
-	switch (port_modify_mask) {
-	case IB_PORT_SHUTDOWN:
-		break;
-	case IB_PORT_INIT_TYPE:
-		break;
-	case IB_PORT_RESET_QKEY_CNTR:
-		break;
-	default:
-		break;
-	}
+		/* ib_get_eth_speed() returns 0 on success; fail only on error */
+		if (ib_get_eth_speed(ibdev, port_num, &port_attr->active_speed,
+				     &port_attr->active_width))
+			return -EINVAL;
 	return 0;
 	return 0;
 }
 }
 
 
@@ -846,6 +786,7 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
 	struct bnxt_re_dev *rdev = qp->rdev;
 	struct bnxt_re_dev *rdev = qp->rdev;
 	int rc;
 	int rc;
 
 
+	bnxt_qplib_del_flush_qp(&qp->qplib_qp);
 	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
 	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
 	if (rc) {
 	if (rc) {
 		dev_err(rdev_to_dev(rdev), "Failed to destroy HW QP");
 		dev_err(rdev_to_dev(rdev), "Failed to destroy HW QP");
@@ -860,6 +801,7 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
 			return rc;
 			return rc;
 		}
 		}
 
 
+		bnxt_qplib_del_flush_qp(&qp->qplib_qp);
 		rc = bnxt_qplib_destroy_qp(&rdev->qplib_res,
 		rc = bnxt_qplib_destroy_qp(&rdev->qplib_res,
 					   &rdev->qp1_sqp->qplib_qp);
 					   &rdev->qp1_sqp->qplib_qp);
 		if (rc) {
 		if (rc) {
@@ -1404,6 +1346,21 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 		}
 		}
 		qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE;
 		qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE;
 		qp->qplib_qp.state = __from_ib_qp_state(qp_attr->qp_state);
 		qp->qplib_qp.state = __from_ib_qp_state(qp_attr->qp_state);
+
+		if (!qp->sumem &&
+		    qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+			dev_dbg(rdev_to_dev(rdev),
+				"Move QP = %p to flush list\n",
+				qp);
+			bnxt_qplib_add_flush_qp(&qp->qplib_qp);
+		}
+		if (!qp->sumem &&
+		    qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_RESET) {
+			dev_dbg(rdev_to_dev(rdev),
+				"Move QP = %p out of flush list\n",
+				qp);
+			bnxt_qplib_del_flush_qp(&qp->qplib_qp);
+		}
 	}
 	}
 	if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
 	if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
 		qp->qplib_qp.modify_flags |=
 		qp->qplib_qp.modify_flags |=
@@ -2414,6 +2371,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
 	}
 	}
 	cq->qplib_cq.max_wqe = entries;
 	cq->qplib_cq.max_wqe = entries;
 	cq->qplib_cq.cnq_hw_ring_id = rdev->nq.ring_id;
 	cq->qplib_cq.cnq_hw_ring_id = rdev->nq.ring_id;
+	cq->qplib_cq.nq	= &rdev->nq;
 
 
 	rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq);
 	rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq);
 	if (rc) {
 	if (rc) {
@@ -2921,6 +2879,10 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 					sq->send_phantom = false;
 					sq->send_phantom = false;
 			}
 			}
 		}
 		}
+		if (ncqe < budget)
+			ncqe += bnxt_qplib_process_flush_list(&cq->qplib_cq,
+							      cqe + ncqe,
+							      budget - ncqe);
 
 
 		if (!ncqe)
 		if (!ncqe)
 			break;
 			break;

+ 0 - 3
drivers/infiniband/hw/bnxt_re/ib_verbs.h

@@ -141,9 +141,6 @@ int bnxt_re_modify_device(struct ib_device *ibdev,
 			  struct ib_device_modify *device_modify);
 			  struct ib_device_modify *device_modify);
 int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 		       struct ib_port_attr *port_attr);
 		       struct ib_port_attr *port_attr);
-int bnxt_re_modify_port(struct ib_device *ibdev, u8 port_num,
-			int port_modify_mask,
-			struct ib_port_modify *port_modify);
 int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
 int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
 			       struct ib_port_immutable *immutable);
 			       struct ib_port_immutable *immutable);
 int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,
 int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num,

+ 49 - 9
drivers/infiniband/hw/bnxt_re/main.c

@@ -70,7 +70,6 @@ static char version[] =
 MODULE_AUTHOR("Eddie Wai <eddie.wai@broadcom.com>");
 MODULE_AUTHOR("Eddie Wai <eddie.wai@broadcom.com>");
 MODULE_DESCRIPTION(BNXT_RE_DESC " Driver");
 MODULE_DESCRIPTION(BNXT_RE_DESC " Driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(ROCE_DRV_MODULE_VERSION);
 
 
 /* globals */
 /* globals */
 static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list);
 static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list);
@@ -474,7 +473,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 	ibdev->modify_device		= bnxt_re_modify_device;
 	ibdev->modify_device		= bnxt_re_modify_device;
 
 
 	ibdev->query_port		= bnxt_re_query_port;
 	ibdev->query_port		= bnxt_re_query_port;
-	ibdev->modify_port		= bnxt_re_modify_port;
 	ibdev->get_port_immutable	= bnxt_re_get_port_immutable;
 	ibdev->get_port_immutable	= bnxt_re_get_port_immutable;
 	ibdev->query_pkey		= bnxt_re_query_pkey;
 	ibdev->query_pkey		= bnxt_re_query_pkey;
 	ibdev->query_gid		= bnxt_re_query_gid;
 	ibdev->query_gid		= bnxt_re_query_gid;
@@ -835,6 +833,42 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev)
 	mutex_unlock(&rdev->qp_lock);
 	mutex_unlock(&rdev->qp_lock);
 }
 }
 
 
+/*
+ * Re-issue bnxt_qplib_update_sgid() for every populated, non-VLAN entry among
+ * the first sgid_tbl.active slots of the SGID table, passing the netdev's
+ * current dev_addr. Does nothing (returns 0) until the IB device is
+ * registered. Returns the result of the last update issued, or 0 if none.
+ */
+static int bnxt_re_update_gid(struct bnxt_re_dev *rdev)
+{
+	struct bnxt_qplib_sgid_tbl *tbl = &rdev->qplib_res.sgid_tbl;
+	struct bnxt_qplib_gid entry;
+	u16 hw_idx, i;
+	int ret = 0;
+
+	if (!test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
+		return 0;
+
+	if (!tbl) {
+		dev_err(rdev_to_dev(rdev), "QPLIB: SGID table not allocated");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < tbl->active; i++) {
+		bool unused;
+
+		hw_idx = tbl->hw_id[i];
+		unused = !memcmp(&tbl->tbl[i], &bnxt_qplib_gid_zero,
+				 sizeof(bnxt_qplib_gid_zero));
+
+		/* VLAN GIDs already got their VLAN-enable setting when they
+		 * were added; only non-VLAN GIDs need to be re-programmed.
+		 */
+		if (unused || tbl->vlan[i])
+			continue;
+
+		memcpy(&entry, &tbl->tbl[i], sizeof(entry));
+		ret = bnxt_qplib_update_sgid(tbl, &entry, hw_idx,
+					     rdev->qplib_res.netdev->dev_addr);
+	}
+
+	return ret;
+}
+
 static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev)
 static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev)
 {
 {
 	u32 prio_map = 0, tmp_map = 0;
 	u32 prio_map = 0, tmp_map = 0;
@@ -854,8 +888,6 @@ static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev)
 	tmp_map = dcb_ieee_getapp_mask(netdev, &app);
 	tmp_map = dcb_ieee_getapp_mask(netdev, &app);
 	prio_map |= tmp_map;
 	prio_map |= tmp_map;
 
 
-	if (!prio_map)
-		prio_map = -EFAULT;
 	return prio_map;
 	return prio_map;
 }
 }
 
 
@@ -881,10 +913,7 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev)
 	int rc;
 	int rc;
 
 
 	/* Get priority for roce */
 	/* Get priority for roce */
-	rc = bnxt_re_get_priority_mask(rdev);
-	if (rc < 0)
-		return rc;
-	prio_map = (u8)rc;
+	prio_map = bnxt_re_get_priority_mask(rdev);
 
 
 	if (prio_map == rdev->cur_prio_map)
 	if (prio_map == rdev->cur_prio_map)
 		return 0;
 		return 0;
@@ -906,6 +935,16 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev)
 		return rc;
 		return rc;
 	}
 	}
 
 
+	/* Actual priorities are not programmed as they are already
+	 * done by L2 driver; just enable or disable priority vlan tagging
+	 */
+	if ((prio_map == 0 && rdev->qplib_res.prio) ||
+	    (prio_map != 0 && !rdev->qplib_res.prio)) {
+		rdev->qplib_res.prio = prio_map ? true : false;
+
+		bnxt_re_update_gid(rdev);
+	}
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -998,7 +1037,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	/* Establish RCFW Communication Channel to initialize the context
 	/* Establish RCFW Communication Channel to initialize the context
 	 * memory for the function and all child VFs
 	 * memory for the function and all child VFs
 	 */
 	 */
-	rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw);
+	rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+					   BNXT_RE_MAX_QPC_COUNT);
 	if (rc)
 	if (rc)
 		goto fail;
 		goto fail;
 
 

+ 380 - 85
drivers/infiniband/hw/bnxt_re/qplib_fp.c

@@ -51,6 +51,168 @@
 #include "qplib_fp.h"
 #include "qplib_fp.h"
 
 
 static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq);
 static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq);
+static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp);
+
+/* Clear the send queue's phantom-WQE bookkeeping flags for @qp. */
+static void bnxt_qplib_cancel_phantom_processing(struct bnxt_qplib_qp *qp)
+{
+	struct bnxt_qplib_q *sq = &qp->sq;
+
+	sq->condition = false;
+	sq->send_phantom = false;
+	sq->single = false;
+}
+
+/* Flush list */
+/*
+ * Link @qp onto the flush lists of its CQs: the SQ goes on the send CQ's
+ * sqf_head list and, unless the QP uses an SRQ, the RQ goes on the receive
+ * CQ's rqf_head list. Each queue is added at most once, tracked by its
+ * 'flushed' flag. No locking is done here.
+ */
+static void __bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp)
+{
+	struct bnxt_qplib_cq *send_cq = qp->scq;
+	struct bnxt_qplib_cq *recv_cq = qp->rcq;
+
+	if (!qp->sq.flushed) {
+		dev_dbg(&send_cq->hwq.pdev->dev,
+			"QPLIB: FP: Adding to SQ Flush list = %p",
+			qp);
+		bnxt_qplib_cancel_phantom_processing(qp);
+		list_add_tail(&qp->sq_flush, &send_cq->sqf_head);
+		qp->sq.flushed = true;
+	}
+
+	/* Nothing to do for the RQ when an SRQ is used or it is already queued */
+	if (qp->srq || qp->rq.flushed)
+		return;
+
+	dev_dbg(&recv_cq->hwq.pdev->dev,
+		"QPLIB: FP: Adding to RQ Flush list = %p",
+		qp);
+	list_add_tail(&qp->rq_flush, &recv_cq->rqf_head);
+	qp->rq.flushed = true;
+}
+
+/*
+ * Take the hwq locks of both CQs attached to @qp.
+ *
+ * The send CQ lock is taken first with interrupt state saved into *@flags,
+ * then the receive CQ lock. When the QP uses the same CQ for both directions
+ * only that one lock is taken; __acquire() is annotation-only and keeps the
+ * lock annotations balanced with the __acquires() markers on the signature.
+ */
+void bnxt_qplib_acquire_cq_locks(struct bnxt_qplib_qp *qp,
+				 unsigned long *flags)
+	__acquires(&qp->scq->hwq.lock) __acquires(&qp->rcq->hwq.lock)
+{
+	spin_lock_irqsave(&qp->scq->hwq.lock, *flags);
+	if (qp->scq == qp->rcq)
+		__acquire(&qp->rcq->hwq.lock);
+	else
+		spin_lock(&qp->rcq->hwq.lock);
+}
+
+/*
+ * Drop the CQ locks taken by bnxt_qplib_acquire_cq_locks(), in reverse order:
+ * the receive CQ lock first (annotation-only when both CQs are the same
+ * object), then the send CQ lock, restoring the interrupt state from *@flags.
+ */
+void bnxt_qplib_release_cq_locks(struct bnxt_qplib_qp *qp,
+				 unsigned long *flags)
+	__releases(&qp->scq->hwq.lock) __releases(&qp->rcq->hwq.lock)
+{
+	if (qp->scq == qp->rcq)
+		__release(&qp->rcq->hwq.lock);
+	else
+		spin_unlock(&qp->rcq->hwq.lock);
+	spin_unlock_irqrestore(&qp->scq->hwq.lock, *flags);
+}
+
+/*
+ * Return the "other" CQ of @qp relative to @cq: the receive CQ when @cq is
+ * the send CQ, the send CQ otherwise. Returns NULL when the QP uses one CQ
+ * for both send and receive.
+ */
+static struct bnxt_qplib_cq *bnxt_qplib_find_buddy_cq(struct bnxt_qplib_qp *qp,
+						      struct bnxt_qplib_cq *cq)
+{
+	if (qp->scq == qp->rcq)
+		return NULL;
+
+	return (qp->scq == cq) ? qp->rcq : qp->scq;
+}
+
+/*
+ * Lock the buddy of @cq for @qp, i.e. the QP's other CQ as returned by
+ * bnxt_qplib_find_buddy_cq(). If there is no buddy (send and receive share
+ * one CQ) no real lock is taken; only the annotation-only __acquire() is
+ * emitted.
+ * NOTE(review): presumably the caller already holds cq->hwq.lock - confirm
+ * against the poll path.
+ */
+static void bnxt_qplib_lock_buddy_cq(struct bnxt_qplib_qp *qp,
+				     struct bnxt_qplib_cq *cq)
+	__acquires(&buddy_cq->hwq.lock)
+{
+	struct bnxt_qplib_cq *buddy_cq = NULL;
+
+	buddy_cq = bnxt_qplib_find_buddy_cq(qp, cq);
+	if (!buddy_cq)
+		__acquire(&cq->hwq.lock);
+	else
+		spin_lock(&buddy_cq->hwq.lock);
+}
+
+/*
+ * Counterpart of bnxt_qplib_lock_buddy_cq(): unlock the buddy CQ of @cq for
+ * @qp, or emit only the annotation-only __release() when the QP has a single
+ * CQ and therefore no buddy.
+ */
+static void bnxt_qplib_unlock_buddy_cq(struct bnxt_qplib_qp *qp,
+				       struct bnxt_qplib_cq *cq)
+	__releases(&buddy_cq->hwq.lock)
+{
+	struct bnxt_qplib_cq *buddy_cq = NULL;
+
+	buddy_cq = bnxt_qplib_find_buddy_cq(qp, cq);
+	if (!buddy_cq)
+		__release(&cq->hwq.lock);
+	else
+		spin_unlock(&buddy_cq->hwq.lock);
+}
+
+/*
+ * Locked wrapper around __bnxt_qplib_add_flush_qp(): puts @qp on the flush
+ * lists of its CQs while holding both CQ hwq locks.
+ */
+void bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp)
+{
+	unsigned long flags;
+
+	bnxt_qplib_acquire_cq_locks(qp, &flags);
+	__bnxt_qplib_add_flush_qp(qp);
+	bnxt_qplib_release_cq_locks(qp, &flags);
+}
+
+/*
+ * Unlink @qp from the CQ flush lists it is currently on and clear the
+ * matching 'flushed' flags. The RQ side is skipped when the QP uses an SRQ.
+ * No locking is done here.
+ */
+static void __bnxt_qplib_del_flush_qp(struct bnxt_qplib_qp *qp)
+{
+	if (qp->sq.flushed) {
+		qp->sq.flushed = false;
+		list_del(&qp->sq_flush);
+	}
+	if (!qp->srq) {
+		if (qp->rq.flushed) {
+			qp->rq.flushed = false;
+			list_del(&qp->rq_flush);
+		}
+	}
+}
+
+/*
+ * Take @qp off the CQ flush lists and reset its queue indices.
+ *
+ * With both CQ hwq locks held: calls __clean_cq() on the send and receive
+ * CQs with the QP pointer as handle (presumably to drop CQEs still
+ * referencing this QP - __clean_cq() body is not visible here), zeroes the
+ * SQ and RQ producer/consumer indices, then unlinks the QP from the flush
+ * lists.
+ */
+void bnxt_qplib_del_flush_qp(struct bnxt_qplib_qp *qp)
+{
+	unsigned long flags;
+
+	bnxt_qplib_acquire_cq_locks(qp, &flags);
+	__clean_cq(qp->scq, (u64)(unsigned long)qp);
+	qp->sq.hwq.prod = 0;
+	qp->sq.hwq.cons = 0;
+	__clean_cq(qp->rcq, (u64)(unsigned long)qp);
+	qp->rq.hwq.prod = 0;
+	qp->rq.hwq.cons = 0;
+
+	__bnxt_qplib_del_flush_qp(qp);
+	bnxt_qplib_release_cq_locks(qp, &flags);
+}
+
+/*
+ * Work handler for a bnxt_qplib_nq_work item. If the item names both a CQ
+ * and an NQ, and the CQ's arm_state is non-zero and the NQ has a cqn_handler,
+ * invoke that handler under the CQ's compl_lock. The work item is always
+ * freed before returning.
+ */
+static void bnxt_qpn_cqn_sched_task(struct work_struct *work)
+{
+	struct bnxt_qplib_nq_work *item;
+	struct bnxt_qplib_cq *cq;
+	struct bnxt_qplib_nq *nq;
+
+	item = container_of(work, struct bnxt_qplib_nq_work, work);
+	cq = item->cq;
+	nq = item->nq;
+
+	if (!cq || !nq)
+		goto free_item;
+
+	spin_lock_bh(&cq->compl_lock);
+	if (atomic_read(&cq->arm_state) && nq->cqn_handler) {
+		dev_dbg(&nq->pdev->dev,
+			"%s:Trigger cq  = %p event nq = %p\n",
+			__func__, cq, nq);
+		nq->cqn_handler(nq, cq);
+	}
+	spin_unlock_bh(&cq->compl_lock);
+
+free_item:
+	kfree(item);
+}
 
 
 static void bnxt_qplib_free_qp_hdr_buf(struct bnxt_qplib_res *res,
 static void bnxt_qplib_free_qp_hdr_buf(struct bnxt_qplib_res *res,
 				       struct bnxt_qplib_qp *qp)
 				       struct bnxt_qplib_qp *qp)
@@ -119,6 +281,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 	struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data;
 	struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data;
 	struct bnxt_qplib_hwq *hwq = &nq->hwq;
 	struct bnxt_qplib_hwq *hwq = &nq->hwq;
 	struct nq_base *nqe, **nq_ptr;
 	struct nq_base *nqe, **nq_ptr;
+	struct bnxt_qplib_cq *cq;
 	int num_cqne_processed = 0;
 	int num_cqne_processed = 0;
 	u32 sw_cons, raw_cons;
 	u32 sw_cons, raw_cons;
 	u16 type;
 	u16 type;
@@ -143,15 +306,17 @@ static void bnxt_qplib_service_nq(unsigned long data)
 			q_handle = le32_to_cpu(nqcne->cq_handle_low);
 			q_handle = le32_to_cpu(nqcne->cq_handle_low);
 			q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high)
 			q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high)
 						     << 32;
 						     << 32;
-			bnxt_qplib_arm_cq_enable((struct bnxt_qplib_cq *)
-						 ((unsigned long)q_handle));
-			if (!nq->cqn_handler(nq, (struct bnxt_qplib_cq *)
-						 ((unsigned long)q_handle)))
+			cq = (struct bnxt_qplib_cq *)(unsigned long)q_handle;
+			bnxt_qplib_arm_cq_enable(cq);
+			spin_lock_bh(&cq->compl_lock);
+			atomic_set(&cq->arm_state, 0);
+			if (!nq->cqn_handler(nq, (cq)))
 				num_cqne_processed++;
 				num_cqne_processed++;
 			else
 			else
 				dev_warn(&nq->pdev->dev,
 				dev_warn(&nq->pdev->dev,
 					 "QPLIB: cqn - type 0x%x not handled",
 					 "QPLIB: cqn - type 0x%x not handled",
 					 type);
 					 type);
+			spin_unlock_bh(&cq->compl_lock);
 			break;
 			break;
 		}
 		}
 		case NQ_BASE_TYPE_DBQ_EVENT:
 		case NQ_BASE_TYPE_DBQ_EVENT:
@@ -190,6 +355,10 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
 
 
 void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
 void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
 {
 {
+	if (nq->cqn_wq) {
+		destroy_workqueue(nq->cqn_wq);
+		nq->cqn_wq = NULL;
+	}
 	/* Make sure the HW is stopped! */
 	/* Make sure the HW is stopped! */
 	synchronize_irq(nq->vector);
 	synchronize_irq(nq->vector);
 	tasklet_disable(&nq->worker);
 	tasklet_disable(&nq->worker);
@@ -216,7 +385,7 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 					     void *, u8 event))
 					     void *, u8 event))
 {
 {
 	resource_size_t nq_base;
 	resource_size_t nq_base;
-	int rc;
+	int rc = -1;
 
 
 	nq->pdev = pdev;
 	nq->pdev = pdev;
 	nq->vector = msix_vector;
 	nq->vector = msix_vector;
@@ -227,6 +396,11 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 
 
 	tasklet_init(&nq->worker, bnxt_qplib_service_nq, (unsigned long)nq);
 	tasklet_init(&nq->worker, bnxt_qplib_service_nq, (unsigned long)nq);
 
 
+	/* Have a task to schedule CQ notifiers in post send case */
+	nq->cqn_wq  = create_singlethread_workqueue("bnxt_qplib_nq");
+	if (!nq->cqn_wq)
+		goto fail;
+
 	nq->requested = false;
 	nq->requested = false;
 	rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, "bnxt_qplib_nq", nq);
 	rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, "bnxt_qplib_nq", nq);
 	if (rc) {
 	if (rc) {
@@ -401,8 +575,8 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 
 
 	qp->id = le32_to_cpu(resp.xid);
 	qp->id = le32_to_cpu(resp.xid);
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
-	sq->flush_in_progress = false;
-	rq->flush_in_progress = false;
+	rcfw->qp_tbl[qp->id].qp_id = qp->id;
+	rcfw->qp_tbl[qp->id].qp_handle = (void *)qp;
 
 
 	return 0;
 	return 0;
 
 
@@ -615,8 +789,10 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 
 
 	qp->id = le32_to_cpu(resp.xid);
 	qp->id = le32_to_cpu(resp.xid);
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
-	sq->flush_in_progress = false;
-	rq->flush_in_progress = false;
+	INIT_LIST_HEAD(&qp->sq_flush);
+	INIT_LIST_HEAD(&qp->rq_flush);
+	rcfw->qp_tbl[qp->id].qp_id = qp->id;
+	rcfw->qp_tbl[qp->id].qp_handle = (void *)qp;
 
 
 	return 0;
 	return 0;
 
 
@@ -963,13 +1139,19 @@ int bnxt_qplib_destroy_qp(struct bnxt_qplib_res *res,
 	u16 cmd_flags = 0;
 	u16 cmd_flags = 0;
 	int rc;
 	int rc;
 
 
+	rcfw->qp_tbl[qp->id].qp_id = BNXT_QPLIB_QP_ID_INVALID;
+	rcfw->qp_tbl[qp->id].qp_handle = NULL;
+
 	RCFW_CMD_PREP(req, DESTROY_QP, cmd_flags);
 	RCFW_CMD_PREP(req, DESTROY_QP, cmd_flags);
 
 
 	req.qp_cid = cpu_to_le32(qp->id);
 	req.qp_cid = cpu_to_le32(qp->id);
 	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
 	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
 					  (void *)&resp, NULL, 0);
 					  (void *)&resp, NULL, 0);
-	if (rc)
+	if (rc) {
+		rcfw->qp_tbl[qp->id].qp_id = qp->id;
+		rcfw->qp_tbl[qp->id].qp_handle = qp;
 		return rc;
 		return rc;
+	}
 
 
 	/* Must walk the associated CQs to nullified the QP ptr */
 	/* Must walk the associated CQs to nullified the QP ptr */
 	spin_lock_irqsave(&qp->scq->hwq.lock, flags);
 	spin_lock_irqsave(&qp->scq->hwq.lock, flags);
@@ -1074,14 +1256,21 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
 	struct bnxt_qplib_swq *swq;
 	struct bnxt_qplib_swq *swq;
 	struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr;
 	struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr;
 	struct sq_sge *hw_sge;
 	struct sq_sge *hw_sge;
+	struct bnxt_qplib_nq_work *nq_work = NULL;
+	bool sch_handler = false;
 	u32 sw_prod;
 	u32 sw_prod;
 	u8 wqe_size16;
 	u8 wqe_size16;
 	int i, rc = 0, data_len = 0, pkt_num = 0;
 	int i, rc = 0, data_len = 0, pkt_num = 0;
 	__le32 temp32;
 	__le32 temp32;
 
 
 	if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) {
 	if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) {
-		rc = -EINVAL;
-		goto done;
+		if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+			sch_handler = true;
+			dev_dbg(&sq->hwq.pdev->dev,
+				"%s Error QP. Scheduling for poll_cq\n",
+				__func__);
+			goto queue_err;
+		}
 	}
 	}
 
 
 	if (bnxt_qplib_queue_full(sq)) {
 	if (bnxt_qplib_queue_full(sq)) {
@@ -1301,12 +1490,35 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
 			((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
 			((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
 			 SQ_PSN_SEARCH_NEXT_PSN_MASK));
 			 SQ_PSN_SEARCH_NEXT_PSN_MASK));
 	}
 	}
-
+queue_err:
+	if (sch_handler) {
+		/* Store the ULP info in the software structures */
+		sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+		swq = &sq->swq[sw_prod];
+		swq->wr_id = wqe->wr_id;
+		swq->type = wqe->type;
+		swq->flags = wqe->flags;
+		if (qp->sig_type)
+			swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP;
+		swq->start_psn = sq->psn & BTH_PSN_MASK;
+	}
 	sq->hwq.prod++;
 	sq->hwq.prod++;
-
 	qp->wqe_cnt++;
 	qp->wqe_cnt++;
 
 
 done:
 done:
+	if (sch_handler) {
+		nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC);
+		if (nq_work) {
+			nq_work->cq = qp->scq;
+			nq_work->nq = qp->scq->nq;
+			INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task);
+			queue_work(qp->scq->nq->cqn_wq, &nq_work->work);
+		} else {
+			dev_err(&sq->hwq.pdev->dev,
+				"QPLIB: FP: Failed to allocate SQ nq_work!");
+			rc = -ENOMEM;
+		}
+	}
 	return rc;
 	return rc;
 }
 }
 
 
@@ -1334,15 +1546,17 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
 	struct bnxt_qplib_q *rq = &qp->rq;
 	struct bnxt_qplib_q *rq = &qp->rq;
 	struct rq_wqe *rqe, **rqe_ptr;
 	struct rq_wqe *rqe, **rqe_ptr;
 	struct sq_sge *hw_sge;
 	struct sq_sge *hw_sge;
+	struct bnxt_qplib_nq_work *nq_work = NULL;
+	bool sch_handler = false;
 	u32 sw_prod;
 	u32 sw_prod;
 	int i, rc = 0;
 	int i, rc = 0;
 
 
 	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
 	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
-		dev_err(&rq->hwq.pdev->dev,
-			"QPLIB: FP: QP (0x%x) is in the 0x%x state",
-			qp->id, qp->state);
-		rc = -EINVAL;
-		goto done;
+		sch_handler = true;
+		dev_dbg(&rq->hwq.pdev->dev,
+			"%s Error QP. Scheduling for poll_cq\n",
+			__func__);
+		goto queue_err;
 	}
 	}
 	if (bnxt_qplib_queue_full(rq)) {
 	if (bnxt_qplib_queue_full(rq)) {
 		dev_err(&rq->hwq.pdev->dev,
 		dev_err(&rq->hwq.pdev->dev,
@@ -1378,7 +1592,27 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
 	/* Supply the rqe->wr_id index to the wr_id_tbl for now */
 	/* Supply the rqe->wr_id index to the wr_id_tbl for now */
 	rqe->wr_id[0] = cpu_to_le32(sw_prod);
 	rqe->wr_id[0] = cpu_to_le32(sw_prod);
 
 
+queue_err:
+	if (sch_handler) {
+		/* Store the ULP info in the software structures */
+		sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+		rq->swq[sw_prod].wr_id = wqe->wr_id;
+	}
+
 	rq->hwq.prod++;
 	rq->hwq.prod++;
+	if (sch_handler) {
+		nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC);
+		if (nq_work) {
+			nq_work->cq = qp->rcq;
+			nq_work->nq = qp->rcq->nq;
+			INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task);
+			queue_work(qp->rcq->nq->cqn_wq, &nq_work->work);
+		} else {
+			dev_err(&rq->hwq.pdev->dev,
+				"QPLIB: FP: Failed to allocate RQ nq_work!");
+			rc = -ENOMEM;
+		}
+	}
 done:
 done:
 	return rc;
 	return rc;
 }
 }
@@ -1471,6 +1705,9 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 	cq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
 	cq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
 	cq->period = BNXT_QPLIB_QUEUE_START_PERIOD;
 	cq->period = BNXT_QPLIB_QUEUE_START_PERIOD;
 	init_waitqueue_head(&cq->waitq);
 	init_waitqueue_head(&cq->waitq);
+	INIT_LIST_HEAD(&cq->sqf_head);
+	INIT_LIST_HEAD(&cq->rqf_head);
+	spin_lock_init(&cq->compl_lock);
 
 
 	bnxt_qplib_arm_cq_enable(cq);
 	bnxt_qplib_arm_cq_enable(cq);
 	return 0;
 	return 0;
@@ -1513,9 +1750,13 @@ static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp,
 	while (*budget) {
 	while (*budget) {
 		sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
 		sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
 		if (sw_cons == sw_prod) {
 		if (sw_cons == sw_prod) {
-			sq->flush_in_progress = false;
 			break;
 			break;
 		}
 		}
+		/* Skip the FENCE WQE completions */
+		if (sq->swq[sw_cons].wr_id == BNXT_QPLIB_FENCE_WRID) {
+			bnxt_qplib_cancel_phantom_processing(qp);
+			goto skip_compl;
+		}
 		memset(cqe, 0, sizeof(*cqe));
 		memset(cqe, 0, sizeof(*cqe));
 		cqe->status = CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR;
 		cqe->status = CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR;
 		cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
 		cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
@@ -1525,6 +1766,7 @@ static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp,
 		cqe->type = sq->swq[sw_cons].type;
 		cqe->type = sq->swq[sw_cons].type;
 		cqe++;
 		cqe++;
 		(*budget)--;
 		(*budget)--;
+skip_compl:
 		sq->hwq.cons++;
 		sq->hwq.cons++;
 	}
 	}
 	*pcqe = cqe;
 	*pcqe = cqe;
@@ -1536,11 +1778,24 @@ static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp,
 }
 }
 
 
 static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp,
 static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp,
-		      int opcode, struct bnxt_qplib_cqe **pcqe, int *budget)
+		      struct bnxt_qplib_cqe **pcqe, int *budget)
 {
 {
 	struct bnxt_qplib_cqe *cqe;
 	struct bnxt_qplib_cqe *cqe;
 	u32 sw_prod, sw_cons;
 	u32 sw_prod, sw_cons;
 	int rc = 0;
 	int rc = 0;
+	int opcode = 0;
+
+	switch (qp->type) {
+	case CMDQ_CREATE_QP1_TYPE_GSI:
+		opcode = CQ_BASE_CQE_TYPE_RES_RAWETH_QP1;
+		break;
+	case CMDQ_CREATE_QP_TYPE_RC:
+		opcode = CQ_BASE_CQE_TYPE_RES_RC;
+		break;
+	case CMDQ_CREATE_QP_TYPE_UD:
+		opcode = CQ_BASE_CQE_TYPE_RES_UD;
+		break;
+	}
 
 
 	/* Flush the rest of the RQ */
 	/* Flush the rest of the RQ */
 	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
 	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
@@ -1567,6 +1822,21 @@ static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp,
 	return rc;
 	return rc;
 }
 }
 
 
+/*
+ * Move a QP into the software error state and queue it for flushing.
+ *
+ * @qp_handle: opaque pointer to a struct bnxt_qplib_qp; NULL is ignored.
+ *
+ * Sets the QP state to ERR, cancels phantom-WQE processing and links the QP
+ * onto its CQs' flush lists using the unlocked helper, so no CQ lock is taken
+ * here.
+ * NOTE(review): callers presumably must hold the hwq locks of both CQs -
+ * confirm at each call site.
+ */
+void bnxt_qplib_mark_qp_error(void *qp_handle)
+{
+	struct bnxt_qplib_qp *qp = qp_handle;
+
+	if (!qp)
+		return;
+
+	/* Must block new posting of SQ and RQ */
+	qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
+	bnxt_qplib_cancel_phantom_processing(qp);
+
+	/* Add qp to flush list of the CQ */
+	__bnxt_qplib_add_flush_qp(qp);
+}
+
 /* Note: SQE is valid from sw_sq_cons up to cqe_sq_cons (exclusive)
 /* Note: SQE is valid from sw_sq_cons up to cqe_sq_cons (exclusive)
  *       CQE is track from sw_cq_cons to max_element but valid only if VALID=1
  *       CQE is track from sw_cq_cons to max_element but valid only if VALID=1
  */
  */
@@ -1694,10 +1964,12 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
 			cqe_sq_cons, sq->hwq.max_elements);
 			cqe_sq_cons, sq->hwq.max_elements);
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
-	/* If we were in the middle of flushing the SQ, continue */
-	if (sq->flush_in_progress)
-		goto flush;
 
 
+	if (qp->sq.flushed) {
+		dev_dbg(&cq->hwq.pdev->dev,
+			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+		goto done;
+	}
 	/* Require to walk the sq's swq to fabricate CQEs for all previously
 	/* Require to walk the sq's swq to fabricate CQEs for all previously
 	 * signaled SWQEs due to CQE aggregation from the current sq cons
 	 * signaled SWQEs due to CQE aggregation from the current sq cons
 	 * to the cqe_sq_cons
 	 * to the cqe_sq_cons
@@ -1733,11 +2005,9 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq,
 				sw_sq_cons, cqe->wr_id, cqe->status);
 				sw_sq_cons, cqe->wr_id, cqe->status);
 			cqe++;
 			cqe++;
 			(*budget)--;
 			(*budget)--;
-			sq->flush_in_progress = true;
-			/* Must block new posting of SQ and RQ */
-			qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
-			sq->condition = false;
-			sq->single = false;
+			bnxt_qplib_lock_buddy_cq(qp, cq);
+			bnxt_qplib_mark_qp_error(qp);
+			bnxt_qplib_unlock_buddy_cq(qp, cq);
 		} else {
 		} else {
 			if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) {
 			if (swq->flags & SQ_SEND_FLAGS_SIGNAL_COMP) {
 				/* Before we complete, do WA 9060 */
 				/* Before we complete, do WA 9060 */
@@ -1768,15 +2038,6 @@ out:
 	 * the WC for this CQE
 	 * the WC for this CQE
 	 */
 	 */
 	sq->single = false;
 	sq->single = false;
-	if (!sq->flush_in_progress)
-		goto done;
-flush:
-	/* Require to walk the sq's swq to fabricate CQEs for all
-	 * previously posted SWQEs due to the error CQE received
-	 */
-	rc = __flush_sq(sq, qp, pcqe, budget);
-	if (!rc)
-		sq->flush_in_progress = false;
 done:
 done:
 	return rc;
 	return rc;
 }
 }
@@ -1798,6 +2059,12 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 		dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL");
 		dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
+	if (qp->rq.flushed) {
+		dev_dbg(&cq->hwq.pdev->dev,
+			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+		goto done;
+	}
+
 	cqe = *pcqe;
 	cqe = *pcqe;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
 	cqe->length = le32_to_cpu(hwcqe->length);
 	cqe->length = le32_to_cpu(hwcqe->length);
@@ -1817,8 +2084,6 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 			wr_id_idx, rq->hwq.max_elements);
 			wr_id_idx, rq->hwq.max_elements);
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
-	if (rq->flush_in_progress)
-		goto flush_rq;
 
 
 	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
 	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
 	cqe++;
 	cqe++;
@@ -1827,12 +2092,13 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq,
 	*pcqe = cqe;
 	*pcqe = cqe;
 
 
 	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
 	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
-		rq->flush_in_progress = true;
-flush_rq:
-		rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_RC, pcqe, budget);
-		if (!rc)
-			rq->flush_in_progress = false;
+		 /* Add qp to flush list of the CQ */
+		bnxt_qplib_lock_buddy_cq(qp, cq);
+		__bnxt_qplib_add_flush_qp(qp);
+		bnxt_qplib_unlock_buddy_cq(qp, cq);
 	}
 	}
+
+done:
 	return rc;
 	return rc;
 }
 }
 
 
@@ -1853,6 +2119,11 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 		dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL");
 		dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
+	if (qp->rq.flushed) {
+		dev_dbg(&cq->hwq.pdev->dev,
+			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+		goto done;
+	}
 	cqe = *pcqe;
 	cqe = *pcqe;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
 	cqe->length = le32_to_cpu(hwcqe->length);
 	cqe->length = le32_to_cpu(hwcqe->length);
@@ -1876,8 +2147,6 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 			wr_id_idx, rq->hwq.max_elements);
 			wr_id_idx, rq->hwq.max_elements);
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
-	if (rq->flush_in_progress)
-		goto flush_rq;
 
 
 	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
 	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
 	cqe++;
 	cqe++;
@@ -1886,12 +2155,12 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 	*pcqe = cqe;
 	*pcqe = cqe;
 
 
 	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
 	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
-		rq->flush_in_progress = true;
-flush_rq:
-		rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_UD, pcqe, budget);
-		if (!rc)
-			rq->flush_in_progress = false;
+		/* Add qp to flush list of the CQ */
+		bnxt_qplib_lock_buddy_cq(qp, cq);
+		__bnxt_qplib_add_flush_qp(qp);
+		bnxt_qplib_unlock_buddy_cq(qp, cq);
 	}
 	}
+done:
 	return rc;
 	return rc;
 }
 }
 
 
@@ -1932,6 +2201,11 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq,
 			"QPLIB: process_cq Raw/QP1 qp is NULL");
 			"QPLIB: process_cq Raw/QP1 qp is NULL");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
+	if (qp->rq.flushed) {
+		dev_dbg(&cq->hwq.pdev->dev,
+			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+		goto done;
+	}
 	cqe = *pcqe;
 	cqe = *pcqe;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
 	cqe->flags = le16_to_cpu(hwcqe->flags);
 	cqe->flags = le16_to_cpu(hwcqe->flags);
@@ -1960,8 +2234,6 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq,
 			wr_id_idx, rq->hwq.max_elements);
 			wr_id_idx, rq->hwq.max_elements);
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
-	if (rq->flush_in_progress)
-		goto flush_rq;
 
 
 	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
 	cqe->wr_id = rq->swq[wr_id_idx].wr_id;
 	cqe++;
 	cqe++;
@@ -1970,13 +2242,13 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq,
 	*pcqe = cqe;
 	*pcqe = cqe;
 
 
 	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
 	if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
-		rq->flush_in_progress = true;
-flush_rq:
-		rc = __flush_rq(rq, qp, CQ_BASE_CQE_TYPE_RES_RAWETH_QP1, pcqe,
-				budget);
-		if (!rc)
-			rq->flush_in_progress = false;
+		/* Add qp to flush list of the CQ */
+		bnxt_qplib_lock_buddy_cq(qp, cq);
+		__bnxt_qplib_add_flush_qp(qp);
+		bnxt_qplib_unlock_buddy_cq(qp, cq);
 	}
 	}
+
+done:
 	return rc;
 	return rc;
 }
 }
 
 
@@ -1990,7 +2262,6 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq,
 	struct bnxt_qplib_cqe *cqe;
 	struct bnxt_qplib_cqe *cqe;
 	u32 sw_cons = 0, cqe_cons;
 	u32 sw_cons = 0, cqe_cons;
 	int rc = 0;
 	int rc = 0;
-	u8 opcode = 0;
 
 
 	/* Check the Status */
 	/* Check the Status */
 	if (hwcqe->status != CQ_TERMINAL_STATUS_OK)
 	if (hwcqe->status != CQ_TERMINAL_STATUS_OK)
@@ -2005,6 +2276,7 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq,
 			"QPLIB: FP: CQ Process terminal qp is NULL");
 			"QPLIB: FP: CQ Process terminal qp is NULL");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
+
 	/* Must block new posting of SQ and RQ */
 	/* Must block new posting of SQ and RQ */
 	qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
 	qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR;
 
 
@@ -2023,9 +2295,12 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq,
 			cqe_cons, sq->hwq.max_elements);
 			cqe_cons, sq->hwq.max_elements);
 		goto do_rq;
 		goto do_rq;
 	}
 	}
-	/* If we were in the middle of flushing, continue */
-	if (sq->flush_in_progress)
-		goto flush_sq;
+
+	if (qp->sq.flushed) {
+		dev_dbg(&cq->hwq.pdev->dev,
+			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+		goto sq_done;
+	}
 
 
 	/* Terminal CQE can also include aggregated successful CQEs prior.
 	/* Terminal CQE can also include aggregated successful CQEs prior.
 	 * So we must complete all CQEs from the current sq's cons to the
 	 * So we must complete all CQEs from the current sq's cons to the
@@ -2055,11 +2330,6 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq,
 		rc = -EAGAIN;
 		rc = -EAGAIN;
 		goto sq_done;
 		goto sq_done;
 	}
 	}
-	sq->flush_in_progress = true;
-flush_sq:
-	rc = __flush_sq(sq, qp, pcqe, budget);
-	if (!rc)
-		sq->flush_in_progress = false;
 sq_done:
 sq_done:
 	if (rc)
 	if (rc)
 		return rc;
 		return rc;
@@ -2075,26 +2345,23 @@ do_rq:
 			cqe_cons, rq->hwq.max_elements);
 			cqe_cons, rq->hwq.max_elements);
 		goto done;
 		goto done;
 	}
 	}
+
+	if (qp->rq.flushed) {
+		dev_dbg(&cq->hwq.pdev->dev,
+			"%s: QPLIB: QP in Flush QP = %p\n", __func__, qp);
+		rc = 0;
+		goto done;
+	}
+
 	/* Terminal CQE requires all posted RQEs to complete with FLUSHED_ERR
 	/* Terminal CQE requires all posted RQEs to complete with FLUSHED_ERR
 	 * from the current rq->cons to the rq->prod regardless what the
 	 * from the current rq->cons to the rq->prod regardless what the
 	 * rq->cons the terminal CQE indicates
 	 * rq->cons the terminal CQE indicates
 	 */
 	 */
-	rq->flush_in_progress = true;
-	switch (qp->type) {
-	case CMDQ_CREATE_QP1_TYPE_GSI:
-		opcode = CQ_BASE_CQE_TYPE_RES_RAWETH_QP1;
-		break;
-	case CMDQ_CREATE_QP_TYPE_RC:
-		opcode = CQ_BASE_CQE_TYPE_RES_RC;
-		break;
-	case CMDQ_CREATE_QP_TYPE_UD:
-		opcode = CQ_BASE_CQE_TYPE_RES_UD;
-		break;
-	}
 
 
-	rc = __flush_rq(rq, qp, opcode, pcqe, budget);
-	if (!rc)
-		rq->flush_in_progress = false;
+	/* Add qp to flush list of the CQ */
+	bnxt_qplib_lock_buddy_cq(qp, cq);
+	__bnxt_qplib_add_flush_qp(qp);
+	bnxt_qplib_unlock_buddy_cq(qp, cq);
 done:
 done:
 	return rc;
 	return rc;
 }
 }
@@ -2115,6 +2382,33 @@ static int bnxt_qplib_cq_process_cutoff(struct bnxt_qplib_cq *cq,
 	return 0;
 	return 0;
 }
 }
 
 
+int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq,
+				  struct bnxt_qplib_cqe *cqe,
+				  int num_cqes)
+{
+	struct bnxt_qplib_qp *qp = NULL;
+	u32 budget = num_cqes;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->hwq.lock, flags);
+	list_for_each_entry(qp, &cq->sqf_head, sq_flush) {
+		dev_dbg(&cq->hwq.pdev->dev,
+			"QPLIB: FP: Flushing SQ QP= %p",
+			qp);
+		__flush_sq(&qp->sq, qp, &cqe, &budget);
+	}
+
+	list_for_each_entry(qp, &cq->rqf_head, rq_flush) {
+		dev_dbg(&cq->hwq.pdev->dev,
+			"QPLIB: FP: Flushing RQ QP= %p",
+			qp);
+		__flush_rq(&qp->rq, qp, &cqe, &budget);
+	}
+	spin_unlock_irqrestore(&cq->hwq.lock, flags);
+
+	return num_cqes - budget;
+}
+
 int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 		       int num_cqes, struct bnxt_qplib_qp **lib_qp)
 		       int num_cqes, struct bnxt_qplib_qp **lib_qp)
 {
 {
@@ -2205,6 +2499,7 @@ void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
 	spin_lock_irqsave(&cq->hwq.lock, flags);
 	spin_lock_irqsave(&cq->hwq.lock, flags);
 	if (arm_type)
 	if (arm_type)
 		bnxt_qplib_arm_cq(cq, arm_type);
 		bnxt_qplib_arm_cq(cq, arm_type);
-
+	/* Using cq->arm_state variable to track whether to issue cq handler */
+	atomic_set(&cq->arm_state, 1);
 	spin_unlock_irqrestore(&cq->hwq.lock, flags);
 	spin_unlock_irqrestore(&cq->hwq.lock, flags);
 }
 }

+ 24 - 1
drivers/infiniband/hw/bnxt_re/qplib_fp.h

@@ -220,19 +220,20 @@ struct bnxt_qplib_q {
 	u16				q_full_delta;
 	u16				q_full_delta;
 	u16				max_sge;
 	u16				max_sge;
 	u32				psn;
 	u32				psn;
-	bool				flush_in_progress;
 	bool				condition;
 	bool				condition;
 	bool				single;
 	bool				single;
 	bool				send_phantom;
 	bool				send_phantom;
 	u32				phantom_wqe_cnt;
 	u32				phantom_wqe_cnt;
 	u32				phantom_cqe_cnt;
 	u32				phantom_cqe_cnt;
 	u32				next_cq_cons;
 	u32				next_cq_cons;
+	bool				flushed;
 };
 };
 
 
 struct bnxt_qplib_qp {
 struct bnxt_qplib_qp {
 	struct bnxt_qplib_pd		*pd;
 	struct bnxt_qplib_pd		*pd;
 	struct bnxt_qplib_dpi		*dpi;
 	struct bnxt_qplib_dpi		*dpi;
 	u64				qp_handle;
 	u64				qp_handle;
+#define        BNXT_QPLIB_QP_ID_INVALID        0xFFFFFFFF
 	u32				id;
 	u32				id;
 	u8				type;
 	u8				type;
 	u8				sig_type;
 	u8				sig_type;
@@ -296,6 +297,8 @@ struct bnxt_qplib_qp {
 	dma_addr_t			sq_hdr_buf_map;
 	dma_addr_t			sq_hdr_buf_map;
 	void				*rq_hdr_buf;
 	void				*rq_hdr_buf;
 	dma_addr_t			rq_hdr_buf_map;
 	dma_addr_t			rq_hdr_buf_map;
+	struct list_head		sq_flush;
+	struct list_head		rq_flush;
 };
 };
 
 
 #define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE	sizeof(struct cq_base)
 #define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE	sizeof(struct cq_base)
@@ -351,6 +354,7 @@ struct bnxt_qplib_cq {
 	u16				period;
 	u16				period;
 	struct bnxt_qplib_hwq		hwq;
 	struct bnxt_qplib_hwq		hwq;
 	u32				cnq_hw_ring_id;
 	u32				cnq_hw_ring_id;
+	struct bnxt_qplib_nq		*nq;
 	bool				resize_in_progress;
 	bool				resize_in_progress;
 	struct scatterlist		*sghead;
 	struct scatterlist		*sghead;
 	u32				nmap;
 	u32				nmap;
@@ -360,6 +364,9 @@ struct bnxt_qplib_cq {
 	unsigned long			flags;
 	unsigned long			flags;
 #define CQ_FLAGS_RESIZE_IN_PROG		1
 #define CQ_FLAGS_RESIZE_IN_PROG		1
 	wait_queue_head_t		waitq;
 	wait_queue_head_t		waitq;
+	struct list_head		sqf_head, rqf_head;
+	atomic_t			arm_state;
+	spinlock_t			compl_lock; /* synch CQ handlers */
 };
 };
 
 
 #define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE	sizeof(struct xrrq_irrq)
 #define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE	sizeof(struct xrrq_irrq)
@@ -417,6 +424,13 @@ struct bnxt_qplib_nq {
 						(struct bnxt_qplib_nq *nq,
 						(struct bnxt_qplib_nq *nq,
 						 void *srq,
 						 void *srq,
 						 u8 event);
 						 u8 event);
+	struct workqueue_struct         *cqn_wq;
+};
+
+struct bnxt_qplib_nq_work {
+	struct work_struct      work;
+	struct bnxt_qplib_nq    *nq;
+	struct bnxt_qplib_cq    *cq;
 };
 };
 
 
 void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq);
 void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq);
@@ -453,4 +467,13 @@ bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq);
 void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type);
 void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type);
 void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq);
 void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq);
 int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq);
 int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq);
+void bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp);
+void bnxt_qplib_del_flush_qp(struct bnxt_qplib_qp *qp);
+void bnxt_qplib_acquire_cq_locks(struct bnxt_qplib_qp *qp,
+				 unsigned long *flags);
+void bnxt_qplib_release_cq_locks(struct bnxt_qplib_qp *qp,
+				 unsigned long *flags);
+int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq,
+				  struct bnxt_qplib_cqe *cqe,
+				  int num_cqes);
 #endif /* __BNXT_QPLIB_FP_H__ */
 #endif /* __BNXT_QPLIB_FP_H__ */

+ 25 - 1
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c

@@ -44,6 +44,9 @@
 #include "roce_hsi.h"
 #include "roce_hsi.h"
 #include "qplib_res.h"
 #include "qplib_res.h"
 #include "qplib_rcfw.h"
 #include "qplib_rcfw.h"
+#include "qplib_sp.h"
+#include "qplib_fp.h"
+
 static void bnxt_qplib_service_creq(unsigned long data);
 static void bnxt_qplib_service_creq(unsigned long data);
 
 
 /* Hardware communication channel */
 /* Hardware communication channel */
@@ -279,16 +282,29 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 				       struct creq_qp_event *qp_event)
 				       struct creq_qp_event *qp_event)
 {
 {
 	struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
 	struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
+	struct creq_qp_error_notification *err_event;
 	struct bnxt_qplib_crsq *crsqe;
 	struct bnxt_qplib_crsq *crsqe;
 	unsigned long flags;
 	unsigned long flags;
+	struct bnxt_qplib_qp *qp;
 	u16 cbit, blocked = 0;
 	u16 cbit, blocked = 0;
 	u16 cookie;
 	u16 cookie;
 	__le16  mcookie;
 	__le16  mcookie;
+	u32 qp_id;
 
 
 	switch (qp_event->event) {
 	switch (qp_event->event) {
 	case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION:
 	case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION:
+		err_event = (struct creq_qp_error_notification *)qp_event;
+		qp_id = le32_to_cpu(err_event->xid);
+		qp = rcfw->qp_tbl[qp_id].qp_handle;
 		dev_dbg(&rcfw->pdev->dev,
 		dev_dbg(&rcfw->pdev->dev,
 			"QPLIB: Received QP error notification");
 			"QPLIB: Received QP error notification");
+		dev_dbg(&rcfw->pdev->dev,
+			"QPLIB: qpid 0x%x, req_err=0x%x, resp_err=0x%x\n",
+			qp_id, err_event->req_err_state_reason,
+			err_event->res_err_state_reason);
+		bnxt_qplib_acquire_cq_locks(qp, &flags);
+		bnxt_qplib_mark_qp_error(qp);
+		bnxt_qplib_release_cq_locks(qp, &flags);
 		break;
 		break;
 	default:
 	default:
 		/* Command Response */
 		/* Command Response */
@@ -507,6 +523,7 @@ skip_ctx_setup:
 
 
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 {
 {
+	kfree(rcfw->qp_tbl);
 	kfree(rcfw->crsqe_tbl);
 	kfree(rcfw->crsqe_tbl);
 	bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->cmdq);
 	bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->cmdq);
 	bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->creq);
 	bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->creq);
@@ -514,7 +531,8 @@ void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 }
 }
 
 
 int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
 int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
-				  struct bnxt_qplib_rcfw *rcfw)
+				  struct bnxt_qplib_rcfw *rcfw,
+				  int qp_tbl_sz)
 {
 {
 	rcfw->pdev = pdev;
 	rcfw->pdev = pdev;
 	rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT;
 	rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT;
@@ -541,6 +559,12 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
 	if (!rcfw->crsqe_tbl)
 	if (!rcfw->crsqe_tbl)
 		goto fail;
 		goto fail;
 
 
+	rcfw->qp_tbl_size = qp_tbl_sz;
+	rcfw->qp_tbl = kcalloc(qp_tbl_sz, sizeof(struct bnxt_qplib_qp_node),
+			       GFP_KERNEL);
+	if (!rcfw->qp_tbl)
+		goto fail;
+
 	return 0;
 	return 0;
 
 
 fail:
 fail:

+ 9 - 1
drivers/infiniband/hw/bnxt_re/qplib_rcfw.h

@@ -148,6 +148,11 @@ struct bnxt_qplib_rcfw_sbuf {
 	u32 size;
 	u32 size;
 };
 };
 
 
+struct bnxt_qplib_qp_node {
+	u32 qp_id;              /* QP id */
+	void *qp_handle;        /* ptr to qplib_qp */
+};
+
 /* RCFW Communication Channels */
 /* RCFW Communication Channels */
 struct bnxt_qplib_rcfw {
 struct bnxt_qplib_rcfw {
 	struct pci_dev		*pdev;
 	struct pci_dev		*pdev;
@@ -181,11 +186,13 @@ struct bnxt_qplib_rcfw {
 	/* Actual Cmd and Resp Queues */
 	/* Actual Cmd and Resp Queues */
 	struct bnxt_qplib_hwq	cmdq;
 	struct bnxt_qplib_hwq	cmdq;
 	struct bnxt_qplib_crsq	*crsqe_tbl;
 	struct bnxt_qplib_crsq	*crsqe_tbl;
+	int qp_tbl_size;
+	struct bnxt_qplib_qp_node *qp_tbl;
 };
 };
 
 
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
 int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
 int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
-				  struct bnxt_qplib_rcfw *rcfw);
+				  struct bnxt_qplib_rcfw *rcfw, int qp_tbl_sz);
 void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
 void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
 int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
 int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
 				   struct bnxt_qplib_rcfw *rcfw,
 				   struct bnxt_qplib_rcfw *rcfw,
@@ -207,4 +214,5 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw);
 int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw);
 int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
 int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
 			 struct bnxt_qplib_ctx *ctx, int is_virtfn);
 			 struct bnxt_qplib_ctx *ctx, int is_virtfn);
+void bnxt_qplib_mark_qp_error(void *qp_handle);
 #endif /* __BNXT_QPLIB_RCFW_H__ */
 #endif /* __BNXT_QPLIB_RCFW_H__ */

+ 10 - 0
drivers/infiniband/hw/bnxt_re/qplib_res.c

@@ -468,9 +468,11 @@ static void bnxt_qplib_free_sgid_tbl(struct bnxt_qplib_res *res,
 	kfree(sgid_tbl->tbl);
 	kfree(sgid_tbl->tbl);
 	kfree(sgid_tbl->hw_id);
 	kfree(sgid_tbl->hw_id);
 	kfree(sgid_tbl->ctx);
 	kfree(sgid_tbl->ctx);
+	kfree(sgid_tbl->vlan);
 	sgid_tbl->tbl = NULL;
 	sgid_tbl->tbl = NULL;
 	sgid_tbl->hw_id = NULL;
 	sgid_tbl->hw_id = NULL;
 	sgid_tbl->ctx = NULL;
 	sgid_tbl->ctx = NULL;
+	sgid_tbl->vlan = NULL;
 	sgid_tbl->max = 0;
 	sgid_tbl->max = 0;
 	sgid_tbl->active = 0;
 	sgid_tbl->active = 0;
 }
 }
@@ -491,8 +493,15 @@ static int bnxt_qplib_alloc_sgid_tbl(struct bnxt_qplib_res *res,
 	if (!sgid_tbl->ctx)
 	if (!sgid_tbl->ctx)
 		goto out_free2;
 		goto out_free2;
 
 
+	sgid_tbl->vlan = kcalloc(max, sizeof(u8), GFP_KERNEL);
+	if (!sgid_tbl->vlan)
+		goto out_free3;
+
 	sgid_tbl->max = max;
 	sgid_tbl->max = max;
 	return 0;
 	return 0;
+out_free3:
+	kfree(sgid_tbl->ctx);
+	sgid_tbl->ctx = NULL;
 out_free2:
 out_free2:
 	kfree(sgid_tbl->hw_id);
 	kfree(sgid_tbl->hw_id);
 	sgid_tbl->hw_id = NULL;
 	sgid_tbl->hw_id = NULL;
@@ -514,6 +523,7 @@ static void bnxt_qplib_cleanup_sgid_tbl(struct bnxt_qplib_res *res,
 	}
 	}
 	memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max);
 	memset(sgid_tbl->tbl, 0, sizeof(struct bnxt_qplib_gid) * sgid_tbl->max);
 	memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max);
 	memset(sgid_tbl->hw_id, -1, sizeof(u16) * sgid_tbl->max);
+	memset(sgid_tbl->vlan, 0, sizeof(u8) * sgid_tbl->max);
 	sgid_tbl->active = 0;
 	sgid_tbl->active = 0;
 }
 }
 
 

+ 2 - 0
drivers/infiniband/hw/bnxt_re/qplib_res.h

@@ -116,6 +116,7 @@ struct bnxt_qplib_sgid_tbl {
 	u16				max;
 	u16				max;
 	u16				active;
 	u16				active;
 	void				*ctx;
 	void				*ctx;
+	u8				*vlan;
 };
 };
 
 
 struct bnxt_qplib_pkey_tbl {
 struct bnxt_qplib_pkey_tbl {
@@ -188,6 +189,7 @@ struct bnxt_qplib_res {
 	struct bnxt_qplib_sgid_tbl	sgid_tbl;
 	struct bnxt_qplib_sgid_tbl	sgid_tbl;
 	struct bnxt_qplib_pkey_tbl	pkey_tbl;
 	struct bnxt_qplib_pkey_tbl	pkey_tbl;
 	struct bnxt_qplib_dpi_tbl	dpi_tbl;
 	struct bnxt_qplib_dpi_tbl	dpi_tbl;
+	bool				prio;
 };
 };
 
 
 #define to_bnxt_qplib(ptr, type, member)	\
 #define to_bnxt_qplib(ptr, type, member)	\

+ 61 - 16
drivers/infiniband/hw/bnxt_re/qplib_sp.c

@@ -213,6 +213,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 	}
 	}
 	memcpy(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero,
 	memcpy(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero,
 	       sizeof(bnxt_qplib_gid_zero));
 	       sizeof(bnxt_qplib_gid_zero));
+	sgid_tbl->vlan[index] = 0;
 	sgid_tbl->active--;
 	sgid_tbl->active--;
 	dev_dbg(&res->pdev->dev,
 	dev_dbg(&res->pdev->dev,
 		"QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x",
 		"QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x",
@@ -265,28 +266,32 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 		struct cmdq_add_gid req;
 		struct cmdq_add_gid req;
 		struct creq_add_gid_resp resp;
 		struct creq_add_gid_resp resp;
 		u16 cmd_flags = 0;
 		u16 cmd_flags = 0;
-		u32 temp32[4];
-		u16 temp16[3];
 		int rc;
 		int rc;
 
 
 		RCFW_CMD_PREP(req, ADD_GID, cmd_flags);
 		RCFW_CMD_PREP(req, ADD_GID, cmd_flags);
 
 
-		memcpy(temp32, gid->data, sizeof(struct bnxt_qplib_gid));
-		req.gid[0] = cpu_to_be32(temp32[3]);
-		req.gid[1] = cpu_to_be32(temp32[2]);
-		req.gid[2] = cpu_to_be32(temp32[1]);
-		req.gid[3] = cpu_to_be32(temp32[0]);
-		if (vlan_id != 0xFFFF)
-			req.vlan = cpu_to_le16((vlan_id &
-					CMDQ_ADD_GID_VLAN_VLAN_ID_MASK) |
-					CMDQ_ADD_GID_VLAN_TPID_TPID_8100 |
-					CMDQ_ADD_GID_VLAN_VLAN_EN);
+		req.gid[0] = cpu_to_be32(((u32 *)gid->data)[3]);
+		req.gid[1] = cpu_to_be32(((u32 *)gid->data)[2]);
+		req.gid[2] = cpu_to_be32(((u32 *)gid->data)[1]);
+		req.gid[3] = cpu_to_be32(((u32 *)gid->data)[0]);
+		/*
+		 * driver should ensure that all RoCE traffic is always VLAN
+		 * tagged if RoCE traffic is running on non-zero VLAN ID or
+		 * RoCE traffic is running on non-zero Priority.
+		 */
+		if ((vlan_id != 0xFFFF) || res->prio) {
+			if (vlan_id != 0xFFFF)
+				req.vlan = cpu_to_le16
+				(vlan_id & CMDQ_ADD_GID_VLAN_VLAN_ID_MASK);
+			req.vlan |= cpu_to_le16
+					(CMDQ_ADD_GID_VLAN_TPID_TPID_8100 |
+					 CMDQ_ADD_GID_VLAN_VLAN_EN);
+		}
 
 
 		/* MAC in network format */
 		/* MAC in network format */
-		memcpy(temp16, smac, 6);
-		req.src_mac[0] = cpu_to_be16(temp16[0]);
-		req.src_mac[1] = cpu_to_be16(temp16[1]);
-		req.src_mac[2] = cpu_to_be16(temp16[2]);
+		req.src_mac[0] = cpu_to_be16(((u16 *)smac)[0]);
+		req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]);
+		req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]);
 
 
 		rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
 		rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
 						  (void *)&resp, NULL, 0);
 						  (void *)&resp, NULL, 0);
@@ -297,6 +302,9 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 	/* Add GID to the sgid_tbl */
 	/* Add GID to the sgid_tbl */
 	memcpy(&sgid_tbl->tbl[free_idx], gid, sizeof(*gid));
 	memcpy(&sgid_tbl->tbl[free_idx], gid, sizeof(*gid));
 	sgid_tbl->active++;
 	sgid_tbl->active++;
+	if (vlan_id != 0xFFFF)
+		sgid_tbl->vlan[free_idx] = 1;
+
 	dev_dbg(&res->pdev->dev,
 	dev_dbg(&res->pdev->dev,
 		"QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x",
 		"QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x",
 		 free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active);
 		 free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active);
@@ -306,6 +314,43 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 	return 0;
 	return 0;
 }
 }
 
 
+int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
+			   struct bnxt_qplib_gid *gid, u16 gid_idx,
+			   u8 *smac)
+{
+	struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl,
+						   struct bnxt_qplib_res,
+						   sgid_tbl);
+	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+	struct creq_modify_gid_resp resp;
+	struct cmdq_modify_gid req;
+	int rc;
+	u16 cmd_flags = 0;
+
+	RCFW_CMD_PREP(req, MODIFY_GID, cmd_flags);
+
+	req.gid[0] = cpu_to_be32(((u32 *)gid->data)[3]);
+	req.gid[1] = cpu_to_be32(((u32 *)gid->data)[2]);
+	req.gid[2] = cpu_to_be32(((u32 *)gid->data)[1]);
+	req.gid[3] = cpu_to_be32(((u32 *)gid->data)[0]);
+	if (res->prio) {
+		req.vlan |= cpu_to_le16
+			(CMDQ_ADD_GID_VLAN_TPID_TPID_8100 |
+			 CMDQ_ADD_GID_VLAN_VLAN_EN);
+	}
+
+	/* MAC in network format */
+	req.src_mac[0] = cpu_to_be16(((u16 *)smac)[0]);
+	req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]);
+	req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]);
+
+	req.gid_index = cpu_to_le16(gid_idx);
+
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+					  (void *)&resp, NULL, 0);
+	return rc;
+}
+
 /* pkeys */
 /* pkeys */
 int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res,
 int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res,
 			struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index,
 			struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index,

+ 2 - 0
drivers/infiniband/hw/bnxt_re/qplib_sp.h

@@ -135,6 +135,8 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
 			struct bnxt_qplib_gid *gid, u8 *mac, u16 vlan_id,
 			struct bnxt_qplib_gid *gid, u8 *mac, u16 vlan_id,
 			bool update, u32 *index);
 			bool update, u32 *index);
+int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl,
+			   struct bnxt_qplib_gid *gid, u16 gid_idx, u8 *smac);
 int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res,
 int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res,
 			struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index,
 			struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 index,
 			u16 *pkey);
 			u16 *pkey);

+ 2 - 2
drivers/infiniband/hw/bnxt_re/roce_hsi.h

@@ -1473,8 +1473,8 @@ struct cmdq_modify_gid {
 	u8 resp_size;
 	u8 resp_size;
 	u8 reserved8;
 	u8 reserved8;
 	__le64 resp_addr;
 	__le64 resp_addr;
-	__le32 gid[4];
-	__le16 src_mac[3];
+	__be32 gid[4];
+	__be16 src_mac[3];
 	__le16 vlan;
 	__le16 vlan;
 	#define CMDQ_MODIFY_GID_VLAN_VLAN_ID_MASK		    0xfffUL
 	#define CMDQ_MODIFY_GID_VLAN_VLAN_ID_MASK		    0xfffUL
 	#define CMDQ_MODIFY_GID_VLAN_VLAN_ID_SFT		    0
 	#define CMDQ_MODIFY_GID_VLAN_VLAN_ID_SFT		    0

+ 0 - 1
drivers/infiniband/hw/cxgb3/iwch.c

@@ -45,7 +45,6 @@
 MODULE_AUTHOR("Boyd Faulkner, Steve Wise");
 MODULE_AUTHOR("Boyd Faulkner, Steve Wise");
 MODULE_DESCRIPTION("Chelsio T3 RDMA Driver");
 MODULE_DESCRIPTION("Chelsio T3 RDMA Driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
 
 
 static void open_rnic_dev(struct t3cdev *);
 static void open_rnic_dev(struct t3cdev *);
 static void close_rnic_dev(struct t3cdev *);
 static void close_rnic_dev(struct t3cdev *);

+ 2 - 3
drivers/infiniband/hw/cxgb3/iwch_provider.c

@@ -1336,8 +1336,7 @@ static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num,
 	return 0;
 	return 0;
 }
 }
 
 
-static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str,
-			       size_t str_len)
+static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str)
 {
 {
 	struct iwch_dev *iwch_dev = to_iwch_dev(ibdev);
 	struct iwch_dev *iwch_dev = to_iwch_dev(ibdev);
 	struct ethtool_drvinfo info;
 	struct ethtool_drvinfo info;
@@ -1345,7 +1344,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str,
 
 
 	pr_debug("%s dev 0x%p\n", __func__, iwch_dev);
 	pr_debug("%s dev 0x%p\n", __func__, iwch_dev);
 	lldev->ethtool_ops->get_drvinfo(lldev, &info);
 	lldev->ethtool_ops->get_drvinfo(lldev, &info);
-	snprintf(str, str_len, "%s", info.fw_version);
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version);
 }
 }
 
 
 int iwch_register_device(struct iwch_dev *dev)
 int iwch_register_device(struct iwch_dev *dev)

+ 0 - 1
drivers/infiniband/hw/cxgb4/device.c

@@ -44,7 +44,6 @@
 MODULE_AUTHOR("Steve Wise");
 MODULE_AUTHOR("Steve Wise");
 MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver");
 MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
 
 
 static int allow_db_fc_on_t5;
 static int allow_db_fc_on_t5;
 module_param(allow_db_fc_on_t5, int, 0644);
 module_param(allow_db_fc_on_t5, int, 0644);

+ 2 - 3
drivers/infiniband/hw/cxgb4/provider.c

@@ -517,14 +517,13 @@ static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
 	return 0;
 	return 0;
 }
 }
 
 
-static void get_dev_fw_str(struct ib_device *dev, char *str,
-			   size_t str_len)
+static void get_dev_fw_str(struct ib_device *dev, char *str)
 {
 {
 	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
 	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
 						 ibdev);
 						 ibdev);
 	pr_debug("%s dev 0x%p\n", __func__, dev);
 	pr_debug("%s dev 0x%p\n", __func__, dev);
 
 
-	snprintf(str, str_len, "%u.%u.%u.%u",
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u.%u",
 		 FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
 		 FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
 		 FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
 		 FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
 		 FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
 		 FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),

+ 1 - 1
drivers/infiniband/hw/hfi1/Makefile

@@ -8,7 +8,7 @@
 obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 
 
 hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
 hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
-	eprom.o file_ops.o firmware.o \
+	eprom.o exp_rcv.o file_ops.o firmware.o \
 	init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
 	init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
 	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
 	qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
 	uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
 	uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \

+ 9 - 9
drivers/infiniband/hw/hfi1/affinity.c

@@ -1,5 +1,5 @@
 /*
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2017 Intel Corporation.
  *
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  * redistributing this file, you may do so under either license.
@@ -335,10 +335,10 @@ static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
 	sde->cpu = cpu;
 	sde->cpu = cpu;
 	cpumask_clear(&msix->mask);
 	cpumask_clear(&msix->mask);
 	cpumask_set_cpu(cpu, &msix->mask);
 	cpumask_set_cpu(cpu, &msix->mask);
-	dd_dev_dbg(dd, "IRQ vector: %u, type %s engine %u -> cpu: %d\n",
-		   msix->msix.vector, irq_type_names[msix->type],
+	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
+		   msix->irq, irq_type_names[msix->type],
 		   sde->this_idx, cpu);
 		   sde->this_idx, cpu);
-	irq_set_affinity_hint(msix->msix.vector, &msix->mask);
+	irq_set_affinity_hint(msix->irq, &msix->mask);
 
 
 	/*
 	/*
 	 * Set the new cpu in the hfi1_affinity_node and clean
 	 * Set the new cpu in the hfi1_affinity_node and clean
@@ -387,7 +387,7 @@ static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
 {
 {
 	struct irq_affinity_notify *notify = &msix->notify;
 	struct irq_affinity_notify *notify = &msix->notify;
 
 
-	notify->irq = msix->msix.vector;
+	notify->irq = msix->irq;
 	notify->notify = hfi1_irq_notifier_notify;
 	notify->notify = hfi1_irq_notifier_notify;
 	notify->release = hfi1_irq_notifier_release;
 	notify->release = hfi1_irq_notifier_release;
 
 
@@ -472,10 +472,10 @@ static int get_irq_affinity(struct hfi1_devdata *dd,
 	}
 	}
 
 
 	cpumask_set_cpu(cpu, &msix->mask);
 	cpumask_set_cpu(cpu, &msix->mask);
-	dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
-		    msix->msix.vector, irq_type_names[msix->type],
+	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
+		    msix->irq, irq_type_names[msix->type],
 		    extra, cpu);
 		    extra, cpu);
-	irq_set_affinity_hint(msix->msix.vector, &msix->mask);
+	irq_set_affinity_hint(msix->irq, &msix->mask);
 
 
 	if (msix->type == IRQ_SDMA) {
 	if (msix->type == IRQ_SDMA) {
 		sde->cpu = cpu;
 		sde->cpu = cpu;
@@ -533,7 +533,7 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
 		}
 		}
 	}
 	}
 
 
-	irq_set_affinity_hint(msix->msix.vector, NULL);
+	irq_set_affinity_hint(msix->irq, NULL);
 	cpumask_clear(&msix->mask);
 	cpumask_clear(&msix->mask);
 	mutex_unlock(&node_affinity.lock);
 	mutex_unlock(&node_affinity.lock);
 }
 }

+ 8 - 6
drivers/infiniband/hw/hfi1/affinity.h

@@ -1,5 +1,5 @@
 /*
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2017 Intel Corporation.
  *
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  * redistributing this file, you may do so under either license.
@@ -75,24 +75,26 @@ struct hfi1_msix_entry;
 /* Initialize non-HT cpu cores mask */
 /* Initialize non-HT cpu cores mask */
 void init_real_cpu_mask(void);
 void init_real_cpu_mask(void);
 /* Initialize driver affinity data */
 /* Initialize driver affinity data */
-int hfi1_dev_affinity_init(struct hfi1_devdata *);
+int hfi1_dev_affinity_init(struct hfi1_devdata *dd);
 /*
 /*
  * Set IRQ affinity to a CPU. The function will determine the
  * Set IRQ affinity to a CPU. The function will determine the
  * CPU and set the affinity to it.
  * CPU and set the affinity to it.
  */
  */
-int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
+int hfi1_get_irq_affinity(struct hfi1_devdata *dd,
+			  struct hfi1_msix_entry *msix);
 /*
 /*
  * Remove the IRQ's CPU affinity. This function also updates
  * Remove the IRQ's CPU affinity. This function also updates
  * any internal CPU tracking data
  * any internal CPU tracking data
  */
  */
-void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
+void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
+			   struct hfi1_msix_entry *msix);
 /*
 /*
  * Determine a CPU affinity for a user process, if the process does not
  * Determine a CPU affinity for a user process, if the process does not
  * have an affinity set yet.
  * have an affinity set yet.
  */
  */
-int hfi1_get_proc_affinity(int);
+int hfi1_get_proc_affinity(int node);
 /* Release a CPU used by a user process. */
 /* Release a CPU used by a user process. */
-void hfi1_put_proc_affinity(int);
+void hfi1_put_proc_affinity(int cpu);
 
 
 struct hfi1_affinity_node {
 struct hfi1_affinity_node {
 	int node;
 	int node;

+ 3 - 3
drivers/infiniband/hw/hfi1/aspm.h

@@ -237,7 +237,7 @@ static inline void aspm_disable_all(struct hfi1_devdata *dd)
 {
 {
 	struct hfi1_ctxtdata *rcd;
 	struct hfi1_ctxtdata *rcd;
 	unsigned long flags;
 	unsigned long flags;
-	unsigned i;
+	u16 i;
 
 
 	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
 	for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
 		rcd = dd->rcd[i];
 		rcd = dd->rcd[i];
@@ -256,7 +256,7 @@ static inline void aspm_enable_all(struct hfi1_devdata *dd)
 {
 {
 	struct hfi1_ctxtdata *rcd;
 	struct hfi1_ctxtdata *rcd;
 	unsigned long flags;
 	unsigned long flags;
-	unsigned i;
+	u16 i;
 
 
 	aspm_enable(dd);
 	aspm_enable(dd);
 
 
@@ -284,7 +284,7 @@ static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
 
 
 static inline void aspm_init(struct hfi1_devdata *dd)
 static inline void aspm_init(struct hfi1_devdata *dd)
 {
 {
-	unsigned i;
+	u16 i;
 
 
 	spin_lock_init(&dd->aspm_lock);
 	spin_lock_init(&dd->aspm_lock);
 	dd->aspm_supported = aspm_hw_l1_supported(dd);
 	dd->aspm_supported = aspm_hw_l1_supported(dd);

+ 370 - 290
drivers/infiniband/hw/hfi1/chip.c

@@ -1012,14 +1012,15 @@ static struct flag_table dc8051_info_err_flags[] = {
  */
  */
 static struct flag_table dc8051_info_host_msg_flags[] = {
 static struct flag_table dc8051_info_host_msg_flags[] = {
 	FLAG_ENTRY0("Host request done", 0x0001),
 	FLAG_ENTRY0("Host request done", 0x0001),
-	FLAG_ENTRY0("BC SMA message", 0x0002),
-	FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
+	FLAG_ENTRY0("BC PWR_MGM message", 0x0002),
+	FLAG_ENTRY0("BC SMA message", 0x0004),
 	FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
 	FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
 	FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
 	FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
 	FLAG_ENTRY0("External device config request", 0x0020),
 	FLAG_ENTRY0("External device config request", 0x0020),
 	FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
 	FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
 	FLAG_ENTRY0("LinkUp achieved", 0x0080),
 	FLAG_ENTRY0("LinkUp achieved", 0x0080),
 	FLAG_ENTRY0("Link going down", 0x0100),
 	FLAG_ENTRY0("Link going down", 0x0100),
+	FLAG_ENTRY0("Link width downgraded", 0x0200),
 };
 };
 
 
 static u32 encoded_size(u32 size);
 static u32 encoded_size(u32 size);
@@ -1066,6 +1067,8 @@ static int thermal_init(struct hfi1_devdata *dd);
 
 
 static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 				  int msecs);
 				  int msecs);
+static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+				   int msecs);
 static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
 static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
 static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void handle_temp_err(struct hfi1_devdata *dd);
 static void handle_temp_err(struct hfi1_devdata *dd);
@@ -1294,25 +1297,71 @@ CNTR_ELEM(#name, \
 	  CNTR_SYNTH, \
 	  CNTR_SYNTH, \
 	  access_ibp_##cntr)
 	  access_ibp_##cntr)
 
 
+/**
+ * hfi1_addr_from_offset - return addr for readq/writeq
+ * @dd - the dd device
+ * @offset - the offset of the CSR within bar0
+ *
+ * This routine selects the appropriate base address
+ * based on the indicated offset.
+ */
+static inline void __iomem *hfi1_addr_from_offset(
+	const struct hfi1_devdata *dd,
+	u32 offset)
+{
+	if (offset >= dd->base2_start)
+		return dd->kregbase2 + (offset - dd->base2_start);
+	return dd->kregbase1 + offset;
+}
+
+/**
+ * read_csr - read CSR at the indicated offset
+ * @dd - the dd device
+ * @offset - the offset of the CSR within bar0
+ *
+ * Return: the value read or all FF's if there
+ * is no mapping
+ */
 u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
 u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
 {
 {
-	if (dd->flags & HFI1_PRESENT) {
-		return readq((void __iomem *)dd->kregbase + offset);
-	}
+	if (dd->flags & HFI1_PRESENT)
+		return readq(hfi1_addr_from_offset(dd, offset));
 	return -1;
 	return -1;
 }
 }
 
 
+/**
+ * write_csr - write CSR at the indicated offset
+ * @dd - the dd device
+ * @offset - the offset of the CSR within bar0
+ * @value - value to write
+ */
 void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
 void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
 {
 {
-	if (dd->flags & HFI1_PRESENT)
-		writeq(value, (void __iomem *)dd->kregbase + offset);
+	if (dd->flags & HFI1_PRESENT) {
+		void __iomem *base = hfi1_addr_from_offset(dd, offset);
+
+		/* avoid write to RcvArray */
+		if (WARN_ON(offset >= RCV_ARRAY && offset < dd->base2_start))
+			return;
+		writeq(value, base);
+	}
 }
 }
 
 
+/**
+ * get_csr_addr - return the iomem address for offset
+ * @dd - the dd device
+ * @offset - the offset of the CSR within bar0
+ *
+ * Return: The iomem address to use in subsequent
+ * writeq/readq operations.
+ */
 void __iomem *get_csr_addr(
 void __iomem *get_csr_addr(
-	struct hfi1_devdata *dd,
+	const struct hfi1_devdata *dd,
 	u32 offset)
 	u32 offset)
 {
 {
-	return (void __iomem *)dd->kregbase + offset;
+	if (dd->flags & HFI1_PRESENT)
+		return hfi1_addr_from_offset(dd, offset);
+	return NULL;
 }
 }
 
 
 static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
 static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
@@ -5496,7 +5545,7 @@ static void update_rcverr_timer(unsigned long opaque)
 		set_link_down_reason(
 		set_link_down_reason(
 		ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
 		ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
 		OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
 		OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
-		queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+		queue_work(ppd->link_wq, &ppd->link_bounce_work);
 	}
 	}
 	dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt;
 	dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt;
 
 
@@ -6051,7 +6100,7 @@ static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
 				 * will not happen. We have to do it here
 				 * will not happen. We have to do it here
 				 * before turning the DC off.
 				 * before turning the DC off.
 				 */
 				 */
-				queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+				queue_work(ppd->link_wq, &ppd->link_down_work);
 			}
 			}
 		} else {
 		} else {
 			dd_dev_info(dd, "%s: QSFP module inserted\n",
 			dd_dev_info(dd, "%s: QSFP module inserted\n",
@@ -6086,7 +6135,7 @@ static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
 
 
 	/* Schedule the QSFP work only if there is a cable attached. */
 	/* Schedule the QSFP work only if there is a cable attached. */
 	if (qsfp_mod_present(ppd))
 	if (qsfp_mod_present(ppd))
-		queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
+		queue_work(ppd->link_wq, &ppd->qsfp_info.qsfp_work);
 }
 }
 
 
 static int request_host_lcb_access(struct hfi1_devdata *dd)
 static int request_host_lcb_access(struct hfi1_devdata *dd)
@@ -6741,7 +6790,7 @@ static void rxe_freeze(struct hfi1_devdata *dd)
 
 
 	/* disable all receive contexts */
 	/* disable all receive contexts */
 	for (i = 0; i < dd->num_rcv_contexts; i++)
 	for (i = 0; i < dd->num_rcv_contexts; i++)
-		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
+		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, dd->rcd[i]);
 }
 }
 
 
 /*
 /*
@@ -6753,7 +6802,7 @@ static void rxe_freeze(struct hfi1_devdata *dd)
 static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
 static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
 {
 {
 	u32 rcvmask;
 	u32 rcvmask;
-	int i;
+	u16 i;
 
 
 	/* enable all kernel contexts */
 	/* enable all kernel contexts */
 	for (i = 0; i < dd->num_rcv_contexts; i++) {
 	for (i = 0; i < dd->num_rcv_contexts; i++) {
@@ -6765,9 +6814,9 @@ static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
 
 
 		rcvmask = HFI1_RCVCTRL_CTXT_ENB;
 		rcvmask = HFI1_RCVCTRL_CTXT_ENB;
 		/* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
 		/* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
-		rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+		rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
 			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
 			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
-		hfi1_rcvctrl(dd, rcvmask, i);
+		hfi1_rcvctrl(dd, rcvmask, rcd);
 	}
 	}
 
 
 	/* enable port */
 	/* enable port */
@@ -6906,7 +6955,7 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
 
 
 static const char * const link_down_reason_strs[] = {
 static const char * const link_down_reason_strs[] = {
 	[OPA_LINKDOWN_REASON_NONE] = "None",
 	[OPA_LINKDOWN_REASON_NONE] = "None",
-	[OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0",
+	[OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0",
 	[OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
 	[OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
 	[OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
 	[OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
 	[OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
 	[OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
@@ -6996,6 +7045,7 @@ void handle_link_down(struct work_struct *work)
 	/* Go offline first, then deal with reading/writing through 8051 */
 	/* Go offline first, then deal with reading/writing through 8051 */
 	was_up = !!(ppd->host_link_state & HLS_UP);
 	was_up = !!(ppd->host_link_state & HLS_UP);
 	set_link_state(ppd, HLS_DN_OFFLINE);
 	set_link_state(ppd, HLS_DN_OFFLINE);
+	xchg(&ppd->is_link_down_queued, 0);
 
 
 	if (was_up) {
 	if (was_up) {
 		lcl_reason = 0;
 		lcl_reason = 0;
@@ -7689,12 +7739,12 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
 			host_msg &= ~(u64)HOST_REQ_DONE;
 			host_msg &= ~(u64)HOST_REQ_DONE;
 		}
 		}
 		if (host_msg & BC_SMA_MSG) {
 		if (host_msg & BC_SMA_MSG) {
-			queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
+			queue_work(ppd->link_wq, &ppd->sma_message_work);
 			host_msg &= ~(u64)BC_SMA_MSG;
 			host_msg &= ~(u64)BC_SMA_MSG;
 		}
 		}
 		if (host_msg & LINKUP_ACHIEVED) {
 		if (host_msg & LINKUP_ACHIEVED) {
 			dd_dev_info(dd, "8051: Link up\n");
 			dd_dev_info(dd, "8051: Link up\n");
-			queue_work(ppd->hfi1_wq, &ppd->link_up_work);
+			queue_work(ppd->link_wq, &ppd->link_up_work);
 			host_msg &= ~(u64)LINKUP_ACHIEVED;
 			host_msg &= ~(u64)LINKUP_ACHIEVED;
 		}
 		}
 		if (host_msg & EXT_DEVICE_CFG_REQ) {
 		if (host_msg & EXT_DEVICE_CFG_REQ) {
@@ -7702,7 +7752,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
 			host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
 			host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
 		}
 		}
 		if (host_msg & VERIFY_CAP_FRAME) {
 		if (host_msg & VERIFY_CAP_FRAME) {
-			queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
+			queue_work(ppd->link_wq, &ppd->link_vc_work);
 			host_msg &= ~(u64)VERIFY_CAP_FRAME;
 			host_msg &= ~(u64)VERIFY_CAP_FRAME;
 		}
 		}
 		if (host_msg & LINK_GOING_DOWN) {
 		if (host_msg & LINK_GOING_DOWN) {
@@ -7717,7 +7767,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
 			host_msg &= ~(u64)LINK_GOING_DOWN;
 			host_msg &= ~(u64)LINK_GOING_DOWN;
 		}
 		}
 		if (host_msg & LINK_WIDTH_DOWNGRADED) {
 		if (host_msg & LINK_WIDTH_DOWNGRADED) {
-			queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
+			queue_work(ppd->link_wq, &ppd->link_downgrade_work);
 			host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
 			host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
 		}
 		}
 		if (host_msg) {
 		if (host_msg) {
@@ -7756,11 +7806,12 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
 		 */
 		 */
 		if ((ppd->host_link_state &
 		if ((ppd->host_link_state &
 		    (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
 		    (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
-		    ppd->link_enabled == 0) {
+		    ppd->link_enabled == 0 || ppd->is_link_down_queued) {
 			dd_dev_info(dd, "%s: not queuing link down\n",
 			dd_dev_info(dd, "%s: not queuing link down\n",
 				    __func__);
 				    __func__);
 		} else {
 		} else {
-			queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+			xchg(&ppd->is_link_down_queued, 1);
+			queue_work(ppd->link_wq, &ppd->link_down_work);
 		}
 		}
 	}
 	}
 }
 }
@@ -7968,7 +8019,7 @@ static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
 		dd_dev_info_ratelimited(dd, "%s: PortErrorAction bounce\n",
 		dd_dev_info_ratelimited(dd, "%s: PortErrorAction bounce\n",
 					__func__);
 					__func__);
 		set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
 		set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
-		queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+		queue_work(ppd->link_wq, &ppd->link_bounce_work);
 	}
 	}
 }
 }
 
 
@@ -8781,6 +8832,20 @@ static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
 			& REMOTE_DEVICE_REV_MASK;
 			& REMOTE_DEVICE_REV_MASK;
 }
 }
 
 
+int write_host_interface_version(struct hfi1_devdata *dd, u8 version)
+{
+	u32 frame;
+	u32 mask;
+
+	mask = (HOST_INTERFACE_VERSION_MASK << HOST_INTERFACE_VERSION_SHIFT);
+	read_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, &frame);
+	/* Clear, then set field */
+	frame &= ~mask;
+	frame |= ((u32)version << HOST_INTERFACE_VERSION_SHIFT);
+	return load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG,
+				frame);
+}
+
 void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor,
 void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor,
 		      u8 *ver_patch)
 		      u8 *ver_patch)
 {
 {
@@ -9257,12 +9322,6 @@ int start_link(struct hfi1_pportdata *ppd)
 	 */
 	 */
 	tune_serdes(ppd);
 	tune_serdes(ppd);
 
 
-	if (!ppd->link_enabled) {
-		dd_dev_info(ppd->dd,
-			    "%s: stopping link start because link is disabled\n",
-			    __func__);
-		return 0;
-	}
 	if (!ppd->driver_link_ready) {
 	if (!ppd->driver_link_ready) {
 		dd_dev_info(ppd->dd,
 		dd_dev_info(ppd->dd,
 			    "%s: stopping link start because driver is not ready\n",
 			    "%s: stopping link start because driver is not ready\n",
@@ -9373,13 +9432,13 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
 
 
 	if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
 	if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
 	    (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
 	    (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
-		dd_dev_info(dd, "%s: QSFP cable temperature too high\n",
-			    __func__);
+		dd_dev_err(dd, "%s: QSFP cable temperature too high\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
 	if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
 	    (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
 	    (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
-		dd_dev_info(dd, "%s: QSFP cable temperature too low\n",
-			    __func__);
+		dd_dev_err(dd, "%s: QSFP cable temperature too low\n",
+			   __func__);
 
 
 	/*
 	/*
 	 * The remaining alarms/warnings don't matter if the link is down.
 	 * The remaining alarms/warnings don't matter if the link is down.
@@ -9389,75 +9448,75 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
 
 
 	if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
 	if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
 	    (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
 	    (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
-		dd_dev_info(dd, "%s: QSFP supply voltage too high\n",
-			    __func__);
+		dd_dev_err(dd, "%s: QSFP supply voltage too high\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
 	if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
 	    (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
 	    (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
-		dd_dev_info(dd, "%s: QSFP supply voltage too low\n",
-			    __func__);
+		dd_dev_err(dd, "%s: QSFP supply voltage too low\n",
+			   __func__);
 
 
 	/* Byte 2 is vendor specific */
 	/* Byte 2 is vendor specific */
 
 
 	if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
 	if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
 	    (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
 	    (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
-		dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable RX channel 1/2 power too high\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
 	if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
 	    (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
 	    (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
-		dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable RX channel 1/2 power too low\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
 	if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
 	    (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
 	    (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
-		dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable RX channel 3/4 power too high\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
 	if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
 	    (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
 	    (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
-		dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable RX channel 3/4 power too low\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
 	if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
 	    (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
 	    (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
-		dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too high\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
 	if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
 	    (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
 	    (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
-		dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable TX channel 1/2 bias too low\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
 	if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
 	    (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
 	    (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
-		dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too high\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
 	if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
 	    (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
 	    (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
-		dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable TX channel 3/4 bias too low\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
 	if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
 	    (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
 	    (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
-		dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable TX channel 1/2 power too high\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
 	if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
 	    (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
 	    (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
-		dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable TX channel 1/2 power too low\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
 	if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
 	    (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
 	    (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
-		dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable TX channel 3/4 power too high\n",
+			   __func__);
 
 
 	if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
 	if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
 	    (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
 	    (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
-		dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n",
-			    __func__);
+		dd_dev_err(dd, "%s: Cable TX channel 3/4 power too low\n",
+			   __func__);
 
 
 	/* Bytes 9-10 and 11-12 are reserved */
 	/* Bytes 9-10 and 11-12 are reserved */
 	/* Bytes 13-15 are vendor specific */
 	/* Bytes 13-15 are vendor specific */
@@ -9480,6 +9539,13 @@ void qsfp_event(struct work_struct *work)
 	if (!qsfp_mod_present(ppd))
 	if (!qsfp_mod_present(ppd))
 		return;
 		return;
 
 
+	if (ppd->host_link_state == HLS_DN_DISABLE) {
+		dd_dev_info(ppd->dd,
+			    "%s: stopping link start because link is disabled\n",
+			    __func__);
+		return;
+	}
+
 	/*
 	/*
 	 * Turn DC back on after cable has been re-inserted. Up until
 	 * Turn DC back on after cable has been re-inserted. Up until
 	 * now, the DC has been in reset to save power.
 	 * now, the DC has been in reset to save power.
@@ -9635,7 +9701,7 @@ static void try_start_link(struct hfi1_pportdata *ppd)
 			    "QSFP not responding, waiting and retrying %d\n",
 			    "QSFP not responding, waiting and retrying %d\n",
 			    (int)ppd->qsfp_retry_count);
 			    (int)ppd->qsfp_retry_count);
 		ppd->qsfp_retry_count++;
 		ppd->qsfp_retry_count++;
-		queue_delayed_work(ppd->hfi1_wq, &ppd->start_link_work,
+		queue_delayed_work(ppd->link_wq, &ppd->start_link_work,
 				   msecs_to_jiffies(QSFP_RETRY_WAIT));
 				   msecs_to_jiffies(QSFP_RETRY_WAIT));
 		return;
 		return;
 	}
 	}
@@ -9742,17 +9808,6 @@ static inline int init_cpu_counters(struct hfi1_devdata *dd)
 	return 0;
 	return 0;
 }
 }
 
 
-static const char * const pt_names[] = {
-	"expected",
-	"eager",
-	"invalid"
-};
-
-static const char *pt_name(u32 type)
-{
-	return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
-}
-
 /*
 /*
  * index is the index into the receive array
  * index is the index into the receive array
  */
  */
@@ -9760,35 +9815,34 @@ void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
 		  u32 type, unsigned long pa, u16 order)
 		  u32 type, unsigned long pa, u16 order)
 {
 {
 	u64 reg;
 	u64 reg;
-	void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
-			      (dd->kregbase + RCV_ARRAY));
 
 
 	if (!(dd->flags & HFI1_PRESENT))
 	if (!(dd->flags & HFI1_PRESENT))
 		goto done;
 		goto done;
 
 
-	if (type == PT_INVALID) {
+	if (type == PT_INVALID || type == PT_INVALID_FLUSH) {
 		pa = 0;
 		pa = 0;
+		order = 0;
 	} else if (type > PT_INVALID) {
 	} else if (type > PT_INVALID) {
 		dd_dev_err(dd,
 		dd_dev_err(dd,
 			   "unexpected receive array type %u for index %u, not handled\n",
 			   "unexpected receive array type %u for index %u, not handled\n",
 			   type, index);
 			   type, index);
 		goto done;
 		goto done;
 	}
 	}
-
-	hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
-		  pt_name(type), index, pa, (unsigned long)order);
+	trace_hfi1_put_tid(dd, index, type, pa, order);
 
 
 #define RT_ADDR_SHIFT 12	/* 4KB kernel address boundary */
 #define RT_ADDR_SHIFT 12	/* 4KB kernel address boundary */
 	reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
 	reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
 		| (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
 		| (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
 		| ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
 		| ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
 					<< RCV_ARRAY_RT_ADDR_SHIFT;
 					<< RCV_ARRAY_RT_ADDR_SHIFT;
-	writeq(reg, base + (index * 8));
+	trace_hfi1_write_rcvarray(dd->rcvarray_wc + (index * 8), reg);
+	writeq(reg, dd->rcvarray_wc + (index * 8));
 
 
-	if (type == PT_EAGER)
+	if (type == PT_EAGER || type == PT_INVALID_FLUSH || (index & 3) == 3)
 		/*
 		/*
-		 * Eager entries are written one-by-one so we have to push them
-		 * after we write the entry.
+		 * Eager entries are written and flushed
+		 *
+		 * Expected entries are flushed every 4 writes
 		 */
 		 */
 		flush_wc();
 		flush_wc();
 done:
 done:
@@ -9810,15 +9864,6 @@ void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
 		hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
 		hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
 }
 }
 
 
-struct ib_header *hfi1_get_msgheader(
-	struct hfi1_devdata *dd, __le32 *rhf_addr)
-{
-	u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
-
-	return (struct ib_header *)
-		(rhf_addr - dd->rhf_offset + offset);
-}
-
 static const char * const ib_cfg_name_strings[] = {
 static const char * const ib_cfg_name_strings[] = {
 	"HFI1_IB_CFG_LIDLMC",
 	"HFI1_IB_CFG_LIDLMC",
 	"HFI1_IB_CFG_LWID_DG_ENB",
 	"HFI1_IB_CFG_LWID_DG_ENB",
@@ -10037,28 +10082,6 @@ static void set_lidlmc(struct hfi1_pportdata *ppd)
 	sdma_update_lmc(dd, mask, ppd->lid);
 	sdma_update_lmc(dd, mask, ppd->lid);
 }
 }
 
 
-static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
-{
-	unsigned long timeout;
-	u32 curr_state;
-
-	timeout = jiffies + msecs_to_jiffies(msecs);
-	while (1) {
-		curr_state = read_physical_state(dd);
-		if (curr_state == state)
-			break;
-		if (time_after(jiffies, timeout)) {
-			dd_dev_err(dd,
-				   "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
-				   state, curr_state);
-			return -ETIMEDOUT;
-		}
-		usleep_range(1950, 2050); /* sleep 2ms-ish */
-	}
-
-	return 0;
-}
-
 static const char *state_completed_string(u32 completed)
 static const char *state_completed_string(u32 completed)
 {
 {
 	static const char * const state_completed[] = {
 	static const char * const state_completed[] = {
@@ -10253,49 +10276,35 @@ static void force_logical_link_state_down(struct hfi1_pportdata *ppd)
 static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
 static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
 {
 {
 	struct hfi1_devdata *dd = ppd->dd;
 	struct hfi1_devdata *dd = ppd->dd;
-	u32 pstate, previous_state;
+	u32 previous_state;
 	int ret;
 	int ret;
-	int do_transition;
-	int do_wait;
 
 
 	update_lcb_cache(dd);
 	update_lcb_cache(dd);
 
 
 	previous_state = ppd->host_link_state;
 	previous_state = ppd->host_link_state;
 	ppd->host_link_state = HLS_GOING_OFFLINE;
 	ppd->host_link_state = HLS_GOING_OFFLINE;
-	pstate = read_physical_state(dd);
-	if (pstate == PLS_OFFLINE) {
-		do_transition = 0;	/* in right state */
-		do_wait = 0;		/* ...no need to wait */
-	} else if ((pstate & 0xf0) == PLS_OFFLINE) {
-		do_transition = 0;	/* in an offline transient state */
-		do_wait = 1;		/* ...wait for it to settle */
-	} else {
-		do_transition = 1;	/* need to move to offline */
-		do_wait = 1;		/* ...will need to wait */
-	}
 
 
-	if (do_transition) {
-		ret = set_physical_link_state(dd,
-					      (rem_reason << 8) | PLS_OFFLINE);
+	/* start offline transition */
+	ret = set_physical_link_state(dd, (rem_reason << 8) | PLS_OFFLINE);
 
 
-		if (ret != HCMD_SUCCESS) {
-			dd_dev_err(dd,
-				   "Failed to transition to Offline link state, return %d\n",
-				   ret);
-			return -EINVAL;
-		}
-		if (ppd->offline_disabled_reason ==
-				HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
-			ppd->offline_disabled_reason =
-			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd,
+			   "Failed to transition to Offline link state, return %d\n",
+			   ret);
+		return -EINVAL;
 	}
 	}
+	if (ppd->offline_disabled_reason ==
+			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
+		ppd->offline_disabled_reason =
+		HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
 
 
-	if (do_wait) {
-		/* it can take a while for the link to go down */
-		ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000);
-		if (ret < 0)
-			return ret;
-	}
+	/*
+	 * Wait for offline transition. It can take a while for
+	 * the link to go down.
+	 */
+	ret = wait_physical_linkstate(ppd, PLS_OFFLINE, 10000);
+	if (ret < 0)
+		return ret;
 
 
 	/*
 	/*
 	 * Now in charge of LCB - must be after the physical state is
 	 * Now in charge of LCB - must be after the physical state is
@@ -10483,6 +10492,14 @@ void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
 	}
 	}
 }
 }
 
 
+/*
+ * Verify if BCT for data VLs is non-zero.
+ */
+static inline bool data_vls_operational(struct hfi1_pportdata *ppd)
+{
+	return !!ppd->actual_vls_operational;
+}
+
 /*
 /*
  * Change the physical and/or logical link state.
  * Change the physical and/or logical link state.
  *
  *
@@ -10545,38 +10562,58 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 			goto unexpected;
 			goto unexpected;
 		}
 		}
 
 
+		/*
+		 * Wait for Link_Up physical state.
+		 * Physical and Logical states should already be
+		 * transitioned to LinkUp and LinkInit respectively.
+		 */
+		ret = wait_physical_linkstate(ppd, PLS_LINKUP, 1000);
+		if (ret) {
+			dd_dev_err(dd,
+				   "%s: physical state did not change to LINK-UP\n",
+				   __func__);
+			break;
+		}
+
 		ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
 		ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
 		if (ret) {
 		if (ret) {
 			dd_dev_err(dd,
 			dd_dev_err(dd,
 				   "%s: logical state did not change to INIT\n",
 				   "%s: logical state did not change to INIT\n",
 				   __func__);
 				   __func__);
-		} else {
-			/* clear old transient LINKINIT_REASON code */
-			if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
-				ppd->linkinit_reason =
-					OPA_LINKINIT_REASON_LINKUP;
+			break;
+		}
 
 
-			/* enable the port */
-			add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+		/* clear old transient LINKINIT_REASON code */
+		if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
+			ppd->linkinit_reason =
+				OPA_LINKINIT_REASON_LINKUP;
 
 
-			handle_linkup_change(dd, 1);
-			ppd->host_link_state = HLS_UP_INIT;
-		}
+		/* enable the port */
+		add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+		handle_linkup_change(dd, 1);
+		ppd->host_link_state = HLS_UP_INIT;
 		break;
 		break;
 	case HLS_UP_ARMED:
 	case HLS_UP_ARMED:
 		if (ppd->host_link_state != HLS_UP_INIT)
 		if (ppd->host_link_state != HLS_UP_INIT)
 			goto unexpected;
 			goto unexpected;
 
 
-		ppd->host_link_state = HLS_UP_ARMED;
+		if (!data_vls_operational(ppd)) {
+			dd_dev_err(dd,
+				   "%s: data VLs not operational\n", __func__);
+			ret = -EINVAL;
+			break;
+		}
+
 		set_logical_state(dd, LSTATE_ARMED);
 		set_logical_state(dd, LSTATE_ARMED);
 		ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
 		ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
 		if (ret) {
 		if (ret) {
-			/* logical state didn't change, stay at init */
-			ppd->host_link_state = HLS_UP_INIT;
 			dd_dev_err(dd,
 			dd_dev_err(dd,
 				   "%s: logical state did not change to ARMED\n",
 				   "%s: logical state did not change to ARMED\n",
 				   __func__);
 				   __func__);
+			break;
 		}
 		}
+		ppd->host_link_state = HLS_UP_ARMED;
 		/*
 		/*
 		 * The simulator does not currently implement SMA messages,
 		 * The simulator does not currently implement SMA messages,
 		 * so neighbor_normal is not set.  Set it here when we first
 		 * so neighbor_normal is not set.  Set it here when we first
@@ -10589,18 +10626,16 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 		if (ppd->host_link_state != HLS_UP_ARMED)
 		if (ppd->host_link_state != HLS_UP_ARMED)
 			goto unexpected;
 			goto unexpected;
 
 
-		ppd->host_link_state = HLS_UP_ACTIVE;
 		set_logical_state(dd, LSTATE_ACTIVE);
 		set_logical_state(dd, LSTATE_ACTIVE);
 		ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
 		ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
 		if (ret) {
 		if (ret) {
-			/* logical state didn't change, stay at armed */
-			ppd->host_link_state = HLS_UP_ARMED;
 			dd_dev_err(dd,
 			dd_dev_err(dd,
 				   "%s: logical state did not change to ACTIVE\n",
 				   "%s: logical state did not change to ACTIVE\n",
 				   __func__);
 				   __func__);
 		} else {
 		} else {
 			/* tell all engines to go running */
 			/* tell all engines to go running */
 			sdma_all_running(dd);
 			sdma_all_running(dd);
+			ppd->host_link_state = HLS_UP_ACTIVE;
 
 
 			/* Signal the IB layer that the port has gone active */
 			/* Signal the IB layer that the port has gone active */
 			event.device = &dd->verbs_dev.rdi.ibdev;
 			event.device = &dd->verbs_dev.rdi.ibdev;
@@ -10658,6 +10693,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 		 */
 		 */
 		if (ret)
 		if (ret)
 			goto_offline(ppd, 0);
 			goto_offline(ppd, 0);
+		else
+			cache_physical_state(ppd);
 		break;
 		break;
 	case HLS_DN_DISABLE:
 	case HLS_DN_DISABLE:
 		/* link is disabled */
 		/* link is disabled */
@@ -10682,6 +10719,13 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 				ret = -EINVAL;
 				ret = -EINVAL;
 				break;
 				break;
 			}
 			}
+			ret = wait_physical_linkstate(ppd, PLS_DISABLED, 10000);
+			if (ret) {
+				dd_dev_err(dd,
+					   "%s: physical state did not change to DISABLED\n",
+					   __func__);
+				break;
+			}
 			dc_shutdown(dd);
 			dc_shutdown(dd);
 		}
 		}
 		ppd->host_link_state = HLS_DN_DISABLE;
 		ppd->host_link_state = HLS_DN_DISABLE;
@@ -10699,6 +10743,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
 		if (ppd->host_link_state != HLS_DN_POLL)
 		if (ppd->host_link_state != HLS_DN_POLL)
 			goto unexpected;
 			goto unexpected;
 		ppd->host_link_state = HLS_VERIFY_CAP;
 		ppd->host_link_state = HLS_VERIFY_CAP;
+		cache_physical_state(ppd);
 		break;
 		break;
 	case HLS_GOING_UP:
 	case HLS_GOING_UP:
 		if (ppd->host_link_state != HLS_VERIFY_CAP)
 		if (ppd->host_link_state != HLS_VERIFY_CAP)
@@ -11693,16 +11738,18 @@ static u32 encoded_size(u32 size)
 	return 0x1;	/* if invalid, go with the minimum size */
 	return 0x1;	/* if invalid, go with the minimum size */
 }
 }
 
 
-void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op,
+		  struct hfi1_ctxtdata *rcd)
 {
 {
-	struct hfi1_ctxtdata *rcd;
 	u64 rcvctrl, reg;
 	u64 rcvctrl, reg;
 	int did_enable = 0;
 	int did_enable = 0;
+	u16 ctxt;
 
 
-	rcd = dd->rcd[ctxt];
 	if (!rcd)
 	if (!rcd)
 		return;
 		return;
 
 
+	ctxt = rcd->ctxt;
+
 	hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
 	hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
 
 
 	rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
 	rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
@@ -12672,21 +12719,56 @@ static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 	return -ETIMEDOUT;
 	return -ETIMEDOUT;
 }
 }
 
 
-u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
+/*
+ * Read the physical hardware link state and set the driver's cached value
+ * of it.
+ */
+void cache_physical_state(struct hfi1_pportdata *ppd)
 {
 {
-	u32 pstate;
+	u32 read_pstate;
 	u32 ib_pstate;
 	u32 ib_pstate;
 
 
-	pstate = read_physical_state(ppd->dd);
-	ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
-	if (ppd->last_pstate != ib_pstate) {
+	read_pstate = read_physical_state(ppd->dd);
+	ib_pstate = chip_to_opa_pstate(ppd->dd, read_pstate);
+	/* check if OPA pstate changed */
+	if (chip_to_opa_pstate(ppd->dd, ppd->pstate) != ib_pstate) {
 		dd_dev_info(ppd->dd,
 		dd_dev_info(ppd->dd,
 			    "%s: physical state changed to %s (0x%x), phy 0x%x\n",
 			    "%s: physical state changed to %s (0x%x), phy 0x%x\n",
 			    __func__, opa_pstate_name(ib_pstate), ib_pstate,
 			    __func__, opa_pstate_name(ib_pstate), ib_pstate,
-			    pstate);
-		ppd->last_pstate = ib_pstate;
+			    read_pstate);
 	}
 	}
-	return ib_pstate;
+	ppd->pstate = read_pstate;
+}
+
+/*
+ * wait_physical_linkstate - wait for a physical link state change to occur
+ * @ppd: port device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for physical link state change to occur.
+ * Returns 0 if state reached, otherwise -ETIMEDOUT.
+ */
+static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+				   int msecs)
+{
+	unsigned long timeout;
+
+	timeout = jiffies + msecs_to_jiffies(msecs);
+	while (1) {
+		cache_physical_state(ppd);
+		if (ppd->pstate == state)
+			break;
+		if (time_after(jiffies, timeout)) {
+			dd_dev_err(ppd->dd,
+				   "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+				   state, ppd->pstate);
+			return -ETIMEDOUT;
+		}
+		usleep_range(1950, 2050); /* sleep 2ms-ish */
+	}
+
+	return 0;
 }
 }
 
 
 #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
 #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
@@ -12809,30 +12891,24 @@ static void clean_up_interrupts(struct hfi1_devdata *dd)
 		for (i = 0; i < dd->num_msix_entries; i++, me++) {
 		for (i = 0; i < dd->num_msix_entries; i++, me++) {
 			if (!me->arg) /* => no irq, no affinity */
 			if (!me->arg) /* => no irq, no affinity */
 				continue;
 				continue;
-			hfi1_put_irq_affinity(dd, &dd->msix_entries[i]);
-			free_irq(me->msix.vector, me->arg);
+			hfi1_put_irq_affinity(dd, me);
+			free_irq(me->irq, me->arg);
 		}
 		}
+
+		/* clean structures */
+		kfree(dd->msix_entries);
+		dd->msix_entries = NULL;
+		dd->num_msix_entries = 0;
 	} else {
 	} else {
 		/* INTx */
 		/* INTx */
 		if (dd->requested_intx_irq) {
 		if (dd->requested_intx_irq) {
 			free_irq(dd->pcidev->irq, dd);
 			free_irq(dd->pcidev->irq, dd);
 			dd->requested_intx_irq = 0;
 			dd->requested_intx_irq = 0;
 		}
 		}
-	}
-
-	/* turn off interrupts */
-	if (dd->num_msix_entries) {
-		/* MSI-X */
-		pci_disable_msix(dd->pcidev);
-	} else {
-		/* INTx */
 		disable_intx(dd->pcidev);
 		disable_intx(dd->pcidev);
 	}
 	}
 
 
-	/* clean structures */
-	kfree(dd->msix_entries);
-	dd->msix_entries = NULL;
-	dd->num_msix_entries = 0;
+	pci_free_irq_vectors(dd->pcidev);
 }
 }
 
 
 /*
 /*
@@ -12986,13 +13062,21 @@ static int request_msix_irqs(struct hfi1_devdata *dd)
 			continue;
 			continue;
 		/* make sure the name is terminated */
 		/* make sure the name is terminated */
 		me->name[sizeof(me->name) - 1] = 0;
 		me->name[sizeof(me->name) - 1] = 0;
+		me->irq = pci_irq_vector(dd->pcidev, i);
+		/*
+		 * On err return me->irq.  Don't need to clear this
+		 * because 'arg' has not been set, and cleanup will
+		 * do the right thing.
+		 */
+		if (me->irq < 0)
+			return me->irq;
 
 
-		ret = request_threaded_irq(me->msix.vector, handler, thread, 0,
+		ret = request_threaded_irq(me->irq, handler, thread, 0,
 					   me->name, arg);
 					   me->name, arg);
 		if (ret) {
 		if (ret) {
 			dd_dev_err(dd,
 			dd_dev_err(dd,
-				   "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
-				   err_info, me->msix.vector, idx, ret);
+				   "unable to allocate %s interrupt, irq %d, index %d, err %d\n",
+				   err_info, me->irq, idx, ret);
 			return ret;
 			return ret;
 		}
 		}
 		/*
 		/*
@@ -13003,8 +13087,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd)
 
 
 		ret = hfi1_get_irq_affinity(dd, me);
 		ret = hfi1_get_irq_affinity(dd, me);
 		if (ret)
 		if (ret)
-			dd_dev_err(dd,
-				   "unable to pin IRQ %d\n", ret);
+			dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
 	}
 	}
 
 
 	return ret;
 	return ret;
@@ -13023,7 +13106,7 @@ void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd)
 		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
 		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
 		struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
 		struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
 
 
-		synchronize_irq(me->msix.vector);
+		synchronize_irq(me->irq);
 	}
 	}
 }
 }
 
 
@@ -13036,7 +13119,7 @@ void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd)
 		return;
 		return;
 
 
 	hfi1_put_irq_affinity(dd, me);
 	hfi1_put_irq_affinity(dd, me);
-	free_irq(me->msix.vector, me->arg);
+	free_irq(me->irq, me->arg);
 
 
 	me->arg = NULL;
 	me->arg = NULL;
 }
 }
@@ -13064,14 +13147,19 @@ void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd)
 		 DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
 		 DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
 	me->name[sizeof(me->name) - 1] = 0;
 	me->name[sizeof(me->name) - 1] = 0;
 	me->type = IRQ_RCVCTXT;
 	me->type = IRQ_RCVCTXT;
-
+	me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr);
+	if (me->irq < 0) {
+		dd_dev_err(dd, "vnic irq vector request (idx %d) fail %d\n",
+			   idx, me->irq);
+		return;
+	}
 	remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr);
 	remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr);
 
 
-	ret = request_threaded_irq(me->msix.vector, receive_context_interrupt,
+	ret = request_threaded_irq(me->irq, receive_context_interrupt,
 				   receive_context_thread, 0, me->name, arg);
 				   receive_context_thread, 0, me->name, arg);
 	if (ret) {
 	if (ret) {
-		dd_dev_err(dd, "vnic irq request (vector %d, idx %d) fail %d\n",
-			   me->msix.vector, idx, ret);
+		dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n",
+			   me->irq, idx, ret);
 		return;
 		return;
 	}
 	}
 	/*
 	/*
@@ -13084,7 +13172,7 @@ void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd)
 	if (ret) {
 	if (ret) {
 		dd_dev_err(dd,
 		dd_dev_err(dd,
 			   "unable to pin IRQ %d\n", ret);
 			   "unable to pin IRQ %d\n", ret);
-		free_irq(me->msix.vector, me->arg);
+		free_irq(me->irq, me->arg);
 	}
 	}
 }
 }
 
 
@@ -13107,9 +13195,8 @@ static void reset_interrupts(struct hfi1_devdata *dd)
 
 
 static int set_up_interrupts(struct hfi1_devdata *dd)
 static int set_up_interrupts(struct hfi1_devdata *dd)
 {
 {
-	struct hfi1_msix_entry *entries;
-	u32 total, request;
-	int i, ret;
+	u32 total;
+	int ret, request;
 	int single_interrupt = 0; /* we expect to have all the interrupts */
 	int single_interrupt = 0; /* we expect to have all the interrupts */
 
 
 	/*
 	/*
@@ -13121,39 +13208,31 @@ static int set_up_interrupts(struct hfi1_devdata *dd)
 	 */
 	 */
 	total = 1 + dd->num_sdma + dd->n_krcv_queues + HFI1_NUM_VNIC_CTXT;
 	total = 1 + dd->num_sdma + dd->n_krcv_queues + HFI1_NUM_VNIC_CTXT;
 
 
-	entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
-	if (!entries) {
-		ret = -ENOMEM;
-		goto fail;
-	}
-	/* 1-1 MSI-X entry assignment */
-	for (i = 0; i < total; i++)
-		entries[i].msix.entry = i;
-
 	/* ask for MSI-X interrupts */
 	/* ask for MSI-X interrupts */
-	request = total;
-	request_msix(dd, &request, entries);
-
-	if (request == 0) {
+	request = request_msix(dd, total);
+	if (request < 0) {
+		ret = request;
+		goto fail;
+	} else if (request == 0) {
 		/* using INTx */
 		/* using INTx */
 		/* dd->num_msix_entries already zero */
 		/* dd->num_msix_entries already zero */
-		kfree(entries);
 		single_interrupt = 1;
 		single_interrupt = 1;
 		dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
 		dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
+	} else if (request < total) {
+		/* using MSI-X, with reduced interrupts */
+		dd_dev_err(dd, "reduced interrupt found, wanted %u, got %u\n",
+			   total, request);
+		ret = -EINVAL;
+		goto fail;
 	} else {
 	} else {
-		/* using MSI-X */
-		dd->num_msix_entries = request;
-		dd->msix_entries = entries;
-
-		if (request != total) {
-			/* using MSI-X, with reduced interrupts */
-			dd_dev_err(
-				dd,
-				"cannot handle reduced interrupt case, want %u, got %u\n",
-				total, request);
-			ret = -EINVAL;
+		dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries),
+					   GFP_KERNEL);
+		if (!dd->msix_entries) {
+			ret = -ENOMEM;
 			goto fail;
 			goto fail;
 		}
 		}
+		/* using MSI-X */
+		dd->num_msix_entries = total;
 		dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
 		dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
 	}
 	}
 
 
@@ -13396,8 +13475,7 @@ static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
 
 
 	/* RcvArray */
 	/* RcvArray */
 	for (i = 0; i < dd->chip_rcv_array_count; i++)
 	for (i = 0; i < dd->chip_rcv_array_count; i++)
-		write_csr(dd, RCV_ARRAY + (8 * i),
-			  RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+		hfi1_put_tid(dd, i, PT_INVALID_FLUSH, 0, 0);
 
 
 	/* RcvQPMapTable */
 	/* RcvQPMapTable */
 	for (i = 0; i < 32; i++)
 	for (i = 0; i < 32; i++)
@@ -13831,9 +13909,10 @@ static void init_sc2vl_tables(struct hfi1_devdata *dd)
  * a reset following the (possible) FLR in this routine.
  * a reset following the (possible) FLR in this routine.
  *
  *
  */
  */
-static void init_chip(struct hfi1_devdata *dd)
+static int init_chip(struct hfi1_devdata *dd)
 {
 {
 	int i;
 	int i;
+	int ret = 0;
 
 
 	/*
 	/*
 	 * Put the HFI CSRs in a known state.
 	 * Put the HFI CSRs in a known state.
@@ -13881,12 +13960,22 @@ static void init_chip(struct hfi1_devdata *dd)
 		pcie_flr(dd->pcidev);
 		pcie_flr(dd->pcidev);
 
 
 		/* restore command and BARs */
 		/* restore command and BARs */
-		restore_pci_variables(dd);
+		ret = restore_pci_variables(dd);
+		if (ret) {
+			dd_dev_err(dd, "%s: Could not restore PCI variables\n",
+				   __func__);
+			return ret;
+		}
 
 
 		if (is_ax(dd)) {
 		if (is_ax(dd)) {
 			dd_dev_info(dd, "Resetting CSRs with FLR\n");
 			dd_dev_info(dd, "Resetting CSRs with FLR\n");
 			pcie_flr(dd->pcidev);
 			pcie_flr(dd->pcidev);
-			restore_pci_variables(dd);
+			ret = restore_pci_variables(dd);
+			if (ret) {
+				dd_dev_err(dd, "%s: Could not restore PCI variables\n",
+					   __func__);
+				return ret;
+			}
 		}
 		}
 	} else {
 	} else {
 		dd_dev_info(dd, "Resetting CSRs with writes\n");
 		dd_dev_info(dd, "Resetting CSRs with writes\n");
@@ -13914,6 +14003,7 @@ static void init_chip(struct hfi1_devdata *dd)
 	write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
 	write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
 	write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
 	write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
 	init_chip_resources(dd);
 	init_chip_resources(dd);
+	return ret;
 }
 }
 
 
 static void init_early_variables(struct hfi1_devdata *dd)
 static void init_early_variables(struct hfi1_devdata *dd)
@@ -14470,99 +14560,86 @@ static void init_txe(struct hfi1_devdata *dd)
 		write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
 		write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
 }
 }
 
 
-int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd,
+		       u16 jkey)
 {
 {
-	struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
-	unsigned sctxt;
-	int ret = 0;
+	u8 hw_ctxt;
 	u64 reg;
 	u64 reg;
 
 
-	if (!rcd || !rcd->sc) {
-		ret = -EINVAL;
-		goto done;
-	}
-	sctxt = rcd->sc->hw_context;
+	if (!rcd || !rcd->sc)
+		return -EINVAL;
+
+	hw_ctxt = rcd->sc->hw_context;
 	reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
 	reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
 		((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
 		((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
 		 SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
 		 SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
 	/* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
 	/* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
 	if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
 	if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
 		reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
 		reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
-	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
+	write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
 	/*
 	/*
 	 * Enable send-side J_KEY integrity check, unless this is A0 h/w
 	 * Enable send-side J_KEY integrity check, unless this is A0 h/w
 	 */
 	 */
 	if (!is_ax(dd)) {
 	if (!is_ax(dd)) {
-		reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+		reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE);
 		reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
 		reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-		write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+		write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg);
 	}
 	}
 
 
 	/* Enable J_KEY check on receive context. */
 	/* Enable J_KEY check on receive context. */
 	reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
 	reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
 		((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
 		((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
 		 RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
 		 RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
-	write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
-done:
-	return ret;
+	write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, reg);
+
+	return 0;
 }
 }
 
 
-int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 {
 {
-	struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
-	unsigned sctxt;
-	int ret = 0;
+	u8 hw_ctxt;
 	u64 reg;
 	u64 reg;
 
 
-	if (!rcd || !rcd->sc) {
-		ret = -EINVAL;
-		goto done;
-	}
-	sctxt = rcd->sc->hw_context;
-	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
+	if (!rcd || !rcd->sc)
+		return -EINVAL;
+
+	hw_ctxt = rcd->sc->hw_context;
+	write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
 	/*
 	/*
 	 * Disable send-side J_KEY integrity check, unless this is A0 h/w.
 	 * Disable send-side J_KEY integrity check, unless this is A0 h/w.
 	 * This check would not have been enabled for A0 h/w, see
 	 * This check would not have been enabled for A0 h/w, see
 	 * set_ctxt_jkey().
 	 * set_ctxt_jkey().
 	 */
 	 */
 	if (!is_ax(dd)) {
 	if (!is_ax(dd)) {
-		reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+		reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE);
 		reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
 		reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
-		write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+		write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg);
 	}
 	}
 	/* Turn off the J_KEY on the receive side */
 	/* Turn off the J_KEY on the receive side */
-	write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
-done:
-	return ret;
+	write_kctxt_csr(dd, rcd->ctxt, RCV_KEY_CTRL, 0);
+
+	return 0;
 }
 }
 
 
-int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd,
+		       u16 pkey)
 {
 {
-	struct hfi1_ctxtdata *rcd;
-	unsigned sctxt;
-	int ret = 0;
+	u8 hw_ctxt;
 	u64 reg;
 	u64 reg;
 
 
-	if (ctxt < dd->num_rcv_contexts) {
-		rcd = dd->rcd[ctxt];
-	} else {
-		ret = -EINVAL;
-		goto done;
-	}
-	if (!rcd || !rcd->sc) {
-		ret = -EINVAL;
-		goto done;
-	}
-	sctxt = rcd->sc->hw_context;
+	if (!rcd || !rcd->sc)
+		return -EINVAL;
+
+	hw_ctxt = rcd->sc->hw_context;
 	reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
 	reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
 		SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
 		SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
-	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
-	reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+	write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
+	reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE);
 	reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
 	reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
 	reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
 	reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
-	write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
-done:
-	return ret;
+	write_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE, reg);
+
+	return 0;
 }
 }
 
 
 int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt)
 int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt)
@@ -14573,9 +14650,6 @@ int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt)
 	if (!ctxt || !ctxt->sc)
 	if (!ctxt || !ctxt->sc)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	if (ctxt->ctxt >= dd->num_rcv_contexts)
-		return -EINVAL;
-
 	hw_ctxt = ctxt->sc->hw_context;
 	hw_ctxt = ctxt->sc->hw_context;
 	reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE);
 	reg = read_kctxt_csr(dd, hw_ctxt, SEND_CTXT_CHECK_ENABLE);
 	reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
 	reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
@@ -14773,7 +14847,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 		}
 		}
 		ppd->vls_supported = num_vls;
 		ppd->vls_supported = num_vls;
 		ppd->vls_operational = ppd->vls_supported;
 		ppd->vls_operational = ppd->vls_supported;
-		ppd->actual_vls_operational = ppd->vls_supported;
 		/* Set the default MTU. */
 		/* Set the default MTU. */
 		for (vl = 0; vl < num_vls; vl++)
 		for (vl = 0; vl < num_vls; vl++)
 			dd->vld[vl].mtu = hfi1_max_mtu;
 			dd->vld[vl].mtu = hfi1_max_mtu;
@@ -14793,7 +14866,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 		/* start in offline */
 		/* start in offline */
 		ppd->host_link_state = HLS_DN_OFFLINE;
 		ppd->host_link_state = HLS_DN_OFFLINE;
 		init_vl_arb_caches(ppd);
 		init_vl_arb_caches(ppd);
-		ppd->last_pstate = 0xff; /* invalid value */
+		ppd->pstate = PLS_OFFLINE;
 	}
 	}
 
 
 	dd->link_default = HLS_DN_POLL;
 	dd->link_default = HLS_DN_POLL;
@@ -14807,6 +14880,11 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 	if (ret < 0)
 	if (ret < 0)
 		goto bail_free;
 		goto bail_free;
 
 
+	/* Save PCI space registers to rewrite after device reset */
+	ret = save_pci_variables(dd);
+	if (ret < 0)
+		goto bail_cleanup;
+
 	/* verify that reads actually work, save revision for reset check */
 	/* verify that reads actually work, save revision for reset check */
 	dd->revision = read_csr(dd, CCE_REVISION);
 	dd->revision = read_csr(dd, CCE_REVISION);
 	if (dd->revision == ~(u64)0) {
 	if (dd->revision == ~(u64)0) {
@@ -14899,7 +14977,9 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
 		goto bail_cleanup;
 		goto bail_cleanup;
 
 
 	/* obtain chip sizes, reset chip CSRs */
 	/* obtain chip sizes, reset chip CSRs */
-	init_chip(dd);
+	ret = init_chip(dd);
+	if (ret)
+		goto bail_cleanup;
 
 
 	/* read in the PCIe link speed information */
 	/* read in the PCIe link speed information */
 	ret = pcie_speeds(dd);
 	ret = pcie_speeds(dd);

+ 15 - 9
drivers/infiniband/hw/hfi1/chip.h

@@ -384,6 +384,7 @@
 #define VERIFY_CAP_LOCAL_FABRIC	     0x08
 #define VERIFY_CAP_LOCAL_FABRIC	     0x08
 #define VERIFY_CAP_LOCAL_LINK_WIDTH  0x09
 #define VERIFY_CAP_LOCAL_LINK_WIDTH  0x09
 #define LOCAL_DEVICE_ID		     0x0a
 #define LOCAL_DEVICE_ID		     0x0a
+#define RESERVED_REGISTERS	     0x0b
 #define LOCAL_LNI_INFO		     0x0c
 #define LOCAL_LNI_INFO		     0x0c
 #define REMOTE_LNI_INFO              0x0d
 #define REMOTE_LNI_INFO              0x0d
 #define MISC_STATUS		     0x0e
 #define MISC_STATUS		     0x0e
@@ -506,6 +507,9 @@
 #define DOWN_REMOTE_REASON_SHIFT 16
 #define DOWN_REMOTE_REASON_SHIFT 16
 #define DOWN_REMOTE_REASON_MASK  0xff
 #define DOWN_REMOTE_REASON_MASK  0xff
 
 
+#define HOST_INTERFACE_VERSION_SHIFT 16
+#define HOST_INTERFACE_VERSION_MASK  0xff
+
 /* verify capability PHY power management bits */
 /* verify capability PHY power management bits */
 #define PWRM_BER_CONTROL	0x1
 #define PWRM_BER_CONTROL	0x1
 #define PWRM_BANDWIDTH_CONTROL	0x2
 #define PWRM_BANDWIDTH_CONTROL	0x2
@@ -605,11 +609,11 @@ int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
 int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
 int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
 
 
 void __iomem *get_csr_addr(
 void __iomem *get_csr_addr(
-	struct hfi1_devdata *dd,
+	const struct hfi1_devdata *dd,
 	u32 offset);
 	u32 offset);
 
 
 static inline void __iomem *get_kctxt_csr_addr(
 static inline void __iomem *get_kctxt_csr_addr(
-	struct hfi1_devdata *dd,
+	const struct hfi1_devdata *dd,
 	int ctxt,
 	int ctxt,
 	u32 offset0)
 	u32 offset0)
 {
 {
@@ -704,6 +708,7 @@ int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
 /* chip.c */
 /* chip.c */
 void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor,
 void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor,
 		      u8 *ver_patch);
 		      u8 *ver_patch);
+int write_host_interface_version(struct hfi1_devdata *dd, u8 version);
 void read_guid(struct hfi1_devdata *dd);
 void read_guid(struct hfi1_devdata *dd);
 int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
 int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
 void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
 void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
@@ -744,6 +749,7 @@ int is_bx(struct hfi1_devdata *dd);
 u32 read_physical_state(struct hfi1_devdata *dd);
 u32 read_physical_state(struct hfi1_devdata *dd);
 u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
 u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
 u32 get_logical_state(struct hfi1_pportdata *ppd);
 u32 get_logical_state(struct hfi1_pportdata *ppd);
+void cache_physical_state(struct hfi1_pportdata *ppd);
 const char *opa_lstate_name(u32 lstate);
 const char *opa_lstate_name(u32 lstate);
 const char *opa_pstate_name(u32 pstate);
 const char *opa_pstate_name(u32 pstate);
 u32 driver_physical_state(struct hfi1_pportdata *ppd);
 u32 driver_physical_state(struct hfi1_pportdata *ppd);
@@ -1347,21 +1353,21 @@ enum {
 u64 get_all_cpu_total(u64 __percpu *cntr);
 u64 get_all_cpu_total(u64 __percpu *cntr);
 void hfi1_start_cleanup(struct hfi1_devdata *dd);
 void hfi1_start_cleanup(struct hfi1_devdata *dd);
 void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
 void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
-struct ib_header *hfi1_get_msgheader(
-				struct hfi1_devdata *dd, __le32 *rhf_addr);
 void hfi1_init_ctxt(struct send_context *sc);
 void hfi1_init_ctxt(struct send_context *sc);
 void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
 void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
 		  u32 type, unsigned long pa, u16 order);
 		  u32 type, unsigned long pa, u16 order);
 void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
 void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
-void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op,
+		  struct hfi1_ctxtdata *rcd);
 u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp);
 u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp);
 u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp);
 u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp);
-u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
 int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
 int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
 int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
 int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
-int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
-int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
-int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd,
+		       u16 jkey);
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt);
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt,
+		       u16 pkey);
 int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt);
 int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, struct hfi1_ctxtdata *ctxt);
 void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
 void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
 void hfi1_init_vnic_rsm(struct hfi1_devdata *dd);
 void hfi1_init_vnic_rsm(struct hfi1_devdata *dd);

+ 1 - 0
drivers/infiniband/hw/hfi1/common.h

@@ -325,6 +325,7 @@ struct diag_pkt {
 #define HFI1_LRH_BTH 0x0002      /* 1. word of IB LRH - next header: BTH */
 #define HFI1_LRH_BTH 0x0002      /* 1. word of IB LRH - next header: BTH */
 
 
 /* misc. */
 /* misc. */
+#define SC15_PACKET 0xF
 #define SIZE_OF_CRC 1
 #define SIZE_OF_CRC 1
 
 
 #define LIM_MGMT_P_KEY       0x7FFF
 #define LIM_MGMT_P_KEY       0x7FFF

+ 128 - 63
drivers/infiniband/hw/hfi1/driver.c

@@ -96,7 +96,6 @@ MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
 
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
 MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
-MODULE_VERSION(HFI1_DRIVER_VERSION);
 
 
 /*
 /*
  * MAX_PKT_RCV is the max # of packets processed per receive interrupt.
  * MAX_PKT_RCV is the max # of packets processed per receive interrupt.
@@ -196,7 +195,7 @@ int hfi1_count_active_units(void)
 
 
 	spin_lock_irqsave(&hfi1_devs_lock, flags);
 	spin_lock_irqsave(&hfi1_devs_lock, flags);
 	list_for_each_entry(dd, &hfi1_dev_list, list) {
 	list_for_each_entry(dd, &hfi1_dev_list, list) {
-		if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
+		if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1)
 			continue;
 			continue;
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 			ppd = dd->pport + pidx;
 			ppd = dd->pport + pidx;
@@ -224,6 +223,20 @@ static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
 			(offset * RCV_BUF_BLOCK_SIZE));
 			(offset * RCV_BUF_BLOCK_SIZE));
 }
 }
 
 
+static inline void *hfi1_get_header(struct hfi1_devdata *dd,
+				    __le32 *rhf_addr)
+{
+	u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
+
+	return (void *)(rhf_addr - dd->rhf_offset + offset);
+}
+
+static inline struct ib_header *hfi1_get_msgheader(struct hfi1_devdata *dd,
+						   __le32 *rhf_addr)
+{
+	return (struct ib_header *)hfi1_get_header(dd, rhf_addr);
+}
+
 /*
 /*
  * Validate and encode a given RcvArray Buffer size.
  * Validate and encode a given RcvArray Buffer size.
  * The function will check whether the given size falls within
  * The function will check whether the given size falls within
@@ -249,7 +262,8 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
 {
 {
 	struct ib_header *rhdr = packet->hdr;
 	struct ib_header *rhdr = packet->hdr;
 	u32 rte = rhf_rcv_type_err(packet->rhf);
 	u32 rte = rhf_rcv_type_err(packet->rhf);
-	int lnh = ib_get_lnh(rhdr);
+	u8 lnh = ib_get_lnh(rhdr);
+	bool has_grh = false;
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct hfi1_devdata *dd = ppd->dd;
 	struct hfi1_devdata *dd = ppd->dd;
 	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
 	struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
@@ -257,37 +271,42 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
 	if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
 	if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
 		return;
 		return;
 
 
+	if (lnh == HFI1_LRH_BTH) {
+		packet->ohdr = &rhdr->u.oth;
+	} else if (lnh == HFI1_LRH_GRH) {
+		has_grh = true;
+		packet->ohdr = &rhdr->u.l.oth;
+		packet->grh = &rhdr->u.l.grh;
+	} else {
+		goto drop;
+	}
+
 	if (packet->rhf & RHF_TID_ERR) {
 	if (packet->rhf & RHF_TID_ERR) {
 		/* For TIDERR and RC QPs preemptively schedule a NAK */
 		/* For TIDERR and RC QPs preemptively schedule a NAK */
-		struct ib_other_headers *ohdr = NULL;
 		u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
 		u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
-		u16 lid  = ib_get_dlid(rhdr);
+		u32 dlid = ib_get_dlid(rhdr);
 		u32 qp_num;
 		u32 qp_num;
-		u32 rcv_flags = 0;
+		u32 mlid_base = be16_to_cpu(IB_MULTICAST_LID_BASE);
 
 
 		/* Sanity check packet */
 		/* Sanity check packet */
 		if (tlen < 24)
 		if (tlen < 24)
 			goto drop;
 			goto drop;
 
 
 		/* Check for GRH */
 		/* Check for GRH */
-		if (lnh == HFI1_LRH_BTH) {
-			ohdr = &rhdr->u.oth;
-		} else if (lnh == HFI1_LRH_GRH) {
+		if (has_grh) {
 			u32 vtf;
 			u32 vtf;
+			struct ib_grh *grh = packet->grh;
 
 
-			ohdr = &rhdr->u.l.oth;
-			if (rhdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+			if (grh->next_hdr != IB_GRH_NEXT_HDR)
 				goto drop;
 				goto drop;
-			vtf = be32_to_cpu(rhdr->u.l.grh.version_tclass_flow);
+			vtf = be32_to_cpu(grh->version_tclass_flow);
 			if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
 			if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
 				goto drop;
 				goto drop;
-			rcv_flags |= HFI1_HAS_GRH;
-		} else {
-			goto drop;
 		}
 		}
+
 		/* Get the destination QP number. */
 		/* Get the destination QP number. */
-		qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-		if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+		qp_num = ib_bth_get_qpn(packet->ohdr);
+		if (dlid < mlid_base) {
 			struct rvt_qp *qp;
 			struct rvt_qp *qp;
 			unsigned long flags;
 			unsigned long flags;
 
 
@@ -312,11 +331,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
 
 
 			switch (qp->ibqp.qp_type) {
 			switch (qp->ibqp.qp_type) {
 			case IB_QPT_RC:
 			case IB_QPT_RC:
-				hfi1_rc_hdrerr(
-					rcd,
-					rhdr,
-					rcv_flags,
-					qp);
+				hfi1_rc_hdrerr(rcd, packet, qp);
 				break;
 				break;
 			default:
 			default:
 				/* For now don't handle any other QP types */
 				/* For now don't handle any other QP types */
@@ -332,9 +347,8 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
 	switch (rte) {
 	switch (rte) {
 	case RHF_RTE_ERROR_OP_CODE_ERR:
 	case RHF_RTE_ERROR_OP_CODE_ERR:
 	{
 	{
-		u32 opcode;
 		void *ebuf = NULL;
 		void *ebuf = NULL;
-		__be32 *bth = NULL;
+		u8 opcode;
 
 
 		if (rhf_use_egr_bfr(packet->rhf))
 		if (rhf_use_egr_bfr(packet->rhf))
 			ebuf = packet->ebuf;
 			ebuf = packet->ebuf;
@@ -342,16 +356,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
 		if (!ebuf)
 		if (!ebuf)
 			goto drop; /* this should never happen */
 			goto drop; /* this should never happen */
 
 
-		if (lnh == HFI1_LRH_BTH)
-			bth = (__be32 *)ebuf;
-		else if (lnh == HFI1_LRH_GRH)
-			bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
-		else
-			goto drop;
-
-		opcode = be32_to_cpu(bth[0]) >> 24;
-		opcode &= 0xff;
-
+		opcode = ib_bth_get_opcode(packet->ohdr);
 		if (opcode == IB_OPCODE_CNP) {
 		if (opcode == IB_OPCODE_CNP) {
 			/*
 			/*
 			 * Only in pre-B0 h/w is the CNP_OPCODE handled
 			 * Only in pre-B0 h/w is the CNP_OPCODE handled
@@ -365,7 +370,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
 			sc5 = hfi1_9B_get_sc5(rhdr, packet->rhf);
 			sc5 = hfi1_9B_get_sc5(rhdr, packet->rhf);
 			sl = ibp->sc_to_sl[sc5];
 			sl = ibp->sc_to_sl[sc5];
 
 
-			lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
+			lqpn = ib_bth_get_qpn(packet->ohdr);
 			rcu_read_lock();
 			rcu_read_lock();
 			qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
 			qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
 			if (!qp) {
 			if (!qp) {
@@ -415,7 +420,6 @@ static inline void init_packet(struct hfi1_ctxtdata *rcd,
 	packet->rhf = rhf_to_cpu(packet->rhf_addr);
 	packet->rhf = rhf_to_cpu(packet->rhf_addr);
 	packet->rhqoff = rcd->head;
 	packet->rhqoff = rcd->head;
 	packet->numpkt = 0;
 	packet->numpkt = 0;
-	packet->rcv_flags = 0;
 }
 }
 
 
 void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
 void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
@@ -424,21 +428,18 @@ void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
 	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
 	struct ib_header *hdr = pkt->hdr;
 	struct ib_header *hdr = pkt->hdr;
 	struct ib_other_headers *ohdr = pkt->ohdr;
 	struct ib_other_headers *ohdr = pkt->ohdr;
-	struct ib_grh *grh = NULL;
+	struct ib_grh *grh = pkt->grh;
 	u32 rqpn = 0, bth1;
 	u32 rqpn = 0, bth1;
 	u16 rlid, dlid = ib_get_dlid(hdr);
 	u16 rlid, dlid = ib_get_dlid(hdr);
 	u8 sc, svc_type;
 	u8 sc, svc_type;
 	bool is_mcast = false;
 	bool is_mcast = false;
 
 
-	if (pkt->rcv_flags & HFI1_HAS_GRH)
-		grh = &hdr->u.l.grh;
-
 	switch (qp->ibqp.qp_type) {
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_SMI:
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	case IB_QPT_GSI:
 	case IB_QPT_UD:
 	case IB_QPT_UD:
 		rlid = ib_get_slid(hdr);
 		rlid = ib_get_slid(hdr);
-		rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
+		rqpn = ib_get_sqpn(ohdr);
 		svc_type = IB_CC_SVCTYPE_UD;
 		svc_type = IB_CC_SVCTYPE_UD;
 		is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
 		is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
 			(dlid != be16_to_cpu(IB_LID_PERMISSIVE));
 			(dlid != be16_to_cpu(IB_LID_PERMISSIVE));
@@ -461,7 +462,7 @@ void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
 
 
 	bth1 = be32_to_cpu(ohdr->bth[1]);
 	bth1 = be32_to_cpu(ohdr->bth[1]);
 	if (do_cnp && (bth1 & IB_FECN_SMASK)) {
 	if (do_cnp && (bth1 & IB_FECN_SMASK)) {
-		u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+		u16 pkey = ib_bth_get_pkey(ohdr);
 
 
 		return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh);
 		return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh);
 	}
 	}
@@ -591,9 +592,10 @@ static void __prescan_rxq(struct hfi1_packet *packet)
 
 
 		if (lnh == HFI1_LRH_BTH) {
 		if (lnh == HFI1_LRH_BTH) {
 			packet->ohdr = &hdr->u.oth;
 			packet->ohdr = &hdr->u.oth;
+			packet->grh = NULL;
 		} else if (lnh == HFI1_LRH_GRH) {
 		} else if (lnh == HFI1_LRH_GRH) {
 			packet->ohdr = &hdr->u.l.oth;
 			packet->ohdr = &hdr->u.l.oth;
-			packet->rcv_flags |= HFI1_HAS_GRH;
+			packet->grh = &hdr->u.l.grh;
 		} else {
 		} else {
 			goto next; /* just in case */
 			goto next; /* just in case */
 		}
 		}
@@ -698,10 +700,8 @@ static inline int process_rcv_packet(struct hfi1_packet *packet, int thread)
 {
 {
 	int ret;
 	int ret;
 
 
-	packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
-					 packet->rhf_addr);
-	packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
 	packet->etype = rhf_rcv_type(packet->rhf);
 	packet->etype = rhf_rcv_type(packet->rhf);
+
 	/* total length */
 	/* total length */
 	packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
 	packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
 	/* retrieve eager buffer details */
 	/* retrieve eager buffer details */
@@ -759,7 +759,7 @@ static inline void process_rcv_update(int last, struct hfi1_packet *packet)
 			       packet->etail, 0, 0);
 			       packet->etail, 0, 0);
 		packet->updegr = 0;
 		packet->updegr = 0;
 	}
 	}
-	packet->rcv_flags = 0;
+	packet->grh = NULL;
 }
 }
 
 
 static inline void finish_packet(struct hfi1_packet *packet)
 static inline void finish_packet(struct hfi1_packet *packet)
@@ -837,9 +837,9 @@ bail:
 	return last;
 	return last;
 }
 }
 
 
-static inline void set_nodma_rtail(struct hfi1_devdata *dd, u8 ctxt)
+static inline void set_nodma_rtail(struct hfi1_devdata *dd, u16 ctxt)
 {
 {
-	int i;
+	u16 i;
 
 
 	/*
 	/*
 	 * For dynamically allocated kernel contexts (like vnic) switch
 	 * For dynamically allocated kernel contexts (like vnic) switch
@@ -857,9 +857,9 @@ static inline void set_nodma_rtail(struct hfi1_devdata *dd, u8 ctxt)
 			&handle_receive_interrupt_nodma_rtail;
 			&handle_receive_interrupt_nodma_rtail;
 }
 }
 
 
-static inline void set_dma_rtail(struct hfi1_devdata *dd, u8 ctxt)
+static inline void set_dma_rtail(struct hfi1_devdata *dd, u16 ctxt)
 {
 {
-	int i;
+	u16 i;
 
 
 	/*
 	/*
 	 * For dynamically allocated kernel contexts (like vnic) switch
 	 * For dynamically allocated kernel contexts (like vnic) switch
@@ -879,7 +879,7 @@ static inline void set_dma_rtail(struct hfi1_devdata *dd, u8 ctxt)
 
 
 void set_all_slowpath(struct hfi1_devdata *dd)
 void set_all_slowpath(struct hfi1_devdata *dd)
 {
 {
-	int i;
+	u16 i;
 
 
 	/* HFI1_CTRL_CTXT must always use the slow path interrupt handler */
 	/* HFI1_CTRL_CTXT must always use the slow path interrupt handler */
 	for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) {
 	for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) {
@@ -896,20 +896,25 @@ static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
 				      struct hfi1_devdata *dd)
 				      struct hfi1_devdata *dd)
 {
 {
 	struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
 	struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
-	struct ib_header *hdr = hfi1_get_msgheader(packet->rcd->dd,
-						   packet->rhf_addr);
 	u8 etype = rhf_rcv_type(packet->rhf);
 	u8 etype = rhf_rcv_type(packet->rhf);
+	u8 sc = SC15_PACKET;
 
 
-	if (etype == RHF_RCV_TYPE_IB &&
-	    hfi1_9B_get_sc5(hdr, packet->rhf) != 0xf) {
-		int hwstate = read_logical_state(dd);
+	if (etype == RHF_RCV_TYPE_IB) {
+		struct ib_header *hdr = hfi1_get_msgheader(packet->rcd->dd,
+							   packet->rhf_addr);
+		sc = hfi1_9B_get_sc5(hdr, packet->rhf);
+	}
+	if (sc != SC15_PACKET) {
+		int hwstate = driver_lstate(rcd->ppd);
 
 
-		if (hwstate != LSTATE_ACTIVE) {
-			dd_dev_info(dd, "Unexpected link state %d\n", hwstate);
+		if (hwstate != IB_PORT_ACTIVE) {
+			dd_dev_info(dd,
+				    "Unexpected link state %s\n",
+				    opa_lstate_name(hwstate));
 			return 0;
 			return 0;
 		}
 		}
 
 
-		queue_work(rcd->ppd->hfi1_wq, lsaw);
+		queue_work(rcd->ppd->link_wq, lsaw);
 		return 1;
 		return 1;
 	}
 	}
 	return 0;
 	return 0;
@@ -1063,7 +1068,7 @@ void receive_interrupt_work(struct work_struct *work)
 	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
 	struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
 						  linkstate_active_work);
 						  linkstate_active_work);
 	struct hfi1_devdata *dd = ppd->dd;
 	struct hfi1_devdata *dd = ppd->dd;
-	int i;
+	u16 i;
 
 
 	/* Received non-SC15 packet implies neighbor_normal */
 	/* Received non-SC15 packet implies neighbor_normal */
 	ppd->neighbor_normal = 1;
 	ppd->neighbor_normal = 1;
@@ -1264,7 +1269,8 @@ void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
  */
  */
 int hfi1_reset_device(int unit)
 int hfi1_reset_device(int unit)
 {
 {
-	int ret, i;
+	int ret;
+	u16 i;
 	struct hfi1_devdata *dd = hfi1_lookup(unit);
 	struct hfi1_devdata *dd = hfi1_lookup(unit);
 	struct hfi1_pportdata *ppd;
 	struct hfi1_pportdata *ppd;
 	unsigned long flags;
 	unsigned long flags;
@@ -1277,7 +1283,7 @@ int hfi1_reset_device(int unit)
 
 
 	dd_dev_info(dd, "Reset on unit %u requested\n", unit);
 	dd_dev_info(dd, "Reset on unit %u requested\n", unit);
 
 
-	if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
+	if (!dd->kregbase1 || !(dd->flags & HFI1_PRESENT)) {
 		dd_dev_info(dd,
 		dd_dev_info(dd,
 			    "Invalid unit number %u or not initialized or not present\n",
 			    "Invalid unit number %u or not initialized or not present\n",
 			    unit);
 			    unit);
@@ -1321,6 +1327,58 @@ bail:
 	return ret;
 	return ret;
 }
 }
 
 
+static inline void hfi1_setup_ib_header(struct hfi1_packet *packet)
+{
+	packet->hdr = (struct hfi1_ib_message_header *)
+			hfi1_get_msgheader(packet->rcd->dd,
+					   packet->rhf_addr);
+	packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
+}
+
+static int hfi1_setup_9B_packet(struct hfi1_packet *packet)
+{
+	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
+	struct ib_header *hdr;
+	u8 lnh;
+
+	hfi1_setup_ib_header(packet);
+	hdr = packet->hdr;
+
+	lnh = ib_get_lnh(hdr);
+	if (lnh == HFI1_LRH_BTH) {
+		packet->ohdr = &hdr->u.oth;
+		packet->grh = NULL;
+	} else if (lnh == HFI1_LRH_GRH) {
+		u32 vtf;
+
+		packet->ohdr = &hdr->u.l.oth;
+		packet->grh = &hdr->u.l.grh;
+		if (packet->grh->next_hdr != IB_GRH_NEXT_HDR)
+			goto drop;
+		vtf = be32_to_cpu(packet->grh->version_tclass_flow);
+		if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+			goto drop;
+	} else {
+		goto drop;
+	}
+
+	/* Query commonly used fields from packet header */
+	packet->opcode = ib_bth_get_opcode(packet->ohdr);
+	packet->slid = ib_get_slid(hdr);
+	packet->dlid = ib_get_dlid(hdr);
+	packet->sl = ib_get_sl(hdr);
+	packet->sc = hfi1_9B_get_sc5(hdr, packet->rhf);
+	packet->pad = ib_bth_get_pad(packet->ohdr);
+	packet->extra_byte = 0;
+	packet->fecn = ib_bth_get_fecn(packet->ohdr);
+	packet->becn = ib_bth_get_becn(packet->ohdr);
+
+	return 0;
+drop:
+	ibp->rvp.n_pkt_drops++;
+	return -EINVAL;
+}
+
 void handle_eflags(struct hfi1_packet *packet)
 void handle_eflags(struct hfi1_packet *packet)
 {
 {
 	struct hfi1_ctxtdata *rcd = packet->rcd;
 	struct hfi1_ctxtdata *rcd = packet->rcd;
@@ -1351,6 +1409,9 @@ int process_receive_ib(struct hfi1_packet *packet)
 	if (unlikely(hfi1_dbg_fault_packet(packet)))
 	if (unlikely(hfi1_dbg_fault_packet(packet)))
 		return RHF_RCV_CONTINUE;
 		return RHF_RCV_CONTINUE;
 
 
+	if (hfi1_setup_9B_packet(packet))
+		return RHF_RCV_CONTINUE;
+
 	trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
 	trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
 			  packet->rcd->ctxt,
 			  packet->rcd->ctxt,
 			  rhf_err_flags(packet->rhf),
 			  rhf_err_flags(packet->rhf),
@@ -1422,6 +1483,7 @@ int process_receive_error(struct hfi1_packet *packet)
 		 rhf_rcv_type_err(packet->rhf) == 3))
 		 rhf_rcv_type_err(packet->rhf) == 3))
 		return RHF_RCV_CONTINUE;
 		return RHF_RCV_CONTINUE;
 
 
+	hfi1_setup_ib_header(packet);
 	handle_eflags(packet);
 	handle_eflags(packet);
 
 
 	if (unlikely(rhf_err_flags(packet->rhf)))
 	if (unlikely(rhf_err_flags(packet->rhf)))
@@ -1435,6 +1497,8 @@ int kdeth_process_expected(struct hfi1_packet *packet)
 {
 {
 	if (unlikely(hfi1_dbg_fault_packet(packet)))
 	if (unlikely(hfi1_dbg_fault_packet(packet)))
 		return RHF_RCV_CONTINUE;
 		return RHF_RCV_CONTINUE;
+
+	hfi1_setup_ib_header(packet);
 	if (unlikely(rhf_err_flags(packet->rhf)))
 	if (unlikely(rhf_err_flags(packet->rhf)))
 		handle_eflags(packet);
 		handle_eflags(packet);
 
 
@@ -1445,6 +1509,7 @@ int kdeth_process_expected(struct hfi1_packet *packet)
 
 
 int kdeth_process_eager(struct hfi1_packet *packet)
 int kdeth_process_eager(struct hfi1_packet *packet)
 {
 {
+	hfi1_setup_ib_header(packet);
 	if (unlikely(rhf_err_flags(packet->rhf)))
 	if (unlikely(rhf_err_flags(packet->rhf)))
 		handle_eflags(packet);
 		handle_eflags(packet);
 	if (unlikely(hfi1_dbg_fault_packet(packet)))
 	if (unlikely(hfi1_dbg_fault_packet(packet)))

+ 5 - 6
drivers/infiniband/hw/hfi1/eprom.c

@@ -250,7 +250,6 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data,
 {
 {
 	void *buffer;
 	void *buffer;
 	void *p;
 	void *p;
-	u32 length;
 	int ret;
 	int ret;
 
 
 	buffer = kmalloc(P1_SIZE, GFP_KERNEL);
 	buffer = kmalloc(P1_SIZE, GFP_KERNEL);
@@ -265,13 +264,13 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data,
 
 
 	/* scan for image magic that may trail the actual data */
 	/* scan for image magic that may trail the actual data */
 	p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE);
 	p = strnstr(buffer, IMAGE_TRAIL_MAGIC, P1_SIZE);
-	if (p)
-		length = p - buffer;
-	else
-		length = P1_SIZE;
+	if (!p) {
+		kfree(buffer);
+		return -ENOENT;
+	}
 
 
 	*data = buffer;
 	*data = buffer;
-	*size = length;
+	*size = p - buffer;
 	return 0;
 	return 0;
 }
 }
 
 

+ 114 - 0
drivers/infiniband/hw/hfi1/exp_rcv.c

@@ -0,0 +1,114 @@
+/*
+ * Copyright(c) 2017 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "exp_rcv.h"
+#include "trace.h"
+
+/**
+ * hfi1_exp_tid_group_init - initialize exp_tid_set
+ * @set: the set
+ */
+void hfi1_exp_tid_group_init(struct exp_tid_set *set)
+{
+	INIT_LIST_HEAD(&set->list);
+	set->count = 0;
+}
+
+/**
+ * hfi1_alloc_ctxt_rcv_groups - initialize expected receive groups
+ * @rcd: the context to add the groupings to
+ */
+int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_devdata *dd = rcd->dd;
+	u32 tidbase;
+	struct tid_group *grp;
+	int i;
+
+	tidbase = rcd->expected_base;
+	for (i = 0; i < rcd->expected_count /
+		     dd->rcv_entries.group_size; i++) {
+		grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+		if (!grp)
+			goto bail;
+		grp->size = dd->rcv_entries.group_size;
+		grp->base = tidbase;
+		tid_group_add_tail(grp, &rcd->tid_group_list);
+		tidbase += dd->rcv_entries.group_size;
+	}
+
+	return 0;
+bail:
+	hfi1_free_ctxt_rcv_groups(rcd);
+	return -ENOMEM;
+}
+
+/**
+ * hfi1_free_ctxt_rcv_groups - free expected receive groups
+ * @rcd: the context to free
+ *
+ * The routine dismantles the expected receive linked
+ * list and clears any tids associated with the receive
+ * context.
+ *
+ * This should only be called for kernel contexts and the
+ * base user context.
+ */
+void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
+{
+	struct tid_group *grp, *gptr;
+
+	WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list));
+	WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list));
+
+	list_for_each_entry_safe(grp, gptr, &rcd->tid_group_list.list, list) {
+		tid_group_remove(grp, &rcd->tid_group_list);
+		kfree(grp);
+	}
+
+	hfi1_clear_tids(rcd);
+}

+ 190 - 0
drivers/infiniband/hw/hfi1/exp_rcv.h

@@ -0,0 +1,190 @@
+#ifndef _HFI1_EXP_RCV_H
+#define _HFI1_EXP_RCV_H
+/*
+ * Copyright(c) 2017 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
+
+#define EXP_TID_TIDLEN_MASK   0x7FFULL
+#define EXP_TID_TIDLEN_SHIFT  0
+#define EXP_TID_TIDCTRL_MASK  0x3ULL
+#define EXP_TID_TIDCTRL_SHIFT 20
+#define EXP_TID_TIDIDX_MASK   0x3FFULL
+#define EXP_TID_TIDIDX_SHIFT  22
+#define EXP_TID_GET(tid, field)	\
+	(((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+
+#define EXP_TID_SET(field, value)			\
+	(((value) & EXP_TID_TID##field##_MASK) <<	\
+	 EXP_TID_TID##field##_SHIFT)
+#define EXP_TID_CLEAR(tid, field) ({					\
+		(tid) &= ~(EXP_TID_TID##field##_MASK <<			\
+			   EXP_TID_TID##field##_SHIFT);			\
+		})
+#define EXP_TID_RESET(tid, field, value) do {				\
+		EXP_TID_CLEAR(tid, field);				\
+		(tid) |= EXP_TID_SET(field, (value));			\
+	} while (0)
+
+/*
+ * Define fields in the KDETH header so we can update the header
+ * template.
+ */
+#define KDETH_OFFSET_SHIFT        0
+#define KDETH_OFFSET_MASK         0x7fff
+#define KDETH_OM_SHIFT            15
+#define KDETH_OM_MASK             0x1
+#define KDETH_TID_SHIFT           16
+#define KDETH_TID_MASK            0x3ff
+#define KDETH_TIDCTRL_SHIFT       26
+#define KDETH_TIDCTRL_MASK        0x3
+#define KDETH_INTR_SHIFT          28
+#define KDETH_INTR_MASK           0x1
+#define KDETH_SH_SHIFT            29
+#define KDETH_SH_MASK             0x1
+#define KDETH_KVER_SHIFT          30
+#define KDETH_KVER_MASK           0x3
+#define KDETH_JKEY_SHIFT          0x0
+#define KDETH_JKEY_MASK           0xff
+#define KDETH_HCRC_UPPER_SHIFT    16
+#define KDETH_HCRC_UPPER_MASK     0xff
+#define KDETH_HCRC_LOWER_SHIFT    24
+#define KDETH_HCRC_LOWER_MASK     0xff
+
+#define KDETH_GET(val, field)						\
+	(((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
+#define KDETH_SET(dw, field, val) do {					\
+		u32 dwval = le32_to_cpu(dw);				\
+		dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
+		dwval |= (((val) & KDETH_##field##_MASK) << \
+			  KDETH_##field##_SHIFT);			\
+		dw = cpu_to_le32(dwval);				\
+	} while (0)
+
+#define KDETH_RESET(dw, field, val) ({ dw = 0; KDETH_SET(dw, field, val); })
+
+/* KDETH OM multipliers and switch over point */
+#define KDETH_OM_SMALL     4
+#define KDETH_OM_SMALL_SHIFT     2
+#define KDETH_OM_LARGE     64
+#define KDETH_OM_LARGE_SHIFT     6
+#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
+
+struct tid_group {
+	struct list_head list;
+	u32 base;
+	u8 size;
+	u8 used;
+	u8 map;
+};
+
+/*
+ * Write an "empty" RcvArray entry.
+ * This function exists so the TID registration code can use it
+ * to write to unused/unneeded entries and still take advantage
+ * of the WC performance improvements. The HFI will ignore this
+ * write to the RcvArray entry.
+ */
+static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
+{
+	/*
+	 * Doing the WC fill writes only makes sense if the device is
+	 * present and the RcvArray has been mapped as WC memory.
+	 */
+	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) {
+		writeq(0, dd->rcvarray_wc + (index * 8));
+		if ((index & 3) == 3)
+			flush_wc();
+	}
+}
+
+static inline void tid_group_add_tail(struct tid_group *grp,
+				      struct exp_tid_set *set)
+{
+	list_add_tail(&grp->list, &set->list);
+	set->count++;
+}
+
+static inline void tid_group_remove(struct tid_group *grp,
+				    struct exp_tid_set *set)
+{
+	list_del_init(&grp->list);
+	set->count--;
+}
+
+static inline void tid_group_move(struct tid_group *group,
+				  struct exp_tid_set *s1,
+				  struct exp_tid_set *s2)
+{
+	tid_group_remove(group, s1);
+	tid_group_add_tail(group, s2);
+}
+
+static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
+{
+	struct tid_group *grp =
+		list_first_entry(&set->list, struct tid_group, list);
+	list_del_init(&grp->list);
+	set->count--;
+	return grp;
+}
+
+static inline u32 rcventry2tidinfo(u32 rcventry)
+{
+	u32 pair = rcventry & ~0x1;
+
+	return EXP_TID_SET(IDX, pair >> 1) |
+		EXP_TID_SET(CTRL, 1 << (rcventry - pair));
+}
+
+int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd);
+void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd);
+void hfi1_exp_tid_group_init(struct exp_tid_set *set);
+
+#endif /* _HFI1_EXP_RCV_H */

+ 91 - 61
drivers/infiniband/hw/hfi1/file_ops.c

@@ -81,19 +81,23 @@ static u64 kvirt_to_phys(void *addr);
 static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo);
 static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo);
 static int init_subctxts(struct hfi1_ctxtdata *uctxt,
 static int init_subctxts(struct hfi1_ctxtdata *uctxt,
 			 const struct hfi1_user_info *uinfo);
 			 const struct hfi1_user_info *uinfo);
-static int init_user_ctxt(struct hfi1_filedata *fd);
+static int init_user_ctxt(struct hfi1_filedata *fd,
+			  struct hfi1_ctxtdata *uctxt);
 static void user_init(struct hfi1_ctxtdata *uctxt);
 static void user_init(struct hfi1_ctxtdata *uctxt);
 static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase,
 static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase,
 			 __u32 len);
 			 __u32 len);
 static int get_base_info(struct hfi1_filedata *fd, void __user *ubase,
 static int get_base_info(struct hfi1_filedata *fd, void __user *ubase,
 			 __u32 len);
 			 __u32 len);
-static int setup_base_ctxt(struct hfi1_filedata *fd);
+static int setup_base_ctxt(struct hfi1_filedata *fd,
+			   struct hfi1_ctxtdata *uctxt);
 static int setup_subctxt(struct hfi1_ctxtdata *uctxt);
 static int setup_subctxt(struct hfi1_ctxtdata *uctxt);
 
 
 static int find_sub_ctxt(struct hfi1_filedata *fd,
 static int find_sub_ctxt(struct hfi1_filedata *fd,
 			 const struct hfi1_user_info *uinfo);
 			 const struct hfi1_user_info *uinfo);
 static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
 static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
-			 struct hfi1_user_info *uinfo);
+			 struct hfi1_user_info *uinfo,
+			 struct hfi1_ctxtdata **cd);
+static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt);
 static unsigned int poll_urgent(struct file *fp, struct poll_table_struct *pt);
 static unsigned int poll_urgent(struct file *fp, struct poll_table_struct *pt);
 static unsigned int poll_next(struct file *fp, struct poll_table_struct *pt);
 static unsigned int poll_next(struct file *fp, struct poll_table_struct *pt);
 static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt,
 static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt,
@@ -181,7 +185,7 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
 					       struct hfi1_devdata,
 					       struct hfi1_devdata,
 					       user_cdev);
 					       user_cdev);
 
 
-	if (!((dd->flags & HFI1_PRESENT) && dd->kregbase))
+	if (!((dd->flags & HFI1_PRESENT) && dd->kregbase1))
 		return -EINVAL;
 		return -EINVAL;
 
 
 	if (!atomic_inc_not_zero(&dd->user_refcount))
 	if (!atomic_inc_not_zero(&dd->user_refcount))
@@ -267,12 +271,14 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
 			/*
 			/*
 			 * Copy the number of tidlist entries we used
 			 * Copy the number of tidlist entries we used
 			 * and the length of the buffer we registered.
 			 * and the length of the buffer we registered.
-			 * These fields are adjacent in the structure so
-			 * we can copy them at the same time.
 			 */
 			 */
 			addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
 			addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
 			if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
 			if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
-					 sizeof(tinfo.tidcnt) +
+					 sizeof(tinfo.tidcnt)))
+				return -EFAULT;
+
+			addr = arg + offsetof(struct hfi1_tid_info, length);
+			if (copy_to_user((void __user *)addr, &tinfo.length,
 					 sizeof(tinfo.length)))
 					 sizeof(tinfo.length)))
 				ret = -EFAULT;
 				ret = -EFAULT;
 		}
 		}
@@ -388,8 +394,7 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
 
 
 			sc_disable(sc);
 			sc_disable(sc);
 			ret = sc_enable(sc);
 			ret = sc_enable(sc);
-			hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
-				     uctxt->ctxt);
+			hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, uctxt);
 		} else {
 		} else {
 			ret = sc_restart(sc);
 			ret = sc_restart(sc);
 		}
 		}
@@ -757,7 +762,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 
 
 	flush_wc();
 	flush_wc();
 	/* drain user sdma queue */
 	/* drain user sdma queue */
-	hfi1_user_sdma_free_queues(fdata);
+	hfi1_user_sdma_free_queues(fdata, uctxt);
 
 
 	/* release the cpu */
 	/* release the cpu */
 	hfi1_put_proc_affinity(fdata->rec_cpu_num);
 	hfi1_put_proc_affinity(fdata->rec_cpu_num);
@@ -774,6 +779,8 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 	*ev = 0;
 	*ev = 0;
 
 
 	__clear_bit(fdata->subctxt, uctxt->in_use_ctxts);
 	__clear_bit(fdata->subctxt, uctxt->in_use_ctxts);
+	fdata->uctxt = NULL;
+	hfi1_rcd_put(uctxt); /* fdata reference */
 	if (!bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) {
 	if (!bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) {
 		mutex_unlock(&hfi1_mutex);
 		mutex_unlock(&hfi1_mutex);
 		goto done;
 		goto done;
@@ -790,34 +797,26 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 		     HFI1_RCVCTRL_TAILUPD_DIS |
 		     HFI1_RCVCTRL_TAILUPD_DIS |
 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
-		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
 	/* Clear the context's J_KEY */
 	/* Clear the context's J_KEY */
-	hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
+	hfi1_clear_ctxt_jkey(dd, uctxt);
 	/*
 	/*
-	 * Reset context integrity checks to default.
-	 * (writes to CSRs probably belong in chip.c)
+	 * If a send context is allocated, reset context integrity
+	 * checks to default and disable the send context.
 	 */
 	 */
-	write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
-			hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
-	sc_disable(uctxt->sc);
+	if (uctxt->sc) {
+		set_pio_integrity(uctxt->sc);
+		sc_disable(uctxt->sc);
+	}
 	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 
 
-	dd->rcd[uctxt->ctxt] = NULL;
-
-	hfi1_user_exp_rcv_grp_free(uctxt);
+	hfi1_free_ctxt_rcv_groups(uctxt);
 	hfi1_clear_ctxt_pkey(dd, uctxt);
 	hfi1_clear_ctxt_pkey(dd, uctxt);
 
 
-	uctxt->rcvwait_to = 0;
-	uctxt->piowait_to = 0;
-	uctxt->rcvnowait = 0;
-	uctxt->pionowait = 0;
 	uctxt->event_flags = 0;
 	uctxt->event_flags = 0;
-
-	hfi1_stats.sps_ctxts--;
-	if (++dd->freectxts == dd->num_user_contexts)
-		aspm_enable_all(dd);
 	mutex_unlock(&hfi1_mutex);
 	mutex_unlock(&hfi1_mutex);
-	hfi1_free_ctxtdata(dd, uctxt);
+
+	deallocate_ctxt(uctxt);
 done:
 done:
 	mmdrop(fdata->mm);
 	mmdrop(fdata->mm);
 	kobject_put(&dd->kobj);
 	kobject_put(&dd->kobj);
@@ -849,6 +848,7 @@ static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo)
 {
 {
 	int ret;
 	int ret;
 	unsigned int swmajor, swminor;
 	unsigned int swmajor, swminor;
+	struct hfi1_ctxtdata *uctxt = NULL;
 
 
 	swmajor = uinfo->userversion >> 16;
 	swmajor = uinfo->userversion >> 16;
 	if (swmajor != HFI1_USER_SWMAJOR)
 	if (swmajor != HFI1_USER_SWMAJOR)
@@ -874,7 +874,7 @@ static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo)
 	 * couldn't find a sub context.
 	 * couldn't find a sub context.
 	 */
 	 */
 	if (!ret)
 	if (!ret)
-		ret = allocate_ctxt(fd, fd->dd, uinfo);
+		ret = allocate_ctxt(fd, fd->dd, uinfo, &uctxt);
 
 
 	mutex_unlock(&hfi1_mutex);
 	mutex_unlock(&hfi1_mutex);
 
 
@@ -887,31 +887,38 @@ static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo)
 		ret = wait_event_interruptible(fd->uctxt->wait, !test_bit(
 		ret = wait_event_interruptible(fd->uctxt->wait, !test_bit(
 					       HFI1_CTXT_BASE_UNINIT,
 					       HFI1_CTXT_BASE_UNINIT,
 					       &fd->uctxt->event_flags));
 					       &fd->uctxt->event_flags));
-		if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags)) {
-			clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts);
-			return -ENOMEM;
-		}
+		if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags))
+			ret = -ENOMEM;
+
 		/* The only thing a sub context needs is the user_xxx stuff */
 		/* The only thing a sub context needs is the user_xxx stuff */
 		if (!ret)
 		if (!ret)
-			ret = init_user_ctxt(fd);
+			ret = init_user_ctxt(fd, fd->uctxt);
 
 
 		if (ret)
 		if (ret)
 			clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts);
 			clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts);
+
 	} else if (!ret) {
 	} else if (!ret) {
-		ret = setup_base_ctxt(fd);
-		if (fd->uctxt->subctxt_cnt) {
+		ret = setup_base_ctxt(fd, uctxt);
+		if (uctxt->subctxt_cnt) {
 			/* If there is an error, set the failed bit. */
 			/* If there is an error, set the failed bit. */
 			if (ret)
 			if (ret)
 				set_bit(HFI1_CTXT_BASE_FAILED,
 				set_bit(HFI1_CTXT_BASE_FAILED,
-					&fd->uctxt->event_flags);
+					&uctxt->event_flags);
 			/*
 			/*
 			 * Base context is done, notify anybody using a
 			 * Base context is done, notify anybody using a
 			 * sub-context that is waiting for this completion
 			 * sub-context that is waiting for this completion
 			 */
 			 */
-			clear_bit(HFI1_CTXT_BASE_UNINIT,
-				  &fd->uctxt->event_flags);
-			wake_up(&fd->uctxt->wait);
+			clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags);
+			wake_up(&uctxt->wait);
 		}
 		}
+		if (ret)
+			deallocate_ctxt(uctxt);
+	}
+
+	/* If an error occurred, clear the reference */
+	if (ret && fd->uctxt) {
+		hfi1_rcd_put(fd->uctxt);
+		fd->uctxt = NULL;
 	}
 	}
 
 
 	return ret;
 	return ret;
@@ -924,7 +931,7 @@ static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo)
 static int find_sub_ctxt(struct hfi1_filedata *fd,
 static int find_sub_ctxt(struct hfi1_filedata *fd,
 			 const struct hfi1_user_info *uinfo)
 			 const struct hfi1_user_info *uinfo)
 {
 {
-	int i;
+	u16 i;
 	struct hfi1_devdata *dd = fd->dd;
 	struct hfi1_devdata *dd = fd->dd;
 	u16 subctxt;
 	u16 subctxt;
 
 
@@ -961,6 +968,8 @@ static int find_sub_ctxt(struct hfi1_filedata *fd,
 
 
 		fd->uctxt = uctxt;
 		fd->uctxt = uctxt;
 		fd->subctxt = subctxt;
 		fd->subctxt = subctxt;
+
+		hfi1_rcd_get(uctxt);
 		__set_bit(fd->subctxt, uctxt->in_use_ctxts);
 		__set_bit(fd->subctxt, uctxt->in_use_ctxts);
 
 
 		return 1;
 		return 1;
@@ -970,10 +979,11 @@ static int find_sub_ctxt(struct hfi1_filedata *fd,
 }
 }
 
 
 static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
 static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
-			 struct hfi1_user_info *uinfo)
+			 struct hfi1_user_info *uinfo,
+			 struct hfi1_ctxtdata **cd)
 {
 {
 	struct hfi1_ctxtdata *uctxt;
 	struct hfi1_ctxtdata *uctxt;
-	unsigned int ctxt;
+	u16 ctxt;
 	int ret, numa;
 	int ret, numa;
 
 
 	if (dd->flags & HFI1_FROZEN) {
 	if (dd->flags & HFI1_FROZEN) {
@@ -1058,8 +1068,6 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
 	strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
 	strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
 	memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
 	memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
 	uctxt->jkey = generate_jkey(current_uid());
 	uctxt->jkey = generate_jkey(current_uid());
-	INIT_LIST_HEAD(&uctxt->sdma_queues);
-	spin_lock_init(&uctxt->sdma_qlock);
 	hfi1_stats.sps_ctxts++;
 	hfi1_stats.sps_ctxts++;
 	/*
 	/*
 	 * Disable ASPM when there are open user/PSM contexts to avoid
 	 * Disable ASPM when there are open user/PSM contexts to avoid
@@ -1067,16 +1075,31 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
 	 */
 	 */
 	if (dd->freectxts-- == dd->num_user_contexts)
 	if (dd->freectxts-- == dd->num_user_contexts)
 		aspm_disable_all(dd);
 		aspm_disable_all(dd);
-	fd->uctxt = uctxt;
+
+	*cd = uctxt;
 
 
 	return 0;
 	return 0;
 
 
 ctxdata_free:
 ctxdata_free:
+	*cd = NULL;
 	dd->rcd[ctxt] = NULL;
 	dd->rcd[ctxt] = NULL;
-	hfi1_free_ctxtdata(dd, uctxt);
+	hfi1_rcd_put(uctxt);
 	return ret;
 	return ret;
 }
 }
 
 
+static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt)
+{
+	mutex_lock(&hfi1_mutex);
+	hfi1_stats.sps_ctxts--;
+	if (++uctxt->dd->freectxts == uctxt->dd->num_user_contexts)
+		aspm_enable_all(uctxt->dd);
+
+	/* _rcd_put() should be done after releasing mutex */
+	uctxt->dd->rcd[uctxt->ctxt] = NULL;
+	mutex_unlock(&hfi1_mutex);
+	hfi1_rcd_put(uctxt);  /* dd reference */
+}
+
 static int init_subctxts(struct hfi1_ctxtdata *uctxt,
 static int init_subctxts(struct hfi1_ctxtdata *uctxt,
 			 const struct hfi1_user_info *uinfo)
 			 const struct hfi1_user_info *uinfo)
 {
 {
@@ -1153,7 +1176,7 @@ static void user_init(struct hfi1_ctxtdata *uctxt)
 		clear_rcvhdrtail(uctxt);
 		clear_rcvhdrtail(uctxt);
 
 
 	/* Setup J_KEY before enabling the context */
 	/* Setup J_KEY before enabling the context */
-	hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
+	hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey);
 
 
 	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
 	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
 	if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
 	if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
@@ -1179,7 +1202,7 @@ static void user_init(struct hfi1_ctxtdata *uctxt)
 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
 	else
 	else
 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
-	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt);
 }
 }
 
 
 static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase,
 static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase,
@@ -1223,23 +1246,25 @@ static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase,
 	return ret;
 	return ret;
 }
 }
 
 
-static int init_user_ctxt(struct hfi1_filedata *fd)
+static int init_user_ctxt(struct hfi1_filedata *fd,
+			  struct hfi1_ctxtdata *uctxt)
 {
 {
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	int ret;
 	int ret;
 
 
 	ret = hfi1_user_sdma_alloc_queues(uctxt, fd);
 	ret = hfi1_user_sdma_alloc_queues(uctxt, fd);
 	if (ret)
 	if (ret)
 		return ret;
 		return ret;
 
 
-	ret = hfi1_user_exp_rcv_init(fd);
+	ret = hfi1_user_exp_rcv_init(fd, uctxt);
+	if (ret)
+		hfi1_user_sdma_free_queues(fd, uctxt);
 
 
 	return ret;
 	return ret;
 }
 }
 
 
-static int setup_base_ctxt(struct hfi1_filedata *fd)
+static int setup_base_ctxt(struct hfi1_filedata *fd,
+			   struct hfi1_ctxtdata *uctxt)
 {
 {
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	struct hfi1_devdata *dd = uctxt->dd;
 	int ret = 0;
 	int ret = 0;
 
 
@@ -1260,19 +1285,24 @@ static int setup_base_ctxt(struct hfi1_filedata *fd)
 	if (ret)
 	if (ret)
 		goto setup_failed;
 		goto setup_failed;
 
 
-	ret = hfi1_user_exp_rcv_grp_init(fd);
+	ret = hfi1_alloc_ctxt_rcv_groups(uctxt);
 	if (ret)
 	if (ret)
 		goto setup_failed;
 		goto setup_failed;
 
 
-	ret = init_user_ctxt(fd);
+	ret = init_user_ctxt(fd, uctxt);
 	if (ret)
 	if (ret)
 		goto setup_failed;
 		goto setup_failed;
 
 
 	user_init(uctxt);
 	user_init(uctxt);
 
 
+	/* Now that the context is set up, the fd can get a reference. */
+	fd->uctxt = uctxt;
+	hfi1_rcd_get(uctxt);
+
 	return 0;
 	return 0;
 
 
 setup_failed:
 setup_failed:
+	/* Call _free_ctxtdata, not _rcd_put().  We still need the context. */
 	hfi1_free_ctxtdata(dd, uctxt);
 	hfi1_free_ctxtdata(dd, uctxt);
 	return ret;
 	return ret;
 }
 }
@@ -1390,7 +1420,7 @@ static unsigned int poll_next(struct file *fp,
 	spin_lock_irq(&dd->uctxt_lock);
 	spin_lock_irq(&dd->uctxt_lock);
 	if (hdrqempty(uctxt)) {
 	if (hdrqempty(uctxt)) {
 		set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
 		set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
-		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
+		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt);
 		pollflag = 0;
 		pollflag = 0;
 	} else {
 	} else {
 		pollflag = POLLIN | POLLRDNORM;
 		pollflag = POLLIN | POLLRDNORM;
@@ -1409,7 +1439,7 @@ int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
 {
 {
 	struct hfi1_ctxtdata *uctxt;
 	struct hfi1_ctxtdata *uctxt;
 	struct hfi1_devdata *dd = ppd->dd;
 	struct hfi1_devdata *dd = ppd->dd;
-	unsigned ctxt;
+	u16 ctxt;
 	int ret = 0;
 	int ret = 0;
 	unsigned long flags;
 	unsigned long flags;
 
 
@@ -1475,7 +1505,7 @@ static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt,
 	} else {
 	} else {
 		rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
 		rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
 	}
 	}
-	hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
+	hfi1_rcvctrl(dd, rcvctrl_op, uctxt);
 	/* always; new head should be equal to new tail; see above */
 	/* always; new head should be equal to new tail; see above */
 bail:
 bail:
 	return 0;
 	return 0;
@@ -1525,7 +1555,7 @@ static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, u16 subctxt, u16 pkey)
 		}
 		}
 
 
 	if (intable)
 	if (intable)
-		ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
+		ret = hfi1_set_ctxt_pkey(dd, uctxt, pkey);
 done:
 done:
 	return ret;
 	return ret;
 }
 }

+ 16 - 0
drivers/infiniband/hw/hfi1/firmware.c

@@ -69,6 +69,7 @@
 #define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw"
 #define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw"
 #define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw"
 #define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw"
 #define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw"
 #define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw"
+#define HOST_INTERFACE_VERSION 1
 
 
 static uint fw_8051_load = 1;
 static uint fw_8051_load = 1;
 static uint fw_fabric_serdes_load = 1;
 static uint fw_fabric_serdes_load = 1;
@@ -615,6 +616,14 @@ retry:
 		fw_fabric_serdes_name = ALT_FW_FABRIC_NAME;
 		fw_fabric_serdes_name = ALT_FW_FABRIC_NAME;
 		fw_sbus_name = ALT_FW_SBUS_NAME;
 		fw_sbus_name = ALT_FW_SBUS_NAME;
 		fw_pcie_serdes_name = ALT_FW_PCIE_NAME;
 		fw_pcie_serdes_name = ALT_FW_PCIE_NAME;
+
+		/*
+		 * Add a delay before obtaining and loading debug firmware.
+		 * Authorization will fail if the delay between firmware
+		 * authorization events is shorter than 50us. Sleep for at
+		 * least 100us to leave a safe margin.
+		 */
+		usleep_range(100, 120);
 	}
 	}
 
 
 	if (fw_sbus_load) {
 	if (fw_sbus_load) {
@@ -1079,6 +1088,13 @@ static int load_8051_firmware(struct hfi1_devdata *dd,
 	dd_dev_info(dd, "8051 firmware version %d.%d.%d\n",
 	dd_dev_info(dd, "8051 firmware version %d.%d.%d\n",
 		    (int)ver_major, (int)ver_minor, (int)ver_patch);
 		    (int)ver_major, (int)ver_minor, (int)ver_patch);
 	dd->dc8051_ver = dc8051_ver(ver_major, ver_minor, ver_patch);
 	dd->dc8051_ver = dc8051_ver(ver_major, ver_minor, ver_patch);
+	ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION);
+	if (ret != HCMD_SUCCESS) {
+		dd_dev_err(dd,
+			   "Failed to set host interface version, return 0x%x\n",
+			   ret);
+		return -EIO;
+	}
 
 
 	return 0;
 	return 0;
 }
 }

+ 61 - 95
drivers/infiniband/hw/hfi1/hfi.h

@@ -213,13 +213,11 @@ struct hfi1_ctxtdata {
 
 
 	/* dynamic receive available interrupt timeout */
 	/* dynamic receive available interrupt timeout */
 	u32 rcvavail_timeout;
 	u32 rcvavail_timeout;
-	/*
-	 * number of opens (including slave sub-contexts) on this instance
-	 * (ignoring forks, dup, etc. for now)
-	 */
-	int cnt;
+	/* Reference count the base context usage */
+	struct kref kref;
+
 	/* Device context index */
 	/* Device context index */
-	unsigned ctxt;
+	u16 ctxt;
 	/*
 	/*
 	 * non-zero if ctxt can be shared, and defines the maximum number of
 	 * non-zero if ctxt can be shared, and defines the maximum number of
 	 * sub-contexts for this device context.
 	 * sub-contexts for this device context.
@@ -245,24 +243,10 @@ struct hfi1_ctxtdata {
 
 
 	/* lock protecting all Expected TID data */
 	/* lock protecting all Expected TID data */
 	struct mutex exp_lock;
 	struct mutex exp_lock;
-	/* number of pio bufs for this ctxt (all procs, if shared) */
-	u32 piocnt;
-	/* first pio buffer for this ctxt */
-	u32 pio_base;
-	/* chip offset of PIO buffers for this ctxt */
-	u32 piobufs;
 	/* per-context configuration flags */
 	/* per-context configuration flags */
 	unsigned long flags;
 	unsigned long flags;
 	/* per-context event flags for fileops/intr communication */
 	/* per-context event flags for fileops/intr communication */
 	unsigned long event_flags;
 	unsigned long event_flags;
-	/* WAIT_RCV that timed out, no interrupt */
-	u32 rcvwait_to;
-	/* WAIT_PIO that timed out, no interrupt */
-	u32 piowait_to;
-	/* WAIT_RCV already happened, no wait */
-	u32 rcvnowait;
-	/* WAIT_PIO already happened, no wait */
-	u32 pionowait;
 	/* total number of polled urgent packets */
 	/* total number of polled urgent packets */
 	u32 urgent;
 	u32 urgent;
 	/* saved total number of polled urgent packets for poll edge trigger */
 	/* saved total number of polled urgent packets for poll edge trigger */
@@ -292,7 +276,6 @@ struct hfi1_ctxtdata {
 	u8 redirect_seq_cnt;
 	u8 redirect_seq_cnt;
 	/* ctxt rcvhdrq head offset */
 	/* ctxt rcvhdrq head offset */
 	u32 head;
 	u32 head;
-	u32 pkt_count;
 	/* QPs waiting for context processing */
 	/* QPs waiting for context processing */
 	struct list_head qp_wait_list;
 	struct list_head qp_wait_list;
 	/* interrupt handling */
 	/* interrupt handling */
@@ -301,15 +284,6 @@ struct hfi1_ctxtdata {
 	unsigned numa_id; /* numa node of this context */
 	unsigned numa_id; /* numa node of this context */
 	/* verbs stats per CTX */
 	/* verbs stats per CTX */
 	struct hfi1_opcode_stats_perctx *opstats;
 	struct hfi1_opcode_stats_perctx *opstats;
-	/*
-	 * This is the kernel thread that will keep making
-	 * progress on the user sdma requests behind the scenes.
-	 * There is one per context (shared contexts use the master's).
-	 */
-	struct task_struct *progress;
-	struct list_head sdma_queues;
-	/* protect sdma queues */
-	spinlock_t sdma_qlock;
 
 
 	/* Is ASPM interrupt supported for this context */
 	/* Is ASPM interrupt supported for this context */
 	bool aspm_intr_supported;
 	bool aspm_intr_supported;
@@ -356,17 +330,26 @@ struct hfi1_packet {
 	__le32 *rhf_addr;
 	__le32 *rhf_addr;
 	struct rvt_qp *qp;
 	struct rvt_qp *qp;
 	struct ib_other_headers *ohdr;
 	struct ib_other_headers *ohdr;
+	struct ib_grh *grh;
 	u64 rhf;
 	u64 rhf;
 	u32 maxcnt;
 	u32 maxcnt;
 	u32 rhqoff;
 	u32 rhqoff;
+	u32 dlid;
+	u32 slid;
 	u16 tlen;
 	u16 tlen;
 	s16 etail;
 	s16 etail;
 	u8 hlen;
 	u8 hlen;
 	u8 numpkt;
 	u8 numpkt;
 	u8 rsize;
 	u8 rsize;
 	u8 updegr;
 	u8 updegr;
-	u8 rcv_flags;
 	u8 etype;
 	u8 etype;
+	u8 extra_byte;
+	u8 pad;
+	u8 sc;
+	u8 sl;
+	u8 opcode;
+	bool becn;
+	bool fecn;
 };
 };
 
 
 struct rvt_sge_state;
 struct rvt_sge_state;
@@ -512,7 +495,7 @@ static inline void incr_cntr32(u32 *cntr)
 #define MAX_NAME_SIZE 64
 #define MAX_NAME_SIZE 64
 struct hfi1_msix_entry {
 struct hfi1_msix_entry {
 	enum irq_type type;
 	enum irq_type type;
-	struct msix_entry msix;
+	int irq;
 	void *arg;
 	void *arg;
 	char name[MAX_NAME_SIZE];
 	char name[MAX_NAME_SIZE];
 	cpumask_t mask;
 	cpumask_t mask;
@@ -593,6 +576,7 @@ struct hfi1_pportdata {
 	/* SendDMA related entries */
 	/* SendDMA related entries */
 
 
 	struct workqueue_struct *hfi1_wq;
 	struct workqueue_struct *hfi1_wq;
+	struct workqueue_struct *link_wq;
 
 
 	/* move out of interrupt context */
 	/* move out of interrupt context */
 	struct work_struct link_vc_work;
 	struct work_struct link_vc_work;
@@ -654,12 +638,13 @@ struct hfi1_pportdata {
 	u8 link_enabled;	/* link enabled? */
 	u8 link_enabled;	/* link enabled? */
 	u8 linkinit_reason;
 	u8 linkinit_reason;
 	u8 local_tx_rate;	/* rate given to 8051 firmware */
 	u8 local_tx_rate;	/* rate given to 8051 firmware */
-	u8 last_pstate;		/* info only */
+	u8 pstate;		/* info only */
 	u8 qsfp_retry_count;
 	u8 qsfp_retry_count;
 
 
 	/* placeholders for IB MAD packet settings */
 	/* placeholders for IB MAD packet settings */
 	u8 overrun_threshold;
 	u8 overrun_threshold;
 	u8 phy_error_threshold;
 	u8 phy_error_threshold;
+	unsigned int is_link_down_queued;
 
 
 	/* Used to override LED behavior for things like maintenance beaconing*/
 	/* Used to override LED behavior for things like maintenance beaconing*/
 	/*
 	/*
@@ -860,12 +845,15 @@ struct hfi1_devdata {
 	struct device *diag_device;
 	struct device *diag_device;
 	struct device *ui_device;
 	struct device *ui_device;
 
 
-	/* mem-mapped pointer to base of chip regs */
-	u8 __iomem *kregbase;
-	/* end of mem-mapped chip space excluding sendbuf and user regs */
-	u8 __iomem *kregend;
-	/* physical address of chip for io_remap, etc. */
+	/* first mapping up to RcvArray */
+	u8 __iomem *kregbase1;
 	resource_size_t physaddr;
 	resource_size_t physaddr;
+
+	/* second uncached mapping from RcvArray to pio send buffers */
+	u8 __iomem *kregbase2;
+	/* for detecting offset above kregbase2 address */
+	u32 base2_start;
+
 	/* Per VL data. Enough for all VLs but not all elements are set/used. */
 	/* Per VL data. Enough for all VLs but not all elements are set/used. */
 	struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
 	struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
 	/* send context data */
 	/* send context data */
@@ -1229,9 +1217,10 @@ static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare)
 #define dc8051_ver_patch(a) ((a) & 0x0000ff)
 #define dc8051_ver_patch(a) ((a) & 0x0000ff)
 
 
 /* f_put_tid types */
 /* f_put_tid types */
-#define PT_EXPECTED 0
-#define PT_EAGER    1
-#define PT_INVALID  2
+#define PT_EXPECTED       0
+#define PT_EAGER          1
+#define PT_INVALID_FLUSH  2
+#define PT_INVALID        3
 
 
 struct tid_rb_node;
 struct tid_rb_node;
 struct mmu_rb_node;
 struct mmu_rb_node;
@@ -1277,12 +1266,13 @@ void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
 int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
 int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
 int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd);
 int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd);
 int hfi1_create_ctxts(struct hfi1_devdata *dd);
 int hfi1_create_ctxts(struct hfi1_devdata *dd);
-struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u16 ctxt,
 					   int numa);
 					   int numa);
 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
 			 struct hfi1_devdata *dd, u8 hw_pidx, u8 port);
 			 struct hfi1_devdata *dd, u8 hw_pidx, u8 port);
 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
-
+int hfi1_rcd_put(struct hfi1_ctxtdata *rcd);
+void hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
 int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread);
@@ -1321,6 +1311,22 @@ static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
 		return ppd->lstate;
 		return ppd->lstate;
 }
 }
 
 
+/* return the driver's idea of the physical OPA port state */
+static inline u32 driver_pstate(struct hfi1_pportdata *ppd)
+{
+	/*
+	 * The driver does some processing from the time the physical
+	 * link state is at LINKUP to the time the SM can be notified
+	 * as such. Return IB_PORTPHYSSTATE_TRAINING until the software
+	 * state is ready.
+	 */
+	if (ppd->pstate == PLS_LINKUP &&
+	    !(ppd->host_link_state & HLS_UP))
+		return IB_PORTPHYSSTATE_TRAINING;
+	else
+		return chip_to_opa_pstate(ppd->dd, ppd->pstate);
+}
+
 void receive_interrupt_work(struct work_struct *work);
 void receive_interrupt_work(struct work_struct *work);
 
 
 /* extract service channel from header and rhf */
 /* extract service channel from header and rhf */
@@ -1829,10 +1835,9 @@ void hfi1_pcie_cleanup(struct pci_dev *pdev);
 int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev);
 int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev);
 void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
 void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
 int pcie_speeds(struct hfi1_devdata *dd);
 int pcie_speeds(struct hfi1_devdata *dd);
-void request_msix(struct hfi1_devdata *dd, u32 *nent,
-		  struct hfi1_msix_entry *entry);
-void hfi1_enable_intx(struct pci_dev *pdev);
-void restore_pci_variables(struct hfi1_devdata *dd);
+int request_msix(struct hfi1_devdata *dd, u32 msireq);
+int restore_pci_variables(struct hfi1_devdata *dd);
+int save_pci_variables(struct hfi1_devdata *dd);
 int do_pcie_gen3_transition(struct hfi1_devdata *dd);
 int do_pcie_gen3_transition(struct hfi1_devdata *dd);
 int parse_platform_config(struct hfi1_devdata *dd);
 int parse_platform_config(struct hfi1_devdata *dd);
 int get_platform_config_field(struct hfi1_devdata *dd,
 int get_platform_config_field(struct hfi1_devdata *dd,
@@ -2087,52 +2092,13 @@ int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
 #define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
 #define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
 #define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
 #define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
 
 
-#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
-#define show_packettype(etype)                  \
-__print_symbolic(etype,                         \
-	packettype_name(EXPECTED),              \
-	packettype_name(EAGER),                 \
-	packettype_name(IB),                    \
-	packettype_name(ERROR),                 \
-	packettype_name(BYPASS))
-
-#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
-#define show_ib_opcode(opcode)                             \
-__print_symbolic(opcode,                                   \
-	ib_opcode_name(RC_SEND_FIRST),                     \
-	ib_opcode_name(RC_SEND_MIDDLE),                    \
-	ib_opcode_name(RC_SEND_LAST),                      \
-	ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
-	ib_opcode_name(RC_SEND_ONLY),                      \
-	ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
-	ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
-	ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
-	ib_opcode_name(RC_RDMA_WRITE_LAST),                \
-	ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-	ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
-	ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-	ib_opcode_name(RC_RDMA_READ_REQUEST),              \
-	ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
-	ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
-	ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
-	ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
-	ib_opcode_name(RC_ACKNOWLEDGE),                    \
-	ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
-	ib_opcode_name(RC_COMPARE_SWAP),                   \
-	ib_opcode_name(RC_FETCH_ADD),                      \
-	ib_opcode_name(UC_SEND_FIRST),                     \
-	ib_opcode_name(UC_SEND_MIDDLE),                    \
-	ib_opcode_name(UC_SEND_LAST),                      \
-	ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
-	ib_opcode_name(UC_SEND_ONLY),                      \
-	ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
-	ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
-	ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
-	ib_opcode_name(UC_RDMA_WRITE_LAST),                \
-	ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-	ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
-	ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-	ib_opcode_name(UD_SEND_ONLY),                      \
-	ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
-	ib_opcode_name(CNP))
+/*
+ * hfi1_check_mcast - Check if the given lid is
+ * in the IB multicast range.
+ */
+static inline bool hfi1_check_mcast(u16 lid)
+{
+	return ((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+		(lid != be16_to_cpu(IB_LID_PERMISSIVE)));
+}
 #endif                          /* _HFI1_KERNEL_H */
 #endif                          /* _HFI1_KERNEL_H */

+ 90 - 21
drivers/infiniband/hw/hfi1/init.c

@@ -67,6 +67,7 @@
 #include "aspm.h"
 #include "aspm.h"
 #include "affinity.h"
 #include "affinity.h"
 #include "vnic.h"
 #include "vnic.h"
+#include "exp_rcv.h"
 
 
 #undef pr_fmt
 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@ -130,7 +131,7 @@ unsigned long *hfi1_cpulist;
  */
  */
 int hfi1_create_ctxts(struct hfi1_devdata *dd)
 int hfi1_create_ctxts(struct hfi1_devdata *dd)
 {
 {
-	unsigned i;
+	u16 i;
 	int ret;
 	int ret;
 
 
 	/* Control context has to be always 0 */
 	/* Control context has to be always 0 */
@@ -190,19 +191,49 @@ int hfi1_create_ctxts(struct hfi1_devdata *dd)
 nomem:
 nomem:
 	ret = -ENOMEM;
 	ret = -ENOMEM;
 
 
-	if (dd->rcd) {
-		for (i = 0; i < dd->num_rcv_contexts; ++i)
-			hfi1_free_ctxtdata(dd, dd->rcd[i]);
-	}
+	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i)
+		hfi1_rcd_put(dd->rcd[i]);
+
+	/* All the contexts should be freed, free the array */
 	kfree(dd->rcd);
 	kfree(dd->rcd);
 	dd->rcd = NULL;
 	dd->rcd = NULL;
 	return ret;
 	return ret;
 }
 }
 
 
+/*
+ * Helper routines for the receive context reference count (rcd and uctxt)
+ */
+static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd)
+{
+	kref_init(&rcd->kref);
+}
+
+static void hfi1_rcd_free(struct kref *kref)
+{
+	struct hfi1_ctxtdata *rcd =
+		container_of(kref, struct hfi1_ctxtdata, kref);
+
+	hfi1_free_ctxtdata(rcd->dd, rcd);
+	kfree(rcd);
+}
+
+int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
+{
+	if (rcd)
+		return kref_put(&rcd->kref, hfi1_rcd_free);
+
+	return 0;
+}
+
+void hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
+{
+	kref_get(&rcd->kref);
+}
+
 /*
 /*
  * Common code for user and kernel context setup.
  * Common code for user and kernel context setup.
  */
  */
-struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u16 ctxt,
 					   int numa)
 					   int numa)
 {
 {
 	struct hfi1_devdata *dd = ppd->dd;
 	struct hfi1_devdata *dd = ppd->dd;
@@ -221,6 +252,9 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
 		hfi1_cdbg(PROC, "setting up context %u\n", ctxt);
 		hfi1_cdbg(PROC, "setting up context %u\n", ctxt);
 
 
 		INIT_LIST_HEAD(&rcd->qp_wait_list);
 		INIT_LIST_HEAD(&rcd->qp_wait_list);
+		hfi1_exp_tid_group_init(&rcd->tid_group_list);
+		hfi1_exp_tid_group_init(&rcd->tid_used_list);
+		hfi1_exp_tid_group_init(&rcd->tid_full_list);
 		rcd->ppd = ppd;
 		rcd->ppd = ppd;
 		rcd->dd = dd;
 		rcd->dd = dd;
 		__set_bit(0, rcd->in_use_ctxts);
 		__set_bit(0, rcd->in_use_ctxts);
@@ -328,6 +362,8 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
 			if (!rcd->opstats)
 			if (!rcd->opstats)
 				goto bail;
 				goto bail;
 		}
 		}
+
+		hfi1_rcd_init(rcd);
 	}
 	}
 	return rcd;
 	return rcd;
 bail:
 bail:
@@ -567,8 +603,8 @@ static int init_after_reset(struct hfi1_devdata *dd)
 	 */
 	 */
 	for (i = 0; i < dd->num_rcv_contexts; i++)
 	for (i = 0; i < dd->num_rcv_contexts; i++)
 		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
 		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
-				  HFI1_RCVCTRL_INTRAVAIL_DIS |
-				  HFI1_RCVCTRL_TAILUPD_DIS, i);
+			     HFI1_RCVCTRL_INTRAVAIL_DIS |
+			     HFI1_RCVCTRL_TAILUPD_DIS, dd->rcd[i]);
 	pio_send_control(dd, PSC_GLOBAL_DISABLE);
 	pio_send_control(dd, PSC_GLOBAL_DISABLE);
 	for (i = 0; i < dd->num_send_contexts; i++)
 	for (i = 0; i < dd->num_send_contexts; i++)
 		sc_disable(dd->send_contexts[i].sc);
 		sc_disable(dd->send_contexts[i].sc);
@@ -579,7 +615,7 @@ static int init_after_reset(struct hfi1_devdata *dd)
 static void enable_chip(struct hfi1_devdata *dd)
 static void enable_chip(struct hfi1_devdata *dd)
 {
 {
 	u32 rcvmask;
 	u32 rcvmask;
-	u32 i;
+	u16 i;
 
 
 	/* enable PIO send */
 	/* enable PIO send */
 	pio_send_control(dd, PSC_GLOBAL_ENABLE);
 	pio_send_control(dd, PSC_GLOBAL_ENABLE);
@@ -598,7 +634,7 @@ static void enable_chip(struct hfi1_devdata *dd)
 			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
 			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
 		if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
 		if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
 			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
 			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
-		hfi1_rcvctrl(dd, rcvmask, i);
+		hfi1_rcvctrl(dd, rcvmask, dd->rcd[i]);
 		sc_enable(dd->rcd[i]->sc);
 		sc_enable(dd->rcd[i]->sc);
 	}
 	}
 }
 }
@@ -624,6 +660,20 @@ static int create_workqueues(struct hfi1_devdata *dd)
 			if (!ppd->hfi1_wq)
 			if (!ppd->hfi1_wq)
 				goto wq_error;
 				goto wq_error;
 		}
 		}
+		if (!ppd->link_wq) {
+			/*
+			 * Make the link workqueue single-threaded to enforce
+			 * serialization.
+			 */
+			ppd->link_wq =
+				alloc_workqueue(
+				    "hfi_link_%d_%d",
+				    WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND,
+				    1, /* max_active */
+				    dd->unit, pidx);
+			if (!ppd->link_wq)
+				goto wq_error;
+		}
 	}
 	}
 	return 0;
 	return 0;
 wq_error:
 wq_error:
@@ -634,6 +684,10 @@ wq_error:
 			destroy_workqueue(ppd->hfi1_wq);
 			destroy_workqueue(ppd->hfi1_wq);
 			ppd->hfi1_wq = NULL;
 			ppd->hfi1_wq = NULL;
 		}
 		}
+		if (ppd->link_wq) {
+			destroy_workqueue(ppd->link_wq);
+			ppd->link_wq = NULL;
+		}
 	}
 	}
 	return -ENOMEM;
 	return -ENOMEM;
 }
 }
@@ -656,7 +710,8 @@ wq_error:
 int hfi1_init(struct hfi1_devdata *dd, int reinit)
 int hfi1_init(struct hfi1_devdata *dd, int reinit)
 {
 {
 	int ret = 0, pidx, lastfail = 0;
 	int ret = 0, pidx, lastfail = 0;
-	unsigned i, len;
+	unsigned long len;
+	u16 i;
 	struct hfi1_ctxtdata *rcd;
 	struct hfi1_ctxtdata *rcd;
 	struct hfi1_pportdata *ppd;
 	struct hfi1_pportdata *ppd;
 
 
@@ -878,10 +933,10 @@ static void shutdown_device(struct hfi1_devdata *dd)
 		ppd = dd->pport + pidx;
 		ppd = dd->pport + pidx;
 		for (i = 0; i < dd->num_rcv_contexts; i++)
 		for (i = 0; i < dd->num_rcv_contexts; i++)
 			hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
 			hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
-					  HFI1_RCVCTRL_CTXT_DIS |
-					  HFI1_RCVCTRL_INTRAVAIL_DIS |
-					  HFI1_RCVCTRL_PKEY_DIS |
-					  HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
+				     HFI1_RCVCTRL_CTXT_DIS |
+				     HFI1_RCVCTRL_INTRAVAIL_DIS |
+				     HFI1_RCVCTRL_PKEY_DIS |
+				     HFI1_RCVCTRL_ONE_PKT_EGR_DIS, dd->rcd[i]);
 		/*
 		/*
 		 * Gracefully stop all sends allowing any in progress to
 		 * Gracefully stop all sends allowing any in progress to
 		 * trickle out first.
 		 * trickle out first.
@@ -917,6 +972,10 @@ static void shutdown_device(struct hfi1_devdata *dd)
 			destroy_workqueue(ppd->hfi1_wq);
 			destroy_workqueue(ppd->hfi1_wq);
 			ppd->hfi1_wq = NULL;
 			ppd->hfi1_wq = NULL;
 		}
 		}
+		if (ppd->link_wq) {
+			destroy_workqueue(ppd->link_wq);
+			ppd->link_wq = NULL;
+		}
 	}
 	}
 	sdma_exit(dd);
 	sdma_exit(dd);
 }
 }
@@ -927,14 +986,11 @@ static void shutdown_device(struct hfi1_devdata *dd)
  * @rcd: the ctxtdata structure
  * @rcd: the ctxtdata structure
  *
  *
  * free up any allocated data for a context
  * free up any allocated data for a context
- * This should not touch anything that would affect a simultaneous
- * re-allocation of context data, because it is called after hfi1_mutex
- * is released (and can be called from reinit as well).
  * It should never change any chip state, or global driver state.
  * It should never change any chip state, or global driver state.
  */
  */
 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 {
 {
-	unsigned e;
+	u32 e;
 
 
 	if (!rcd)
 	if (!rcd)
 		return;
 		return;
@@ -953,6 +1009,7 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 
 
 	/* all the RcvArray entries should have been cleared by now */
 	/* all the RcvArray entries should have been cleared by now */
 	kfree(rcd->egrbufs.rcvtids);
 	kfree(rcd->egrbufs.rcvtids);
+	rcd->egrbufs.rcvtids = NULL;
 
 
 	for (e = 0; e < rcd->egrbufs.alloced; e++) {
 	for (e = 0; e < rcd->egrbufs.alloced; e++) {
 		if (rcd->egrbufs.buffers[e].dma)
 		if (rcd->egrbufs.buffers[e].dma)
@@ -962,13 +1019,21 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 					  rcd->egrbufs.buffers[e].dma);
 					  rcd->egrbufs.buffers[e].dma);
 	}
 	}
 	kfree(rcd->egrbufs.buffers);
 	kfree(rcd->egrbufs.buffers);
+	rcd->egrbufs.alloced = 0;
+	rcd->egrbufs.buffers = NULL;
 
 
 	sc_free(rcd->sc);
 	sc_free(rcd->sc);
+	rcd->sc = NULL;
+
 	vfree(rcd->subctxt_uregbase);
 	vfree(rcd->subctxt_uregbase);
 	vfree(rcd->subctxt_rcvegrbuf);
 	vfree(rcd->subctxt_rcvegrbuf);
 	vfree(rcd->subctxt_rcvhdr_base);
 	vfree(rcd->subctxt_rcvhdr_base);
 	kfree(rcd->opstats);
 	kfree(rcd->opstats);
-	kfree(rcd);
+
+	rcd->subctxt_uregbase = NULL;
+	rcd->subctxt_rcvegrbuf = NULL;
+	rcd->subctxt_rcvhdr_base = NULL;
+	rcd->opstats = NULL;
 }
 }
 
 
 /*
 /*
@@ -1362,7 +1427,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
 		tmp[ctxt] = NULL; /* debugging paranoia */
 		tmp[ctxt] = NULL; /* debugging paranoia */
 		if (rcd) {
 		if (rcd) {
 			hfi1_clear_tids(rcd);
 			hfi1_clear_tids(rcd);
-			hfi1_free_ctxtdata(dd, rcd);
+			hfi1_rcd_put(rcd);
 		}
 		}
 	}
 	}
 	kfree(tmp);
 	kfree(tmp);
@@ -1532,6 +1597,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 				destroy_workqueue(ppd->hfi1_wq);
 				destroy_workqueue(ppd->hfi1_wq);
 				ppd->hfi1_wq = NULL;
 				ppd->hfi1_wq = NULL;
 			}
 			}
+			if (ppd->link_wq) {
+				destroy_workqueue(ppd->link_wq);
+				ppd->link_wq = NULL;
+			}
 		}
 		}
 		if (!j)
 		if (!j)
 			hfi1_device_remove(dd);
 			hfi1_device_remove(dd);

+ 2 - 1
drivers/infiniband/hw/hfi1/intr.c

@@ -164,6 +164,7 @@ void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
 		ppd->linkup = 0;
 		ppd->linkup = 0;
 
 
 		/* clear HW details of the previous connection */
 		/* clear HW details of the previous connection */
+		ppd->actual_vls_operational = 0;
 		reset_link_credits(dd);
 		reset_link_credits(dd);
 
 
 		/* freeze after a link down to guarantee a clean egress */
 		/* freeze after a link down to guarantee a clean egress */
@@ -196,7 +197,7 @@ void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
 
 
 	if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
 	if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
 		wake_up_interruptible(&rcd->wait);
 		wake_up_interruptible(&rcd->wait);
-		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
+		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd);
 	} else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
 	} else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
 							&rcd->event_flags)) {
 							&rcd->event_flags)) {
 		rcd->urgent++;
 		rcd->urgent++;

+ 68 - 2
drivers/infiniband/hw/hfi1/iowait.h

@@ -106,7 +106,9 @@ struct iowait {
 		struct sdma_engine *sde,
 		struct sdma_engine *sde,
 		struct iowait *wait,
 		struct iowait *wait,
 		struct sdma_txreq *tx,
 		struct sdma_txreq *tx,
-		unsigned seq);
+		uint seq,
+		bool pkts_sent
+		);
 	void (*wakeup)(struct iowait *wait, int reason);
 	void (*wakeup)(struct iowait *wait, int reason);
 	void (*sdma_drained)(struct iowait *wait);
 	void (*sdma_drained)(struct iowait *wait);
 	seqlock_t *lock;
 	seqlock_t *lock;
@@ -118,6 +120,7 @@ struct iowait {
 	u32 count;
 	u32 count;
 	u32 tx_limit;
 	u32 tx_limit;
 	u32 tx_count;
 	u32 tx_count;
+	u8 starved_cnt;
 };
 };
 
 
 #define SDMA_AVAIL_REASON 0
 #define SDMA_AVAIL_REASON 0
@@ -143,7 +146,8 @@ static inline void iowait_init(
 		struct sdma_engine *sde,
 		struct sdma_engine *sde,
 		struct iowait *wait,
 		struct iowait *wait,
 		struct sdma_txreq *tx,
 		struct sdma_txreq *tx,
-		unsigned seq),
+		uint seq,
+		bool pkts_sent),
 	void (*wakeup)(struct iowait *wait, int reason),
 	void (*wakeup)(struct iowait *wait, int reason),
 	void (*sdma_drained)(struct iowait *wait))
 	void (*sdma_drained)(struct iowait *wait))
 {
 {
@@ -305,4 +309,66 @@ static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
 	return tx;
 	return tx;
 }
 }
 
 
+/**
+ * iowait_queue - Put the iowait on a wait queue
+ * @pkts_sent: have some packets been sent before queuing?
+ * @w: the iowait struct
+ * @wait_head: the wait queue
+ *
+ * This function is called to insert an iowait struct into a
+ * wait queue after a resource (e.g., sdma descriptor or pio
+ * buffer) has run out.
+ */
+static inline void iowait_queue(bool pkts_sent, struct iowait *w,
+				struct list_head *wait_head)
+{
+	/*
+	 * To play fair, insert the iowait at the tail of the wait queue if it
+	 * has already sent some packets; otherwise, put it at the head.
+	 */
+	if (pkts_sent) {
+		list_add_tail(&w->list, wait_head);
+		w->starved_cnt = 0;
+	} else {
+		list_add(&w->list, wait_head);
+		w->starved_cnt++;
+	}
+}
+
+/**
+ * iowait_starve_clear - clear the wait queue's starve count
+ * @pkts_sent: have some packets been sent?
+ * @w: the iowait struct
+ *
+ * This function is called to clear the starve count. If no
+ * packets have been sent, the starve count will not be cleared.
+ */
+static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w)
+{
+	if (pkts_sent)
+		w->starved_cnt = 0;
+}
+
+/**
+ * iowait_starve_find_max - Find the maximum of the starve count
+ * @w: the iowait struct
+ * @max: a variable containing the max starve count
+ * @idx: the index of the current iowait in an array
+ * @max_idx: a variable containing the array index for the
+ *         iowait entry that has the max starve count
+ *
+ * This function is called to compare the starve count of a
+ * given iowait with the given max starve count. The max starve
+ * count and the index will be updated if the iowait's starve
+ * count is larger.
+ */
+static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
+					  uint idx, uint *max_idx)
+{
+	if (w->starved_cnt > *max) {
+		*max = w->starved_cnt;
+		*max_idx = idx;
+	}
+}
+
 #endif
 #endif

+ 478 - 212
drivers/infiniband/hw/hfi1/mad.c

@@ -59,6 +59,24 @@
 #define OPA_LINK_WIDTH_RESET_OLD 0x0fff
 #define OPA_LINK_WIDTH_RESET_OLD 0x0fff
 #define OPA_LINK_WIDTH_RESET 0xffff
 #define OPA_LINK_WIDTH_RESET 0xffff
 
 
+struct trap_node {
+	struct list_head list;
+	struct opa_mad_notice_attr data;
+	__be64 tid;
+	int len;
+	u32 retry;
+	u8 in_use;
+	u8 repress;
+};
+
+static int smp_length_check(u32 data_size, u32 request_len)
+{
+	if (unlikely(request_len < data_size))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int reply(struct ib_mad_hdr *smp)
 static int reply(struct ib_mad_hdr *smp)
 {
 {
 	/*
 	/*
@@ -89,28 +107,156 @@ void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port)
 	ib_dispatch_event(&event);
 	ib_dispatch_event(&event);
 }
 }
 
 
-static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
+/*
+ * If the port is down, clean up all pending traps.  We need to be careful
+ * with the given trap, because it may be queued.
+ */
+static void cleanup_traps(struct hfi1_ibport *ibp, struct trap_node *trap)
+{
+	struct trap_node *node, *q;
+	unsigned long flags;
+	struct list_head trap_list;
+	int i;
+
+	for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) {
+		spin_lock_irqsave(&ibp->rvp.lock, flags);
+		list_replace_init(&ibp->rvp.trap_lists[i].list, &trap_list);
+		ibp->rvp.trap_lists[i].list_len = 0;
+		spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+
+		/*
+		 * Remove all items from the list, freeing all the non-given
+		 * traps.
+		 */
+		list_for_each_entry_safe(node, q, &trap_list, list) {
+			list_del(&node->list);
+			if (node != trap)
+				kfree(node);
+		}
+	}
+
+	/*
+	 * If this wasn't on one of the lists it would not be freed.  If it
+	 * was on the list, it is now safe to free.
+	 */
+	kfree(trap);
+}
+
+static struct trap_node *check_and_add_trap(struct hfi1_ibport *ibp,
+					    struct trap_node *trap)
+{
+	struct trap_node *node;
+	struct trap_list *trap_list;
+	unsigned long flags;
+	unsigned long timeout;
+	int found = 0;
+
+	/*
+	 * Since the retry (handle timeout) does not remove a trap request
+	 * from the list, all we have to do is compare the node.
+	 */
+	spin_lock_irqsave(&ibp->rvp.lock, flags);
+	trap_list = &ibp->rvp.trap_lists[trap->data.generic_type & 0x0F];
+
+	list_for_each_entry(node, &trap_list->list, list) {
+		if (node == trap) {
+			node->retry++;
+			found = 1;
+			break;
+		}
+	}
+
+	/* If it is not on the list, add it, limited to RVT_MAX_TRAP_LEN. */
+	if (!found) {
+		if (trap_list->list_len < RVT_MAX_TRAP_LEN) {
+			trap_list->list_len++;
+			list_add_tail(&trap->list, &trap_list->list);
+		} else {
+			pr_warn_ratelimited("hfi1: Maximim trap limit reached for 0x%0x traps\n",
+					    trap->data.generic_type);
+			kfree(trap);
+		}
+	}
+
+	/*
+	 * Next check to see if there is a timer pending.  If not, set it up
+	 * and get the first trap from the list.
+	 */
+	node = NULL;
+	if (!timer_pending(&ibp->rvp.trap_timer)) {
+		/*
+		 * o14-2
+		 * If the time out is set we have to wait until it expires
+		 * before the trap can be sent.
+		 * This should be > RVT_TRAP_TIMEOUT
+		 */
+		timeout = (RVT_TRAP_TIMEOUT *
+			   (1UL << ibp->rvp.subnet_timeout)) / 1000;
+		mod_timer(&ibp->rvp.trap_timer,
+			  jiffies + usecs_to_jiffies(timeout));
+		node = list_first_entry(&trap_list->list, struct trap_node,
+					list);
+		node->in_use = 1;
+	}
+	spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+
+	return node;
+}
+
+static void subn_handle_opa_trap_repress(struct hfi1_ibport *ibp,
+					 struct opa_smp *smp)
+{
+	struct trap_list *trap_list;
+	struct trap_node *trap;
+	unsigned long flags;
+	int i;
+
+	if (smp->attr_id != IB_SMP_ATTR_NOTICE)
+		return;
+
+	spin_lock_irqsave(&ibp->rvp.lock, flags);
+	for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) {
+		trap_list = &ibp->rvp.trap_lists[i];
+		trap = list_first_entry_or_null(&trap_list->list,
+						struct trap_node, list);
+		if (trap && trap->tid == smp->tid) {
+			if (trap->in_use) {
+				trap->repress = 1;
+			} else {
+				trap_list->list_len--;
+				list_del(&trap->list);
+				kfree(trap);
+			}
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+}
+
+static void send_trap(struct hfi1_ibport *ibp, struct trap_node *trap)
 {
 {
 	struct ib_mad_send_buf *send_buf;
 	struct ib_mad_send_buf *send_buf;
 	struct ib_mad_agent *agent;
 	struct ib_mad_agent *agent;
 	struct opa_smp *smp;
 	struct opa_smp *smp;
-	int ret;
 	unsigned long flags;
 	unsigned long flags;
-	unsigned long timeout;
 	int pkey_idx;
 	int pkey_idx;
 	u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
 	u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
 
 
 	agent = ibp->rvp.send_agent;
 	agent = ibp->rvp.send_agent;
-	if (!agent)
+	if (!agent) {
+		cleanup_traps(ibp, trap);
 		return;
 		return;
+	}
 
 
 	/* o14-3.2.1 */
 	/* o14-3.2.1 */
-	if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
+	if (driver_lstate(ppd_from_ibp(ibp)) != IB_PORT_ACTIVE) {
+		cleanup_traps(ibp, trap);
 		return;
 		return;
+	}
 
 
-	/* o14-2 */
-	if (ibp->rvp.trap_timeout && time_before(jiffies,
-						 ibp->rvp.trap_timeout))
+	/* Add the trap to the list if necessary and see if we can send it */
+	trap = check_and_add_trap(ibp, trap);
+	if (!trap)
 		return;
 		return;
 
 
 	pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
 	pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
@@ -131,11 +277,21 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
 	smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
 	smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
 	smp->class_version = OPA_SM_CLASS_VERSION;
 	smp->class_version = OPA_SM_CLASS_VERSION;
 	smp->method = IB_MGMT_METHOD_TRAP;
 	smp->method = IB_MGMT_METHOD_TRAP;
-	ibp->rvp.tid++;
-	smp->tid = cpu_to_be64(ibp->rvp.tid);
+
+	/* Only update the transaction ID for new traps (o13-5). */
+	if (trap->tid == 0) {
+		ibp->rvp.tid++;
+		/* make sure that tid != 0 */
+		if (ibp->rvp.tid == 0)
+			ibp->rvp.tid++;
+		trap->tid = cpu_to_be64(ibp->rvp.tid);
+	}
+	smp->tid = trap->tid;
+
 	smp->attr_id = IB_SMP_ATTR_NOTICE;
 	smp->attr_id = IB_SMP_ATTR_NOTICE;
 	/* o14-1: smp->mkey = 0; */
 	/* o14-1: smp->mkey = 0; */
-	memcpy(smp->route.lid.data, data, len);
+
+	memcpy(smp->route.lid.data, &trap->data, trap->len);
 
 
 	spin_lock_irqsave(&ibp->rvp.lock, flags);
 	spin_lock_irqsave(&ibp->rvp.lock, flags);
 	if (!ibp->rvp.sm_ah) {
 	if (!ibp->rvp.sm_ah) {
@@ -144,65 +300,103 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
 
 
 			ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid);
 			ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid);
 			if (IS_ERR(ah)) {
 			if (IS_ERR(ah)) {
-				ret = PTR_ERR(ah);
-			} else {
-				send_buf->ah = ah;
-				ibp->rvp.sm_ah = ibah_to_rvtah(ah);
-				ret = 0;
+				spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+				return;
 			}
 			}
+			send_buf->ah = ah;
+			ibp->rvp.sm_ah = ibah_to_rvtah(ah);
 		} else {
 		} else {
-			ret = -EINVAL;
+			spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+			return;
 		}
 		}
 	} else {
 	} else {
 		send_buf->ah = &ibp->rvp.sm_ah->ibah;
 		send_buf->ah = &ibp->rvp.sm_ah->ibah;
-		ret = 0;
 	}
 	}
+
+	/*
+	 * If the trap was repressed while things were getting set up, don't
+	 * bother sending it. This could happen for a retry.
+	 */
+	if (trap->repress) {
+		list_del(&trap->list);
+		spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+		kfree(trap);
+		ib_free_send_mad(send_buf);
+		return;
+	}
+
+	trap->in_use = 0;
 	spin_unlock_irqrestore(&ibp->rvp.lock, flags);
 	spin_unlock_irqrestore(&ibp->rvp.lock, flags);
 
 
-	if (!ret)
-		ret = ib_post_send_mad(send_buf, NULL);
-	if (!ret) {
-		/* 4.096 usec. */
-		timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000;
-		ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout);
-	} else {
+	if (ib_post_send_mad(send_buf, NULL))
 		ib_free_send_mad(send_buf);
 		ib_free_send_mad(send_buf);
-		ibp->rvp.trap_timeout = 0;
+}
+
+void hfi1_handle_trap_timer(unsigned long data)
+{
+	struct hfi1_ibport *ibp = (struct hfi1_ibport *)data;
+	struct trap_node *trap = NULL;
+	unsigned long flags;
+	int i;
+
+	/* Find the trap with the highest priority */
+	spin_lock_irqsave(&ibp->rvp.lock, flags);
+	for (i = 0; !trap && i < RVT_MAX_TRAP_LISTS; i++) {
+		trap = list_first_entry_or_null(&ibp->rvp.trap_lists[i].list,
+						struct trap_node, list);
 	}
 	}
+	spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+
+	if (trap)
+		send_trap(ibp, trap);
+}
+
+static struct trap_node *create_trap_node(u8 type, __be16 trap_num, u32 lid)
+{
+	struct trap_node *trap;
+
+	trap = kzalloc(sizeof(*trap), GFP_ATOMIC);
+	if (!trap)
+		return NULL;
+
+	INIT_LIST_HEAD(&trap->list);
+	trap->data.generic_type = type;
+	trap->data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	trap->data.trap_num = trap_num;
+	trap->data.issuer_lid = cpu_to_be32(lid);
+
+	return trap;
 }
 }
 
 
 /*
 /*
- * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ * Send a bad P_Key trap (ch. 14.3.8).
  */
  */
-void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
-		    u32 qp1, u32 qp2, u16 lid1, u16 lid2)
+void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl,
+		   u32 qp1, u32 qp2, u16 lid1, u16 lid2)
 {
 {
-	struct opa_mad_notice_attr data;
+	struct trap_node *trap;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 	u32 _lid1 = lid1;
 	u32 _lid1 = lid1;
 	u32 _lid2 = lid2;
 	u32 _lid2 = lid2;
 
 
-	memset(&data, 0, sizeof(data));
-
-	if (trap_num == OPA_TRAP_BAD_P_KEY)
-		ibp->rvp.pkey_violations++;
-	else
-		ibp->rvp.qkey_violations++;
 	ibp->rvp.n_pkt_drops++;
 	ibp->rvp.n_pkt_drops++;
+	ibp->rvp.pkey_violations++;
+
+	trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_P_KEY,
+				lid);
+	if (!trap)
+		return;
 
 
 	/* Send violation trap */
 	/* Send violation trap */
-	data.generic_type = IB_NOTICE_TYPE_SECURITY;
-	data.prod_type_lsb = IB_NOTICE_PROD_CA;
-	data.trap_num = trap_num;
-	data.issuer_lid = cpu_to_be32(lid);
-	data.ntc_257_258.lid1 = cpu_to_be32(_lid1);
-	data.ntc_257_258.lid2 = cpu_to_be32(_lid2);
-	data.ntc_257_258.key = cpu_to_be32(key);
-	data.ntc_257_258.sl = sl << 3;
-	data.ntc_257_258.qp1 = cpu_to_be32(qp1);
-	data.ntc_257_258.qp2 = cpu_to_be32(qp2);
-
-	send_trap(ibp, &data, sizeof(data));
+	trap->data.ntc_257_258.lid1 = cpu_to_be32(_lid1);
+	trap->data.ntc_257_258.lid2 = cpu_to_be32(_lid2);
+	trap->data.ntc_257_258.key = cpu_to_be32(key);
+	trap->data.ntc_257_258.sl = sl << 3;
+	trap->data.ntc_257_258.qp1 = cpu_to_be32(qp1);
+	trap->data.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+	trap->len = sizeof(trap->data);
+	send_trap(ibp, trap);
 }
 }
 
 
 /*
 /*
@@ -211,34 +405,36 @@ void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
 static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
 static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
 		     __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
 		     __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
 {
 {
-	struct opa_mad_notice_attr data;
+	struct trap_node *trap;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 
 
-	memset(&data, 0, sizeof(data));
+	trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_M_KEY,
+				lid);
+	if (!trap)
+		return;
+
 	/* Send violation trap */
 	/* Send violation trap */
-	data.generic_type = IB_NOTICE_TYPE_SECURITY;
-	data.prod_type_lsb = IB_NOTICE_PROD_CA;
-	data.trap_num = OPA_TRAP_BAD_M_KEY;
-	data.issuer_lid = cpu_to_be32(lid);
-	data.ntc_256.lid = data.issuer_lid;
-	data.ntc_256.method = mad->method;
-	data.ntc_256.attr_id = mad->attr_id;
-	data.ntc_256.attr_mod = mad->attr_mod;
-	data.ntc_256.mkey = mkey;
+	trap->data.ntc_256.lid = trap->data.issuer_lid;
+	trap->data.ntc_256.method = mad->method;
+	trap->data.ntc_256.attr_id = mad->attr_id;
+	trap->data.ntc_256.attr_mod = mad->attr_mod;
+	trap->data.ntc_256.mkey = mkey;
 	if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
 	if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-		data.ntc_256.dr_slid = dr_slid;
-		data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
-		if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) {
-			data.ntc_256.dr_trunc_hop |=
+		trap->data.ntc_256.dr_slid = dr_slid;
+		trap->data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
+		if (hop_cnt > ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path)) {
+			trap->data.ntc_256.dr_trunc_hop |=
 				IB_NOTICE_TRAP_DR_TRUNC;
 				IB_NOTICE_TRAP_DR_TRUNC;
-			hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path);
+			hop_cnt = ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path);
 		}
 		}
-		data.ntc_256.dr_trunc_hop |= hop_cnt;
-		memcpy(data.ntc_256.dr_rtn_path, return_path,
+		trap->data.ntc_256.dr_trunc_hop |= hop_cnt;
+		memcpy(trap->data.ntc_256.dr_rtn_path, return_path,
 		       hop_cnt);
 		       hop_cnt);
 	}
 	}
 
 
-	send_trap(ibp, &data, sizeof(data));
+	trap->len = sizeof(trap->data);
+
+	send_trap(ibp, trap);
 }
 }
 
 
 /*
 /*
@@ -246,22 +442,24 @@ static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
  */
  */
 void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
 void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
 {
 {
-	struct opa_mad_notice_attr data;
+	struct trap_node *trap;
 	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
 	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
 	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
 	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
 	struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data;
 	struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 
 
-	memset(&data, 0, sizeof(data));
+	trap = create_trap_node(IB_NOTICE_TYPE_INFO,
+				OPA_TRAP_CHANGE_CAPABILITY,
+				lid);
+	if (!trap)
+		return;
 
 
-	data.generic_type = IB_NOTICE_TYPE_INFO;
-	data.prod_type_lsb = IB_NOTICE_PROD_CA;
-	data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
-	data.issuer_lid = cpu_to_be32(lid);
-	data.ntc_144.lid = data.issuer_lid;
-	data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+	trap->data.ntc_144.lid = trap->data.issuer_lid;
+	trap->data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+	trap->data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags);
 
 
-	send_trap(ibp, &data, sizeof(data));
+	trap->len = sizeof(trap->data);
+	send_trap(ibp, trap);
 }
 }
 
 
 /*
 /*
@@ -269,19 +467,19 @@ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
  */
  */
 void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
 void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
 {
 {
-	struct opa_mad_notice_attr data;
+	struct trap_node *trap;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 
 
-	memset(&data, 0, sizeof(data));
+	trap = create_trap_node(IB_NOTICE_TYPE_INFO, OPA_TRAP_CHANGE_SYSGUID,
+				lid);
+	if (!trap)
+		return;
 
 
-	data.generic_type = IB_NOTICE_TYPE_INFO;
-	data.prod_type_lsb = IB_NOTICE_PROD_CA;
-	data.trap_num = OPA_TRAP_CHANGE_SYSGUID;
-	data.issuer_lid = cpu_to_be32(lid);
-	data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
-	data.ntc_145.lid = data.issuer_lid;
+	trap->data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
+	trap->data.ntc_145.lid = trap->data.issuer_lid;
 
 
-	send_trap(ibp, &data, sizeof(data));
+	trap->len = sizeof(trap->data);
+	send_trap(ibp, trap);
 }
 }
 
 
 /*
 /*
@@ -289,29 +487,30 @@ void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
  */
  */
 void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
 void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
 {
 {
-	struct opa_mad_notice_attr data;
+	struct trap_node *trap;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 	u32 lid = ppd_from_ibp(ibp)->lid;
 
 
-	memset(&data, 0, sizeof(data));
+	trap = create_trap_node(IB_NOTICE_TYPE_INFO,
+				OPA_TRAP_CHANGE_CAPABILITY,
+				lid);
+	if (!trap)
+		return;
 
 
-	data.generic_type = IB_NOTICE_TYPE_INFO;
-	data.prod_type_lsb = IB_NOTICE_PROD_CA;
-	data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
-	data.issuer_lid = cpu_to_be32(lid);
-	data.ntc_144.lid = data.issuer_lid;
-	data.ntc_144.change_flags =
+	trap->data.ntc_144.lid = trap->data.issuer_lid;
+	trap->data.ntc_144.change_flags =
 		cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG);
 		cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG);
 
 
-	send_trap(ibp, &data, sizeof(data));
+	trap->len = sizeof(trap->data);
+	send_trap(ibp, trap);
 }
 }
 
 
 static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
 static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
 				   u8 *data, struct ib_device *ibdev,
 				   u8 *data, struct ib_device *ibdev,
-				   u8 port, u32 *resp_len)
+				   u8 port, u32 *resp_len, u32 max_len)
 {
 {
 	struct opa_node_description *nd;
 	struct opa_node_description *nd;
 
 
-	if (am) {
+	if (am || smp_length_check(sizeof(*nd), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -328,7 +527,7 @@ static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
 
 
 static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct opa_node_info *ni;
 	struct opa_node_info *ni;
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
@@ -338,6 +537,7 @@ static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
 
 
 	/* GUID 0 is illegal */
 	/* GUID 0 is illegal */
 	if (am || pidx >= dd->num_pports || ibdev->node_guid == 0 ||
 	if (am || pidx >= dd->num_pports || ibdev->node_guid == 0 ||
+	    smp_length_check(sizeof(*ni), max_len) ||
 	    get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) {
 	    get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
@@ -519,7 +719,7 @@ void read_ltp_rtt(struct hfi1_devdata *dd)
 
 
 static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	int i;
 	int i;
 	struct hfi1_devdata *dd;
 	struct hfi1_devdata *dd;
@@ -535,7 +735,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 	u32 buffer_units;
 	u32 buffer_units;
 	u64 tmp = 0;
 	u64 tmp = 0;
 
 
-	if (num_ports != 1) {
+	if (num_ports != 1 || smp_length_check(sizeof(*pi), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -605,7 +805,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 		ppd->offline_disabled_reason;
 		ppd->offline_disabled_reason;
 
 
 	pi->port_states.portphysstate_portstate =
 	pi->port_states.portphysstate_portstate =
-		(hfi1_ibphys_portstate(ppd) << 4) | state;
+		(driver_pstate(ppd) << 4) | state;
 
 
 	pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc;
 	pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc;
 
 
@@ -704,11 +904,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 	buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
 	buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
 	pi->buffer_units = cpu_to_be32(buffer_units);
 	pi->buffer_units = cpu_to_be32(buffer_units);
 
 
-	pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported |
-				       OPA_CAP_MASK3_IsEthOnFabricSupported);
-	/* Driver does not support mcast/collective configuration */
-	pi->opa_cap_mask &=
-		cpu_to_be16(~OPA_CAP_MASK3_IsAddrRangeConfigSupported);
+	pi->opa_cap_mask = cpu_to_be16(ibp->rvp.port_cap3_flags);
 	pi->collectivemask_multicastmask = ((HFI1_COLLECTIVE_NR & 0x7)
 	pi->collectivemask_multicastmask = ((HFI1_COLLECTIVE_NR & 0x7)
 					    << 3 | (HFI1_MCAST_NR & 0x7));
 					    << 3 | (HFI1_MCAST_NR & 0x7));
 
 
@@ -748,7 +944,7 @@ static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
 
 
 static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
 				    struct ib_device *ibdev, u8 port,
 				    struct ib_device *ibdev, u8 port,
-				    u32 *resp_len)
+				    u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	u32 n_blocks_req = OPA_AM_NBLK(am);
 	u32 n_blocks_req = OPA_AM_NBLK(am);
@@ -771,6 +967,11 @@ static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
 
 
 	size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
 	size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
 
 
+	if (smp_length_check(size, max_len)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
 	if (start_block + n_blocks_req > n_blocks_avail ||
 	if (start_block + n_blocks_req > n_blocks_avail ||
 	    n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
 	    n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
 		pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; "
 		pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; "
@@ -1074,7 +1275,7 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
  */
  */
 static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct opa_port_info *pi = (struct opa_port_info *)data;
 	struct opa_port_info *pi = (struct opa_port_info *)data;
 	struct ib_event event;
 	struct ib_event event;
@@ -1095,7 +1296,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 	int ret, i, invalid = 0, call_set_mtu = 0;
 	int ret, i, invalid = 0, call_set_mtu = 0;
 	int call_link_downgrade_policy = 0;
 	int call_link_downgrade_policy = 0;
 
 
-	if (num_ports != 1) {
+	if (num_ports != 1 ||
+	    smp_length_check(sizeof(*pi), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1346,7 +1548,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 	if (ret)
 	if (ret)
 		return ret;
 		return ret;
 
 
-	ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+	ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len,
+				      max_len);
 
 
 	/* restore re-reg bit per o14-12.2.1 */
 	/* restore re-reg bit per o14-12.2.1 */
 	pi->clientrereg_subnettimeout |= clientrereg;
 	pi->clientrereg_subnettimeout |= clientrereg;
@@ -1363,7 +1566,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 	return ret;
 	return ret;
 
 
 get_only:
 get_only:
-	return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len,
+				       max_len);
 }
 }
 
 
 /**
 /**
@@ -1424,7 +1628,7 @@ static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
 
 
 static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
 				    struct ib_device *ibdev, u8 port,
 				    struct ib_device *ibdev, u8 port,
-				    u32 *resp_len)
+				    u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	u32 n_blocks_sent = OPA_AM_NBLK(am);
 	u32 n_blocks_sent = OPA_AM_NBLK(am);
@@ -1434,6 +1638,7 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
 	int i;
 	int i;
 	u16 n_blocks_avail;
 	u16 n_blocks_avail;
 	unsigned npkeys = hfi1_get_npkeys(dd);
 	unsigned npkeys = hfi1_get_npkeys(dd);
+	u32 size = 0;
 
 
 	if (n_blocks_sent == 0) {
 	if (n_blocks_sent == 0) {
 		pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
 		pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
@@ -1444,6 +1649,13 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
 
 
 	n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
 	n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
 
 
+	size = sizeof(u16) * (n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE);
+
+	if (smp_length_check(size, max_len)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
 	if (start_block + n_blocks_sent > n_blocks_avail ||
 	if (start_block + n_blocks_sent > n_blocks_avail ||
 	    n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
 	    n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
 		pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
 		pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
@@ -1461,7 +1673,8 @@ static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
 
 
-	return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len,
+					max_len);
 }
 }
 
 
 #define ILLEGAL_VL 12
 #define ILLEGAL_VL 12
@@ -1522,14 +1735,14 @@ static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
 
 
 static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	u8 *p = data;
 	u8 *p = data;
 	size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
 	size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
 	unsigned i;
 	unsigned i;
 
 
-	if (am) {
+	if (am || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1545,14 +1758,15 @@ static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
 
 
 static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	u8 *p = data;
 	u8 *p = data;
+	size_t size = ARRAY_SIZE(ibp->sl_to_sc);
 	int i;
 	int i;
 	u8 sc;
 	u8 sc;
 
 
-	if (am) {
+	if (am || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1567,19 +1781,20 @@ static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
 		}
 		}
 	}
 	}
 
 
-	return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len,
+				       max_len);
 }
 }
 
 
 static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	u8 *p = data;
 	u8 *p = data;
 	size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
 	size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
 	unsigned i;
 	unsigned i;
 
 
-	if (am) {
+	if (am || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1595,13 +1810,14 @@ static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
 
 
 static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
+	size_t size = ARRAY_SIZE(ibp->sc_to_sl);
 	u8 *p = data;
 	u8 *p = data;
 	int i;
 	int i;
 
 
-	if (am) {
+	if (am || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1609,19 +1825,20 @@ static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
 	for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
 	for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
 		ibp->sc_to_sl[i] = *p++;
 		ibp->sc_to_sl[i] = *p++;
 
 
-	return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len,
+				       max_len);
 }
 }
 
 
 static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 				    struct ib_device *ibdev, u8 port,
 				    struct ib_device *ibdev, u8 port,
-				    u32 *resp_len)
+				    u32 *resp_len, u32 max_len)
 {
 {
 	u32 n_blocks = OPA_AM_NBLK(am);
 	u32 n_blocks = OPA_AM_NBLK(am);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	void *vp = (void *)data;
 	void *vp = (void *)data;
 	size_t size = 4 * sizeof(u64);
 	size_t size = 4 * sizeof(u64);
 
 
-	if (n_blocks != 1) {
+	if (n_blocks != 1 || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1636,7 +1853,7 @@ static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 
 
 static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 				    struct ib_device *ibdev, u8 port,
 				    struct ib_device *ibdev, u8 port,
-				    u32 *resp_len)
+				    u32 *resp_len, u32 max_len)
 {
 {
 	u32 n_blocks = OPA_AM_NBLK(am);
 	u32 n_blocks = OPA_AM_NBLK(am);
 	int async_update = OPA_AM_ASYNC(am);
 	int async_update = OPA_AM_ASYNC(am);
@@ -1644,8 +1861,15 @@ static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 	void *vp = (void *)data;
 	void *vp = (void *)data;
 	struct hfi1_pportdata *ppd;
 	struct hfi1_pportdata *ppd;
 	int lstate;
 	int lstate;
+	/*
+	 * set_sc2vlt_tables writes the information contained in *data
+	 * to four 64-bit registers SendSC2VLt[0-3]. We need to make
+	 * sure *max_len is not greater than the total size of the four
+	 * SendSC2VLt[0-3] registers.
+	 */
+	size_t size = 4 * sizeof(u64);
 
 
-	if (n_blocks != 1 || async_update) {
+	if (n_blocks != 1 || async_update || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1665,27 +1889,28 @@ static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
 
 
 	set_sc2vlt_tables(dd, vp);
 	set_sc2vlt_tables(dd, vp);
 
 
-	return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len,
+					max_len);
 }
 }
 
 
 static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
 				     struct ib_device *ibdev, u8 port,
 				     struct ib_device *ibdev, u8 port,
-				     u32 *resp_len)
+				     u32 *resp_len, u32 max_len)
 {
 {
 	u32 n_blocks = OPA_AM_NPORT(am);
 	u32 n_blocks = OPA_AM_NPORT(am);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_pportdata *ppd;
 	struct hfi1_pportdata *ppd;
 	void *vp = (void *)data;
 	void *vp = (void *)data;
-	int size;
+	int size = sizeof(struct sc2vlnt);
 
 
-	if (n_blocks != 1) {
+	if (n_blocks != 1 || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
 
 
 	ppd = dd->pport + (port - 1);
 	ppd = dd->pport + (port - 1);
 
 
-	size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
+	fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
 
 
 	if (resp_len)
 	if (resp_len)
 		*resp_len += size;
 		*resp_len += size;
@@ -1695,15 +1920,16 @@ static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
 
 
 static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
 				     struct ib_device *ibdev, u8 port,
 				     struct ib_device *ibdev, u8 port,
-				     u32 *resp_len)
+				     u32 *resp_len, u32 max_len)
 {
 {
 	u32 n_blocks = OPA_AM_NPORT(am);
 	u32 n_blocks = OPA_AM_NPORT(am);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_pportdata *ppd;
 	struct hfi1_pportdata *ppd;
 	void *vp = (void *)data;
 	void *vp = (void *)data;
 	int lstate;
 	int lstate;
+	int size = sizeof(struct sc2vlnt);
 
 
-	if (n_blocks != 1) {
+	if (n_blocks != 1 || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1721,12 +1947,12 @@ static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
 	fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
 	fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
 
 
 	return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
 	return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-					 resp_len);
+					 resp_len, max_len);
 }
 }
 
 
 static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 			      struct ib_device *ibdev, u8 port,
 			      struct ib_device *ibdev, u8 port,
-			      u32 *resp_len)
+			      u32 *resp_len, u32 max_len)
 {
 {
 	u32 nports = OPA_AM_NPORT(am);
 	u32 nports = OPA_AM_NPORT(am);
 	u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
 	u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
@@ -1735,7 +1961,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 	struct hfi1_pportdata *ppd;
 	struct hfi1_pportdata *ppd;
 	struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
 	struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
 
 
-	if (nports != 1) {
+	if (nports != 1 || smp_length_check(sizeof(*psi), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1755,7 +1981,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 		ppd->offline_disabled_reason;
 		ppd->offline_disabled_reason;
 
 
 	psi->port_states.portphysstate_portstate =
 	psi->port_states.portphysstate_portstate =
-		(hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
+		(driver_pstate(ppd) << 4) | (lstate & 0xf);
 	psi->link_width_downgrade_tx_active =
 	psi->link_width_downgrade_tx_active =
 		cpu_to_be16(ppd->link_width_downgrade_tx_active);
 		cpu_to_be16(ppd->link_width_downgrade_tx_active);
 	psi->link_width_downgrade_rx_active =
 	psi->link_width_downgrade_rx_active =
@@ -1768,7 +1994,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 
 
 static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 			      struct ib_device *ibdev, u8 port,
 			      struct ib_device *ibdev, u8 port,
-			      u32 *resp_len)
+			      u32 *resp_len, u32 max_len)
 {
 {
 	u32 nports = OPA_AM_NPORT(am);
 	u32 nports = OPA_AM_NPORT(am);
 	u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
 	u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
@@ -1779,7 +2005,7 @@ static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 	struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
 	struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
 	int ret, invalid = 0;
 	int ret, invalid = 0;
 
 
-	if (nports != 1) {
+	if (nports != 1 || smp_length_check(sizeof(*psi), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1809,19 +2035,21 @@ static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
 	if (invalid)
 	if (invalid)
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 
 
-	return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len,
+				  max_len);
 }
 }
 
 
 static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
 				     struct ib_device *ibdev, u8 port,
 				     struct ib_device *ibdev, u8 port,
-				     u32 *resp_len)
+				     u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	u32 addr = OPA_AM_CI_ADDR(am);
 	u32 addr = OPA_AM_CI_ADDR(am);
 	u32 len = OPA_AM_CI_LEN(am) + 1;
 	u32 len = OPA_AM_CI_LEN(am) + 1;
 	int ret;
 	int ret;
 
 
-	if (dd->pport->port_type != PORT_TYPE_QSFP) {
+	if (dd->pport->port_type != PORT_TYPE_QSFP ||
+	    smp_length_check(len, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1864,21 +2092,22 @@ static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
 }
 }
 
 
 static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
-			      struct ib_device *ibdev, u8 port, u32 *resp_len)
+			      struct ib_device *ibdev, u8 port, u32 *resp_len,
+			      u32 max_len)
 {
 {
 	u32 num_ports = OPA_AM_NPORT(am);
 	u32 num_ports = OPA_AM_NPORT(am);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_pportdata *ppd;
 	struct hfi1_pportdata *ppd;
 	struct buffer_control *p = (struct buffer_control *)data;
 	struct buffer_control *p = (struct buffer_control *)data;
-	int size;
+	int size = sizeof(struct buffer_control);
 
 
-	if (num_ports != 1) {
+	if (num_ports != 1 || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
 
 
 	ppd = dd->pport + (port - 1);
 	ppd = dd->pport + (port - 1);
-	size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
+	fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
 	trace_bct_get(dd, p);
 	trace_bct_get(dd, p);
 	if (resp_len)
 	if (resp_len)
 		*resp_len += size;
 		*resp_len += size;
@@ -1887,14 +2116,15 @@ static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
 }
 }
 
 
 static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
-			      struct ib_device *ibdev, u8 port, u32 *resp_len)
+			      struct ib_device *ibdev, u8 port, u32 *resp_len,
+			      u32 max_len)
 {
 {
 	u32 num_ports = OPA_AM_NPORT(am);
 	u32 num_ports = OPA_AM_NPORT(am);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_pportdata *ppd;
 	struct hfi1_pportdata *ppd;
 	struct buffer_control *p = (struct buffer_control *)data;
 	struct buffer_control *p = (struct buffer_control *)data;
 
 
-	if (num_ports != 1) {
+	if (num_ports != 1 || smp_length_check(sizeof(*p), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1905,41 +2135,43 @@ static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
 
 
-	return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len,
+				  max_len);
 }
 }
 
 
 static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
 				 struct ib_device *ibdev, u8 port,
 				 struct ib_device *ibdev, u8 port,
-				 u32 *resp_len)
+				 u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
 	struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
 	u32 num_ports = OPA_AM_NPORT(am);
 	u32 num_ports = OPA_AM_NPORT(am);
 	u8 section = (am & 0x00ff0000) >> 16;
 	u8 section = (am & 0x00ff0000) >> 16;
 	u8 *p = data;
 	u8 *p = data;
-	int size = 0;
+	int size = 256;
 
 
-	if (num_ports != 1) {
+	if (num_ports != 1 || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
 
 
 	switch (section) {
 	switch (section) {
 	case OPA_VLARB_LOW_ELEMENTS:
 	case OPA_VLARB_LOW_ELEMENTS:
-		size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
+		fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
 		break;
 		break;
 	case OPA_VLARB_HIGH_ELEMENTS:
 	case OPA_VLARB_HIGH_ELEMENTS:
-		size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+		fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
 		break;
 		break;
 	case OPA_VLARB_PREEMPT_ELEMENTS:
 	case OPA_VLARB_PREEMPT_ELEMENTS:
-		size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
+		fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
 		break;
 		break;
 	case OPA_VLARB_PREEMPT_MATRIX:
 	case OPA_VLARB_PREEMPT_MATRIX:
-		size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
+		fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
 		break;
 		break;
 	default:
 	default:
 		pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
 		pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
 			be32_to_cpu(smp->attr_mod));
 			be32_to_cpu(smp->attr_mod));
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
+		size = 0;
 		break;
 		break;
 	}
 	}
 
 
@@ -1951,14 +2183,15 @@ static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
 
 
 static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
 				 struct ib_device *ibdev, u8 port,
 				 struct ib_device *ibdev, u8 port,
-				 u32 *resp_len)
+				 u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
 	struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
 	u32 num_ports = OPA_AM_NPORT(am);
 	u32 num_ports = OPA_AM_NPORT(am);
 	u8 section = (am & 0x00ff0000) >> 16;
 	u8 section = (am & 0x00ff0000) >> 16;
 	u8 *p = data;
 	u8 *p = data;
+	int size = 256;
 
 
-	if (num_ports != 1) {
+	if (num_ports != 1 || smp_length_check(size, max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -1986,7 +2219,8 @@ static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
 		break;
 		break;
 	}
 	}
 
 
-	return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len,
+				     max_len);
 }
 }
 
 
 struct opa_pma_mad {
 struct opa_pma_mad {
@@ -3282,13 +3516,18 @@ struct opa_congestion_info_attr {
 
 
 static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
 				    struct ib_device *ibdev, u8 port,
 				    struct ib_device *ibdev, u8 port,
-				    u32 *resp_len)
+				    u32 *resp_len, u32 max_len)
 {
 {
 	struct opa_congestion_info_attr *p =
 	struct opa_congestion_info_attr *p =
 		(struct opa_congestion_info_attr *)data;
 		(struct opa_congestion_info_attr *)data;
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 
 
+	if (smp_length_check(sizeof(*p), max_len)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
 	p->congestion_info = 0;
 	p->congestion_info = 0;
 	p->control_table_cap = ppd->cc_max_table_entries;
 	p->control_table_cap = ppd->cc_max_table_entries;
 	p->congestion_log_length = OPA_CONG_LOG_ELEMS;
 	p->congestion_log_length = OPA_CONG_LOG_ELEMS;
@@ -3301,7 +3540,7 @@ static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
 
 
 static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
 static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
 				       u8 *data, struct ib_device *ibdev,
 				       u8 *data, struct ib_device *ibdev,
-				       u8 port, u32 *resp_len)
+				       u8 port, u32 *resp_len, u32 max_len)
 {
 {
 	int i;
 	int i;
 	struct opa_congestion_setting_attr *p =
 	struct opa_congestion_setting_attr *p =
@@ -3311,6 +3550,11 @@ static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
 	struct opa_congestion_setting_entry_shadow *entries;
 	struct opa_congestion_setting_entry_shadow *entries;
 	struct cc_state *cc_state;
 	struct cc_state *cc_state;
 
 
+	if (smp_length_check(sizeof(*p), max_len)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
 	rcu_read_lock();
 	rcu_read_lock();
 
 
 	cc_state = get_cc_state(ppd);
 	cc_state = get_cc_state(ppd);
@@ -3385,7 +3629,7 @@ static void apply_cc_state(struct hfi1_pportdata *ppd)
 
 
 static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
 				       struct ib_device *ibdev, u8 port,
 				       struct ib_device *ibdev, u8 port,
-				       u32 *resp_len)
+				       u32 *resp_len, u32 max_len)
 {
 {
 	struct opa_congestion_setting_attr *p =
 	struct opa_congestion_setting_attr *p =
 		(struct opa_congestion_setting_attr *)data;
 		(struct opa_congestion_setting_attr *)data;
@@ -3394,6 +3638,11 @@ static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
 	struct opa_congestion_setting_entry_shadow *entries;
 	struct opa_congestion_setting_entry_shadow *entries;
 	int i;
 	int i;
 
 
+	if (smp_length_check(sizeof(*p), max_len)) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		return reply((struct ib_mad_hdr *)smp);
+	}
+
 	/*
 	/*
 	 * Save details from packet into the ppd.  Hold the cc_state_lock so
 	 * Save details from packet into the ppd.  Hold the cc_state_lock so
 	 * our information is consistent with anyone trying to apply the state.
 	 * our information is consistent with anyone trying to apply the state.
@@ -3415,12 +3664,12 @@ static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
 	apply_cc_state(ppd);
 	apply_cc_state(ppd);
 
 
 	return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
 	return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
-					   resp_len);
+					   resp_len, max_len);
 }
 }
 
 
 static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
 static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
 					u8 *data, struct ib_device *ibdev,
 					u8 *data, struct ib_device *ibdev,
-					u8 port, u32 *resp_len)
+					u8 port, u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
@@ -3428,7 +3677,7 @@ static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
 	s64 ts;
 	s64 ts;
 	int i;
 	int i;
 
 
-	if (am != 0) {
+	if (am || smp_length_check(sizeof(*cong_log), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -3486,7 +3735,7 @@ static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
 
 
 static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct ib_cc_table_attr *cc_table_attr =
 	struct ib_cc_table_attr *cc_table_attr =
 		(struct ib_cc_table_attr *)data;
 		(struct ib_cc_table_attr *)data;
@@ -3498,9 +3747,10 @@ static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 	int i, j;
 	int i, j;
 	u32 sentry, eentry;
 	u32 sentry, eentry;
 	struct cc_state *cc_state;
 	struct cc_state *cc_state;
+	u32 size = sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1);
 
 
 	/* sanity check n_blocks, start_block */
 	/* sanity check n_blocks, start_block */
-	if (n_blocks == 0 ||
+	if (n_blocks == 0 || smp_length_check(size, max_len) ||
 	    start_block + n_blocks > ppd->cc_max_table_entries) {
 	    start_block + n_blocks > ppd->cc_max_table_entries) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
@@ -3530,14 +3780,14 @@ static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 	rcu_read_unlock();
 	rcu_read_unlock();
 
 
 	if (resp_len)
 	if (resp_len)
-		*resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1);
+		*resp_len += size;
 
 
 	return reply((struct ib_mad_hdr *)smp);
 	return reply((struct ib_mad_hdr *)smp);
 }
 }
 
 
 static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data;
 	struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data;
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
@@ -3548,9 +3798,10 @@ static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 	int i, j;
 	int i, j;
 	u32 sentry, eentry;
 	u32 sentry, eentry;
 	u16 ccti_limit;
 	u16 ccti_limit;
+	u32 size = sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1);
 
 
 	/* sanity check n_blocks, start_block */
 	/* sanity check n_blocks, start_block */
-	if (n_blocks == 0 ||
+	if (n_blocks == 0 || smp_length_check(size, max_len) ||
 	    start_block + n_blocks > ppd->cc_max_table_entries) {
 	    start_block + n_blocks > ppd->cc_max_table_entries) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
@@ -3581,7 +3832,8 @@ static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
 	/* now apply the information */
 	/* now apply the information */
 	apply_cc_state(ppd);
 	apply_cc_state(ppd);
 
 
-	return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len,
+				       max_len);
 }
 }
 
 
 struct opa_led_info {
 struct opa_led_info {
@@ -3594,7 +3846,7 @@ struct opa_led_info {
 
 
 static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_pportdata *ppd = dd->pport;
 	struct hfi1_pportdata *ppd = dd->pport;
@@ -3602,7 +3854,7 @@ static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 	u32 nport = OPA_AM_NPORT(am);
 	u32 nport = OPA_AM_NPORT(am);
 	u32 is_beaconing_active;
 	u32 is_beaconing_active;
 
 
-	if (nport != 1) {
+	if (nport != 1 || smp_length_check(sizeof(*p), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -3624,14 +3876,14 @@ static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 
 
 static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 				   struct ib_device *ibdev, u8 port,
 				   struct ib_device *ibdev, u8 port,
-				   u32 *resp_len)
+				   u32 *resp_len, u32 max_len)
 {
 {
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
 	struct opa_led_info *p = (struct opa_led_info *)data;
 	struct opa_led_info *p = (struct opa_led_info *)data;
 	u32 nport = OPA_AM_NPORT(am);
 	u32 nport = OPA_AM_NPORT(am);
 	int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
 	int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
 
 
-	if (nport != 1) {
+	if (nport != 1 || smp_length_check(sizeof(*p), max_len)) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 		return reply((struct ib_mad_hdr *)smp);
 	}
 	}
@@ -3641,12 +3893,13 @@ static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
 	else
 	else
 		shutdown_led_override(dd->pport);
 		shutdown_led_override(dd->pport);
 
 
-	return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
+	return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len,
+				       max_len);
 }
 }
 
 
 static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
 static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
 			    u8 *data, struct ib_device *ibdev, u8 port,
 			    u8 *data, struct ib_device *ibdev, u8 port,
-			    u32 *resp_len)
+			    u32 *resp_len, u32 max_len)
 {
 {
 	int ret;
 	int ret;
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
@@ -3654,71 +3907,71 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
 	switch (attr_id) {
 	switch (attr_id) {
 	case IB_SMP_ATTR_NODE_DESC:
 	case IB_SMP_ATTR_NODE_DESC:
 		ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_NODE_INFO:
 	case IB_SMP_ATTR_NODE_INFO:
 		ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_PORT_INFO:
 	case IB_SMP_ATTR_PORT_INFO:
 		ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_PKEY_TABLE:
 	case IB_SMP_ATTR_PKEY_TABLE:
 		ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
-					       resp_len);
+					       resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_SL_TO_SC_MAP:
 	case OPA_ATTRIB_ID_SL_TO_SC_MAP:
 		ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_SC_TO_SL_MAP:
 	case OPA_ATTRIB_ID_SC_TO_SL_MAP:
 		ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
 	case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
 		ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
-					       resp_len);
+					       resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
 	case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
 		ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-						resp_len);
+						resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_PORT_STATE_INFO:
 	case OPA_ATTRIB_ID_PORT_STATE_INFO:
 		ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
-					 resp_len);
+					 resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
 	case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
 		ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
-					 resp_len);
+					 resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_CABLE_INFO:
 	case OPA_ATTRIB_ID_CABLE_INFO:
 		ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
-						resp_len);
+						resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_VL_ARB_TABLE:
 	case IB_SMP_ATTR_VL_ARB_TABLE:
 		ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
-					    resp_len);
+					    resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_CONGESTION_INFO:
 	case OPA_ATTRIB_ID_CONGESTION_INFO:
 		ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
-					       resp_len);
+					       resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
 	case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
 		ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
 		ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
-						  port, resp_len);
+						  port, resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
 	case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
 		ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
 		ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
-						   port, resp_len);
+						   port, resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
 	case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
 		ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_LED_INFO:
 	case IB_SMP_ATTR_LED_INFO:
 		ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
 		ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_SM_INFO:
 	case IB_SMP_ATTR_SM_INFO:
 		if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
 		if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
@@ -3736,7 +3989,7 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
 
 
 static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
 static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
 			    u8 *data, struct ib_device *ibdev, u8 port,
 			    u8 *data, struct ib_device *ibdev, u8 port,
-			    u32 *resp_len)
+			    u32 *resp_len, u32 max_len)
 {
 {
 	int ret;
 	int ret;
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
@@ -3744,51 +3997,51 @@ static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
 	switch (attr_id) {
 	switch (attr_id) {
 	case IB_SMP_ATTR_PORT_INFO:
 	case IB_SMP_ATTR_PORT_INFO:
 		ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_PKEY_TABLE:
 	case IB_SMP_ATTR_PKEY_TABLE:
 		ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
-					       resp_len);
+					       resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_SL_TO_SC_MAP:
 	case OPA_ATTRIB_ID_SL_TO_SC_MAP:
 		ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_SC_TO_SL_MAP:
 	case OPA_ATTRIB_ID_SC_TO_SL_MAP:
 		ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
 	case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
 		ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
-					       resp_len);
+					       resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
 	case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
 		ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
-						resp_len);
+						resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_PORT_STATE_INFO:
 	case OPA_ATTRIB_ID_PORT_STATE_INFO:
 		ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
-					 resp_len);
+					 resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
 	case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
 		ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
-					 resp_len);
+					 resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_VL_ARB_TABLE:
 	case IB_SMP_ATTR_VL_ARB_TABLE:
 		ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
-					    resp_len);
+					    resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
 	case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
 		ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
 		ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
-						  port, resp_len);
+						  port, resp_len, max_len);
 		break;
 		break;
 	case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
 	case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
 		ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_LED_INFO:
 	case IB_SMP_ATTR_LED_INFO:
 		ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
 		ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
-					      resp_len);
+					      resp_len, max_len);
 		break;
 		break;
 	case IB_SMP_ATTR_SM_INFO:
 	case IB_SMP_ATTR_SM_INFO:
 		if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
 		if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
@@ -3844,7 +4097,10 @@ static int subn_get_opa_aggregate(struct opa_smp *smp,
 		memset(next_smp + sizeof(*agg), 0, agg_data_len);
 		memset(next_smp + sizeof(*agg), 0, agg_data_len);
 
 
 		(void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
 		(void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
-					ibdev, port, NULL);
+				       ibdev, port, NULL, (u32)agg_data_len);
+
+		if (smp->status & IB_SMP_INVALID_FIELD)
+			break;
 		if (smp->status & ~IB_SMP_DIRECTION) {
 		if (smp->status & ~IB_SMP_DIRECTION) {
 			set_aggr_error(agg);
 			set_aggr_error(agg);
 			return reply((struct ib_mad_hdr *)smp);
 			return reply((struct ib_mad_hdr *)smp);
@@ -3887,7 +4143,9 @@ static int subn_set_opa_aggregate(struct opa_smp *smp,
 		}
 		}
 
 
 		(void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
 		(void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
-					ibdev, port, NULL);
+				       ibdev, port, NULL, (u32)agg_data_len);
+		if (smp->status & IB_SMP_INVALID_FIELD)
+			break;
 		if (smp->status & ~IB_SMP_DIRECTION) {
 		if (smp->status & ~IB_SMP_DIRECTION) {
 			set_aggr_error(agg);
 			set_aggr_error(agg);
 			return reply((struct ib_mad_hdr *)smp);
 			return reply((struct ib_mad_hdr *)smp);
@@ -3997,12 +4255,13 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
 	struct opa_smp *smp = (struct opa_smp *)out_mad;
 	struct opa_smp *smp = (struct opa_smp *)out_mad;
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	struct hfi1_ibport *ibp = to_iport(ibdev, port);
 	u8 *data;
 	u8 *data;
-	u32 am;
+	u32 am, data_size;
 	__be16 attr_id;
 	__be16 attr_id;
 	int ret;
 	int ret;
 
 
 	*out_mad = *in_mad;
 	*out_mad = *in_mad;
 	data = opa_get_smp_data(smp);
 	data = opa_get_smp_data(smp);
+	data_size = (u32)opa_get_smp_data_size(smp);
 
 
 	am = be32_to_cpu(smp->attr_mod);
 	am = be32_to_cpu(smp->attr_mod);
 	attr_id = smp->attr_id;
 	attr_id = smp->attr_id;
@@ -4046,7 +4305,8 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
 		default:
 		default:
 			clear_opa_smp_data(smp);
 			clear_opa_smp_data(smp);
 			ret = subn_get_opa_sma(attr_id, smp, am, data,
 			ret = subn_get_opa_sma(attr_id, smp, am, data,
-					       ibdev, port, resp_len);
+					       ibdev, port, resp_len,
+					       data_size);
 			break;
 			break;
 		case OPA_ATTRIB_ID_AGGREGATE:
 		case OPA_ATTRIB_ID_AGGREGATE:
 			ret = subn_get_opa_aggregate(smp, ibdev, port,
 			ret = subn_get_opa_aggregate(smp, ibdev, port,
@@ -4058,7 +4318,8 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
 		switch (attr_id) {
 		switch (attr_id) {
 		default:
 		default:
 			ret = subn_set_opa_sma(attr_id, smp, am, data,
 			ret = subn_set_opa_sma(attr_id, smp, am, data,
-					       ibdev, port, resp_len);
+					       ibdev, port, resp_len,
+					       data_size);
 			break;
 			break;
 		case OPA_ATTRIB_ID_AGGREGATE:
 		case OPA_ATTRIB_ID_AGGREGATE:
 			ret = subn_set_opa_aggregate(smp, ibdev, port,
 			ret = subn_set_opa_aggregate(smp, ibdev, port,
@@ -4077,6 +4338,11 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
 		 */
 		 */
 		ret = IB_MAD_RESULT_SUCCESS;
 		ret = IB_MAD_RESULT_SUCCESS;
 		break;
 		break;
+	case IB_MGMT_METHOD_TRAP_REPRESS:
+		subn_handle_opa_trap_repress(ibp, smp);
+		/* Always successful */
+		ret = IB_MAD_RESULT_SUCCESS;
+		break;
 	default:
 	default:
 		smp->status |= IB_SMP_UNSUP_METHOD;
 		smp->status |= IB_SMP_UNSUP_METHOD;
 		ret = reply((struct ib_mad_hdr *)smp);
 		ret = reply((struct ib_mad_hdr *)smp);

+ 3 - 2
drivers/infiniband/hw/hfi1/mad.h

@@ -1,5 +1,5 @@
 /*
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2017 Intel Corporation.
  *
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  * redistributing this file, you may do so under either license.
@@ -115,7 +115,7 @@ struct opa_mad_notice_attr {
 			__be32	lid;		/* LID where change occurred */
 			__be32	lid;		/* LID where change occurred */
 			__be32	new_cap_mask;	/* new capability mask */
 			__be32	new_cap_mask;	/* new capability mask */
 			__be16	reserved2;
 			__be16	reserved2;
-			__be16	cap_mask;
+			__be16	cap_mask3;
 			__be16	change_flags;	/* low 4 bits only */
 			__be16	change_flags;	/* low 4 bits only */
 		} __packed ntc_144;
 		} __packed ntc_144;
 
 
@@ -428,5 +428,6 @@ struct sc2vlnt {
 		    COUNTER_MASK(1, 4))
 		    COUNTER_MASK(1, 4))
 
 
 void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port);
 void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port);
+void hfi1_handle_trap_timer(unsigned long data);
 
 
 #endif				/* _HFI1_MAD_H */
 #endif				/* _HFI1_MAD_H */

+ 10 - 4
drivers/infiniband/hw/hfi1/mmu_rb.c

@@ -217,21 +217,27 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
 	return node;
 	return node;
 }
 }
 
 
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler,
-					unsigned long addr, unsigned long len)
+bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler,
+				     unsigned long addr, unsigned long len,
+				     struct mmu_rb_node **rb_node)
 {
 {
 	struct mmu_rb_node *node;
 	struct mmu_rb_node *node;
 	unsigned long flags;
 	unsigned long flags;
+	bool ret = false;
 
 
 	spin_lock_irqsave(&handler->lock, flags);
 	spin_lock_irqsave(&handler->lock, flags);
 	node = __mmu_rb_search(handler, addr, len);
 	node = __mmu_rb_search(handler, addr, len);
 	if (node) {
 	if (node) {
+		if (node->addr == addr && node->len == len)
+			goto unlock;
 		__mmu_int_rb_remove(node, &handler->root);
 		__mmu_int_rb_remove(node, &handler->root);
 		list_del(&node->list); /* remove from LRU list */
 		list_del(&node->list); /* remove from LRU list */
+		ret = true;
 	}
 	}
+unlock:
 	spin_unlock_irqrestore(&handler->lock, flags);
 	spin_unlock_irqrestore(&handler->lock, flags);
-
-	return node;
+	*rb_node = node;
+	return ret;
 }
 }
 
 
 void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
 void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)

+ 3 - 2
drivers/infiniband/hw/hfi1/mmu_rb.h

@@ -81,7 +81,8 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
 void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg);
 void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg);
 void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
 void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
 			struct mmu_rb_node *mnode);
 			struct mmu_rb_node *mnode);
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler,
-					unsigned long addr, unsigned long len);
+bool hfi1_mmu_rb_remove_unless_exact(struct mmu_rb_handler *handler,
+				     unsigned long addr, unsigned long len,
+				     struct mmu_rb_node **rb_node);
 
 
 #endif /* _HFI1_MMU_RB_H */
 #endif /* _HFI1_MMU_RB_H */

+ 261 - 123
drivers/infiniband/hw/hfi1/pcie.c

@@ -1,5 +1,5 @@
 /*
 /*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2017 Intel Corporation.
  *
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  * redistributing this file, you may do so under either license.
@@ -68,7 +68,7 @@
 /*
 /*
  * Code to adjust PCIe capabilities.
  * Code to adjust PCIe capabilities.
  */
  */
-static void tune_pcie_caps(struct hfi1_devdata *);
+static int tune_pcie_caps(struct hfi1_devdata *);
 
 
 /*
 /*
  * Do all the common PCIe setup and initialization.
  * Do all the common PCIe setup and initialization.
@@ -161,6 +161,7 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev)
 {
 {
 	unsigned long len;
 	unsigned long len;
 	resource_size_t addr;
 	resource_size_t addr;
+	int ret = 0;
 
 
 	dd->pcidev = pdev;
 	dd->pcidev = pdev;
 	pci_set_drvdata(pdev, dd);
 	pci_set_drvdata(pdev, dd);
@@ -179,47 +180,54 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev)
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
-	if (!dd->kregbase)
+	dd->kregbase1 = ioremap_nocache(addr, RCV_ARRAY);
+	if (!dd->kregbase1) {
+		dd_dev_err(dd, "UC mapping of kregbase1 failed\n");
 		return -ENOMEM;
 		return -ENOMEM;
+	}
+	dd_dev_info(dd, "UC base1: %p for %x\n", dd->kregbase1, RCV_ARRAY);
+	dd->chip_rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT);
+	dd_dev_info(dd, "RcvArray count: %u\n", dd->chip_rcv_array_count);
+	dd->base2_start  = RCV_ARRAY + dd->chip_rcv_array_count * 8;
+
+	dd->kregbase2 = ioremap_nocache(
+		addr + dd->base2_start,
+		TXE_PIO_SEND - dd->base2_start);
+	if (!dd->kregbase2) {
+		dd_dev_err(dd, "UC mapping of kregbase2 failed\n");
+		goto nomem;
+	}
+	dd_dev_info(dd, "UC base2: %p for %x\n", dd->kregbase2,
+		    TXE_PIO_SEND - dd->base2_start);
 
 
 	dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
 	dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
 	if (!dd->piobase) {
 	if (!dd->piobase) {
-		iounmap(dd->kregbase);
-		return -ENOMEM;
+		dd_dev_err(dd, "WC mapping of send buffers failed\n");
+		goto nomem;
 	}
 	}
+	dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE);
 
 
-	dd->flags |= HFI1_PRESENT;	/* now register routines work */
-
-	dd->kregend = dd->kregbase + TXE_PIO_SEND;
 	dd->physaddr = addr;        /* used for io_remap, etc. */
 	dd->physaddr = addr;        /* used for io_remap, etc. */
 
 
 	/*
 	/*
-	 * Re-map the chip's RcvArray as write-combining to allow us
+	 * Map the chip's RcvArray as write-combining to allow us
 	 * to write an entire cacheline worth of entries in one shot.
 	 * to write an entire cacheline worth of entries in one shot.
-	 * If this re-map fails, just continue - the RcvArray programming
-	 * function will handle both cases.
 	 */
 	 */
-	dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
 	dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
 	dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
 				     dd->chip_rcv_array_count * 8);
 				     dd->chip_rcv_array_count * 8);
-	dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
-	/*
-	 * Save BARs and command to rewrite after device reset.
-	 */
-	pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, &dd->pcibar0);
-	pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, &dd->pcibar1);
-	pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
-	pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
-	pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
-	pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
-	pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
-				  &dd->pcie_devctl2);
-	pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
-	pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3);
-	pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
+	if (!dd->rcvarray_wc) {
+		dd_dev_err(dd, "WC mapping of receive array failed\n");
+		goto nomem;
+	}
+	dd_dev_info(dd, "WC RcvArray: %p for %x\n",
+		    dd->rcvarray_wc, dd->chip_rcv_array_count * 8);
 
 
+	dd->flags |= HFI1_PRESENT;	/* chip.c CSR routines now work */
 	return 0;
 	return 0;
+nomem:
+	ret = -ENOMEM;
+	hfi1_pcie_ddcleanup(dd);
+	return ret;
 }
 }
 
 
 /*
 /*
@@ -229,59 +237,19 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev)
  */
  */
 void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
 void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
 {
 {
-	u64 __iomem *base = (void __iomem *)dd->kregbase;
-
 	dd->flags &= ~HFI1_PRESENT;
 	dd->flags &= ~HFI1_PRESENT;
-	dd->kregbase = NULL;
-	iounmap(base);
+	if (dd->kregbase1)
+		iounmap(dd->kregbase1);
+	dd->kregbase1 = NULL;
+	if (dd->kregbase2)
+		iounmap(dd->kregbase2);
+	dd->kregbase2 = NULL;
 	if (dd->rcvarray_wc)
 	if (dd->rcvarray_wc)
 		iounmap(dd->rcvarray_wc);
 		iounmap(dd->rcvarray_wc);
+	dd->rcvarray_wc = NULL;
 	if (dd->piobase)
 	if (dd->piobase)
 		iounmap(dd->piobase);
 		iounmap(dd->piobase);
-}
-
-static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
-		       struct hfi1_msix_entry *hfi1_msix_entry)
-{
-	int ret;
-	int nvec = *msixcnt;
-	struct msix_entry *msix_entry;
-	int i;
-
-	/*
-	 * We can't pass hfi1_msix_entry array to msix_setup
-	 * so use a dummy msix_entry array and copy the allocated
-	 * irq back to the hfi1_msix_entry array.
-	 */
-	msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
-	if (!msix_entry) {
-		ret = -ENOMEM;
-		goto do_intx;
-	}
-
-	for (i = 0; i < nvec; i++)
-		msix_entry[i] = hfi1_msix_entry[i].msix;
-
-	ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
-	if (ret < 0)
-		goto free_msix_entry;
-	nvec = ret;
-
-	for (i = 0; i < nvec; i++)
-		hfi1_msix_entry[i].msix = msix_entry[i];
-
-	kfree(msix_entry);
-	*msixcnt = nvec;
-	return;
-
-free_msix_entry:
-	kfree(msix_entry);
-
-do_intx:
-	dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
-		   nvec, ret);
-	*msixcnt = 0;
-	hfi1_enable_intx(dd->pcidev);
+	dd->piobase = NULL;
 }
 }
 
 
 /* return the PCIe link speed from the given link status */
 /* return the PCIe link speed from the given link status */
@@ -314,8 +282,14 @@ static u32 extract_width(u16 linkstat)
 static void update_lbus_info(struct hfi1_devdata *dd)
 static void update_lbus_info(struct hfi1_devdata *dd)
 {
 {
 	u16 linkstat;
 	u16 linkstat;
+	int ret;
+
+	ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
+	if (ret) {
+		dd_dev_err(dd, "Unable to read from PCI config\n");
+		return;
+	}
 
 
-	pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
 	dd->lbus_width = extract_width(linkstat);
 	dd->lbus_width = extract_width(linkstat);
 	dd->lbus_speed = extract_speed(linkstat);
 	dd->lbus_speed = extract_speed(linkstat);
 	snprintf(dd->lbus_info, sizeof(dd->lbus_info),
 	snprintf(dd->lbus_info, sizeof(dd->lbus_info),
@@ -330,6 +304,7 @@ int pcie_speeds(struct hfi1_devdata *dd)
 {
 {
 	u32 linkcap;
 	u32 linkcap;
 	struct pci_dev *parent = dd->pcidev->bus->self;
 	struct pci_dev *parent = dd->pcidev->bus->self;
+	int ret;
 
 
 	if (!pci_is_pcie(dd->pcidev)) {
 	if (!pci_is_pcie(dd->pcidev)) {
 		dd_dev_err(dd, "Can't find PCI Express capability!\n");
 		dd_dev_err(dd, "Can't find PCI Express capability!\n");
@@ -339,7 +314,12 @@ int pcie_speeds(struct hfi1_devdata *dd)
 	/* find if our max speed is Gen3 and parent supports Gen3 speeds */
 	/* find if our max speed is Gen3 and parent supports Gen3 speeds */
 	dd->link_gen3_capable = 1;
 	dd->link_gen3_capable = 1;
 
 
-	pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
+	ret = pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
+	if (ret) {
+		dd_dev_err(dd, "Unable to read from PCI config\n");
+		return ret;
+	}
+
 	if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
 	if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
 		dd_dev_info(dd,
 		dd_dev_info(dd,
 			    "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
 			    "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
@@ -364,49 +344,150 @@ int pcie_speeds(struct hfi1_devdata *dd)
 }
 }
 
 
 /*
 /*
- * Returns in *nent:
- *	- actual number of interrupts allocated
+ * Returns:
+ *	- actual number of interrupts allocated or
  *	- 0 if fell back to INTx.
  *	- 0 if fell back to INTx.
+ *      - error
  */
  */
-void request_msix(struct hfi1_devdata *dd, u32 *nent,
-		  struct hfi1_msix_entry *entry)
+int request_msix(struct hfi1_devdata *dd, u32 msireq)
 {
 {
-	int pos;
+	int nvec, ret;
 
 
-	pos = dd->pcidev->msix_cap;
-	if (*nent && pos) {
-		msix_setup(dd, pos, nent, entry);
-		/* did it, either MSI-X or INTx */
-	} else {
-		*nent = 0;
-		hfi1_enable_intx(dd->pcidev);
+	nvec = pci_alloc_irq_vectors(dd->pcidev, 1, msireq,
+				     PCI_IRQ_MSIX | PCI_IRQ_LEGACY);
+	if (nvec < 0) {
+		dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec);
+		return nvec;
 	}
 	}
 
 
-	tune_pcie_caps(dd);
+	ret = tune_pcie_caps(dd);
+	if (ret) {
+		dd_dev_err(dd, "tune_pcie_caps() failed: %d\n", ret);
+		pci_free_irq_vectors(dd->pcidev);
+		return ret;
+	}
+
+	/* check for legacy IRQ */
+	if (nvec == 1 && !dd->pcidev->msix_enabled)
+		return 0;
+
+	return nvec;
 }
 }
 
 
-void hfi1_enable_intx(struct pci_dev *pdev)
+/* restore command and BARs after a reset has wiped them out */
+int restore_pci_variables(struct hfi1_devdata *dd)
 {
 {
-	/* first, turn on INTx */
-	pci_intx(pdev, 1);
-	/* then turn off MSI-X */
-	pci_disable_msix(pdev);
+	int ret = 0;
+
+	ret = pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
+	if (ret)
+		goto error;
+
+	ret = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0,
+				     dd->pcibar0);
+	if (ret)
+		goto error;
+
+	ret = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1,
+				     dd->pcibar1);
+	if (ret)
+		goto error;
+
+	ret = pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom);
+	if (ret)
+		goto error;
+
+	ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL,
+					 dd->pcie_devctl);
+	if (ret)
+		goto error;
+
+	ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL,
+					 dd->pcie_lnkctl);
+	if (ret)
+		goto error;
+
+	ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
+					 dd->pcie_devctl2);
+	if (ret)
+		goto error;
+
+	ret = pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
+	if (ret)
+		goto error;
+
+	ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
+				     dd->pci_lnkctl3);
+	if (ret)
+		goto error;
+
+	ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
+	if (ret)
+		goto error;
+
+	return 0;
+
+error:
+	dd_dev_err(dd, "Unable to write to PCI config\n");
+	return ret;
 }
 }
 
 
-/* restore command and BARs after a reset has wiped them out */
-void restore_pci_variables(struct hfi1_devdata *dd)
+/* Save BARs and command to rewrite after device reset */
+int save_pci_variables(struct hfi1_devdata *dd)
 {
 {
-	pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
-	pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0);
-	pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1);
-	pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom);
-	pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
-	pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
-	pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
-				   dd->pcie_devctl2);
-	pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
-	pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3);
-	pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
+	int ret = 0;
+
+	ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0,
+				    &dd->pcibar0);
+	if (ret)
+		goto error;
+
+	ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1,
+				    &dd->pcibar1);
+	if (ret)
+		goto error;
+
+	ret = pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
+	if (ret)
+		goto error;
+
+	ret = pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
+	if (ret)
+		goto error;
+
+	ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL,
+					&dd->pcie_devctl);
+	if (ret)
+		goto error;
+
+	ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL,
+					&dd->pcie_lnkctl);
+	if (ret)
+		goto error;
+
+	ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
+					&dd->pcie_devctl2);
+	if (ret)
+		goto error;
+
+	ret = pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
+	if (ret)
+		goto error;
+
+	ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
+				    &dd->pci_lnkctl3);
+	if (ret)
+		goto error;
+
+	ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
+	if (ret)
+		goto error;
+
+	return 0;
+
+error:
+	dd_dev_err(dd, "Unable to read from PCI config\n");
+	return ret;
 }
 }
 
 
 /*
 /*
@@ -421,21 +502,33 @@ uint aspm_mode = ASPM_MODE_DISABLED;
 module_param_named(aspm, aspm_mode, uint, S_IRUGO);
 module_param_named(aspm, aspm_mode, uint, S_IRUGO);
 MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
 MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
 
 
-static void tune_pcie_caps(struct hfi1_devdata *dd)
+static int tune_pcie_caps(struct hfi1_devdata *dd)
 {
 {
 	struct pci_dev *parent;
 	struct pci_dev *parent;
 	u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
 	u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
 	u16 rc_mrrs, ep_mrrs, max_mrrs, ectl;
 	u16 rc_mrrs, ep_mrrs, max_mrrs, ectl;
+	int ret;
 
 
 	/*
 	/*
 	 * Turn on extended tags in DevCtl in case the BIOS has turned it off
 	 * Turn on extended tags in DevCtl in case the BIOS has turned it off
 	 * to improve WFR SDMA bandwidth
 	 * to improve WFR SDMA bandwidth
 	 */
 	 */
-	pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl);
+	ret = pcie_capability_read_word(dd->pcidev,
+					PCI_EXP_DEVCTL, &ectl);
+	if (ret) {
+		dd_dev_err(dd, "Unable to read from PCI config\n");
+		return ret;
+	}
+
 	if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) {
 	if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) {
 		dd_dev_info(dd, "Enabling PCIe extended tags\n");
 		dd_dev_info(dd, "Enabling PCIe extended tags\n");
 		ectl |= PCI_EXP_DEVCTL_EXT_TAG;
 		ectl |= PCI_EXP_DEVCTL_EXT_TAG;
-		pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl);
+		ret = pcie_capability_write_word(dd->pcidev,
+						 PCI_EXP_DEVCTL, ectl);
+		if (ret) {
+			dd_dev_err(dd, "Unable to write to PCI config\n");
+			return ret;
+		}
 	}
 	}
 	/* Find out supported and configured values for parent (root) */
 	/* Find out supported and configured values for parent (root) */
 	parent = dd->pcidev->bus->self;
 	parent = dd->pcidev->bus->self;
@@ -444,14 +537,14 @@ static void tune_pcie_caps(struct hfi1_devdata *dd)
 	 * access to the upstream component.
 	 * access to the upstream component.
 	 */
 	 */
 	if (!parent)
 	if (!parent)
-		return;
+		return -EINVAL;
 	if (!pci_is_root_bus(parent->bus)) {
 	if (!pci_is_root_bus(parent->bus)) {
 		dd_dev_info(dd, "Parent not root\n");
 		dd_dev_info(dd, "Parent not root\n");
-		return;
+		return -EINVAL;
 	}
 	}
 
 
 	if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
 	if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
-		return;
+		return -EINVAL;
 	rc_mpss = parent->pcie_mpss;
 	rc_mpss = parent->pcie_mpss;
 	rc_mps = ffs(pcie_get_mps(parent)) - 8;
 	rc_mps = ffs(pcie_get_mps(parent)) - 8;
 	/* Find out supported and configured values for endpoint (us) */
 	/* Find out supported and configured values for endpoint (us) */
@@ -497,6 +590,8 @@ static void tune_pcie_caps(struct hfi1_devdata *dd)
 		ep_mrrs = max_mrrs;
 		ep_mrrs = max_mrrs;
 		pcie_set_readrq(dd->pcidev, ep_mrrs);
 		pcie_set_readrq(dd->pcidev, ep_mrrs);
 	}
 	}
+
+	return 0;
 }
 }
 
 
 /* End of PCIe capability tuning */
 /* End of PCIe capability tuning */
@@ -728,6 +823,7 @@ static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
 	u32 violation;
 	u32 violation;
 	u32 i;
 	u32 i;
 	u8 c_minus1, c0, c_plus1;
 	u8 c_minus1, c0, c_plus1;
+	int ret;
 
 
 	for (i = 0; i < 11; i++) {
 	for (i = 0; i < 11; i++) {
 		/* set index */
 		/* set index */
@@ -739,8 +835,14 @@ static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
 		pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
 		pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
 				       eq_value(c_minus1, c0, c_plus1));
 				       eq_value(c_minus1, c0, c_plus1));
 		/* check if these coefficients violate EQ rules */
 		/* check if these coefficients violate EQ rules */
-		pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
-				      &violation);
+		ret = pci_read_config_dword(dd->pcidev,
+					    PCIE_CFG_REG_PL105, &violation);
+		if (ret) {
+			dd_dev_err(dd, "Unable to read from PCI config\n");
+			hit_error = 1;
+			break;
+		}
+
 		if (violation
 		if (violation
 		    & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){
 		    & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){
 			if (hit_error == 0) {
 			if (hit_error == 0) {
@@ -1194,7 +1296,13 @@ retry:
 	 * that it is Gen3 capable earlier.
 	 * that it is Gen3 capable earlier.
 	 */
 	 */
 	dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
 	dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
-	pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
+	ret = pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
+	if (ret) {
+		dd_dev_err(dd, "Unable to read from PCI config\n");
+		return_error = 1;
+		goto done;
+	}
+
 	dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
 	dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
 		    (u32)lnkctl2);
 		    (u32)lnkctl2);
 	/* only write to parent if target is not as high as ours */
 	/* only write to parent if target is not as high as ours */
@@ -1203,20 +1311,37 @@ retry:
 		lnkctl2 |= target_vector;
 		lnkctl2 |= target_vector;
 		dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
 		dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
 			    (u32)lnkctl2);
 			    (u32)lnkctl2);
-		pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
+		ret = pcie_capability_write_word(parent,
+						 PCI_EXP_LNKCTL2, lnkctl2);
+		if (ret) {
+			dd_dev_err(dd, "Unable to write to PCI config\n");
+			return_error = 1;
+			goto done;
+		}
 	} else {
 	} else {
 		dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
 		dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
 	}
 	}
 
 
 	dd_dev_info(dd, "%s: setting target link speed\n", __func__);
 	dd_dev_info(dd, "%s: setting target link speed\n", __func__);
-	pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
+	ret = pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
+	if (ret) {
+		dd_dev_err(dd, "Unable to read from PCI config\n");
+		return_error = 1;
+		goto done;
+	}
+
 	dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
 	dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
 		    (u32)lnkctl2);
 		    (u32)lnkctl2);
 	lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
 	lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
 	lnkctl2 |= target_vector;
 	lnkctl2 |= target_vector;
 	dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
 	dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
 		    (u32)lnkctl2);
 		    (u32)lnkctl2);
-	pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
+	ret = pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
+	if (ret) {
+		dd_dev_err(dd, "Unable to write to PCI config\n");
+		return_error = 1;
+		goto done;
+	}
 
 
 	/* step 5h: arm gasket logic */
 	/* step 5h: arm gasket logic */
 	/* hold DC in reset across the SBR */
 	/* hold DC in reset across the SBR */
@@ -1266,7 +1391,14 @@ retry:
 
 
 	/* restore PCI space registers we know were reset */
 	/* restore PCI space registers we know were reset */
 	dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
 	dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
-	restore_pci_variables(dd);
+	ret = restore_pci_variables(dd);
+	if (ret) {
+		dd_dev_err(dd, "%s: Could not restore PCI variables\n",
+			   __func__);
+		return_error = 1;
+		goto done;
+	}
+
 	/* restore firmware control */
 	/* restore firmware control */
 	write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
 	write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
 
 
@@ -1296,7 +1428,13 @@ retry:
 	setextled(dd, 0);
 	setextled(dd, 0);
 
 
 	/* check for any per-lane errors */
 	/* check for any per-lane errors */
-	pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
+	ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
+	if (ret) {
+		dd_dev_err(dd, "Unable to read from PCI config\n");
+		return_error = 1;
+		goto done;
+	}
+
 	dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
 	dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
 
 
 	/* extract status, look for our HFI */
 	/* extract status, look for our HFI */

+ 11 - 4
drivers/infiniband/hw/hfi1/pio.c

@@ -1012,7 +1012,7 @@ static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
 				   "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
 				   "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
 				   __func__, sc->sw_index,
 				   __func__, sc->sw_index,
 				   sc->hw_context, (u32)reg);
 				   sc->hw_context, (u32)reg);
-			queue_work(dd->pport->hfi1_wq,
+			queue_work(dd->pport->link_wq,
 				   &dd->pport->link_bounce_work);
 				   &dd->pport->link_bounce_work);
 			break;
 			break;
 		}
 		}
@@ -1568,7 +1568,8 @@ static void sc_piobufavail(struct send_context *sc)
 	struct rvt_qp *qp;
 	struct rvt_qp *qp;
 	struct hfi1_qp_priv *priv;
 	struct hfi1_qp_priv *priv;
 	unsigned long flags;
 	unsigned long flags;
-	unsigned i, n = 0;
+	uint i, n = 0, max_idx = 0;
+	u8 max_starved_cnt = 0;
 
 
 	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
 	if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
 	    dd->send_contexts[sc->sw_index].type != SC_VL15)
 	    dd->send_contexts[sc->sw_index].type != SC_VL15)
@@ -1591,6 +1592,7 @@ static void sc_piobufavail(struct send_context *sc)
 		priv = qp->priv;
 		priv = qp->priv;
 		list_del_init(&priv->s_iowait.list);
 		list_del_init(&priv->s_iowait.list);
 		priv->s_iowait.lock = NULL;
 		priv->s_iowait.lock = NULL;
+		iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
 		/* refcount held until actual wake up */
 		/* refcount held until actual wake up */
 		qps[n++] = qp;
 		qps[n++] = qp;
 	}
 	}
@@ -1605,9 +1607,14 @@ static void sc_piobufavail(struct send_context *sc)
 	}
 	}
 	write_sequnlock_irqrestore(&dev->iowait_lock, flags);
 	write_sequnlock_irqrestore(&dev->iowait_lock, flags);
 
 
-	for (i = 0; i < n; i++)
-		hfi1_qp_wakeup(qps[i],
+	/* Wake up the most starved one first */
+	if (n)
+		hfi1_qp_wakeup(qps[max_idx],
 			       RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
 			       RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
+	for (i = 0; i < n; i++)
+		if (i != max_idx)
+			hfi1_qp_wakeup(qps[i],
+				       RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
 }
 }
 
 
 /* translate a send credit update to a bit code of reasons */
 /* translate a send credit update to a bit code of reasons */

+ 18 - 26
drivers/infiniband/hw/hfi1/platform.c

@@ -58,8 +58,13 @@ static int validate_scratch_checksum(struct hfi1_devdata *dd)
 	version = (temp_scratch & BITMAP_VERSION_SMASK) >> BITMAP_VERSION_SHIFT;
 	version = (temp_scratch & BITMAP_VERSION_SMASK) >> BITMAP_VERSION_SHIFT;
 
 
 	/* Prevent power on default of all zeroes from passing checksum */
 	/* Prevent power on default of all zeroes from passing checksum */
-	if (!version)
+	if (!version) {
+		dd_dev_err(dd, "%s: Config bitmap uninitialized\n", __func__);
+		dd_dev_err(dd,
+			   "%s: Please update your BIOS to support active channels\n",
+			   __func__);
 		return 0;
 		return 0;
+	}
 
 
 	/*
 	/*
 	 * ASIC scratch 0 only contains the checksum and bitmap version as
 	 * ASIC scratch 0 only contains the checksum and bitmap version as
@@ -84,6 +89,8 @@ static int validate_scratch_checksum(struct hfi1_devdata *dd)
 
 
 	if (checksum + temp_scratch == 0xFFFF)
 	if (checksum + temp_scratch == 0xFFFF)
 		return 1;
 		return 1;
+
+	dd_dev_err(dd, "%s: Configuration bitmap corrupted\n", __func__);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -136,7 +143,6 @@ static void save_platform_config_fields(struct hfi1_devdata *dd)
 void get_platform_config(struct hfi1_devdata *dd)
 void get_platform_config(struct hfi1_devdata *dd)
 {
 {
 	int ret = 0;
 	int ret = 0;
-	unsigned long size = 0;
 	u8 *temp_platform_config = NULL;
 	u8 *temp_platform_config = NULL;
 	u32 esize;
 	u32 esize;
 
 
@@ -145,11 +151,6 @@ void get_platform_config(struct hfi1_devdata *dd)
 			save_platform_config_fields(dd);
 			save_platform_config_fields(dd);
 			return;
 			return;
 		}
 		}
-		dd_dev_err(dd, "%s: Config bitmap corrupted/uninitialized\n",
-			   __func__);
-		dd_dev_err(dd,
-			   "%s: Please update your BIOS to support active channels\n",
-			   __func__);
 	} else {
 	} else {
 		ret = eprom_read_platform_config(dd,
 		ret = eprom_read_platform_config(dd,
 						 (void **)&temp_platform_config,
 						 (void **)&temp_platform_config,
@@ -160,15 +161,6 @@ void get_platform_config(struct hfi1_devdata *dd)
 			dd->platform_config.size = esize;
 			dd->platform_config.size = esize;
 			return;
 			return;
 		}
 		}
-		/* fail, try EFI variable */
-
-		ret = read_hfi1_efi_var(dd, "configuration", &size,
-					(void **)&temp_platform_config);
-		if (!ret) {
-			dd->platform_config.data = temp_platform_config;
-			dd->platform_config.size = size;
-			return;
-		}
 	}
 	}
 	dd_dev_err(dd,
 	dd_dev_err(dd,
 		   "%s: Failed to get platform config, falling back to sub-optimal default file\n",
 		   "%s: Failed to get platform config, falling back to sub-optimal default file\n",
@@ -242,7 +234,7 @@ static int qual_power(struct hfi1_pportdata *ppd)
 
 
 	if (ppd->offline_disabled_reason ==
 	if (ppd->offline_disabled_reason ==
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
-		dd_dev_info(
+		dd_dev_err(
 			ppd->dd,
 			ppd->dd,
 			"%s: Port disabled due to system power restrictions\n",
 			"%s: Port disabled due to system power restrictions\n",
 			__func__);
 			__func__);
@@ -268,7 +260,7 @@ static int qual_bitrate(struct hfi1_pportdata *ppd)
 
 
 	if (ppd->offline_disabled_reason ==
 	if (ppd->offline_disabled_reason ==
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) {
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) {
-		dd_dev_info(
+		dd_dev_err(
 			ppd->dd,
 			ppd->dd,
 			"%s: Cable failed bitrate check, disabling port\n",
 			"%s: Cable failed bitrate check, disabling port\n",
 			__func__);
 			__func__);
@@ -709,15 +701,15 @@ static void apply_tunings(
 		ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
 		ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
 				       GENERAL_CONFIG, config_data);
 				       GENERAL_CONFIG, config_data);
 		if (ret != HCMD_SUCCESS)
 		if (ret != HCMD_SUCCESS)
-			dd_dev_info(ppd->dd,
-				    "%s: Failed set ext device config params\n",
-				    __func__);
+			dd_dev_err(ppd->dd,
+				   "%s: Failed set ext device config params\n",
+				   __func__);
 	}
 	}
 
 
 	if (tx_preset_index == OPA_INVALID_INDEX) {
 	if (tx_preset_index == OPA_INVALID_INDEX) {
 		if (ppd->port_type == PORT_TYPE_QSFP && limiting_active)
 		if (ppd->port_type == PORT_TYPE_QSFP && limiting_active)
-			dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n",
-				    __func__);
+			dd_dev_err(ppd->dd, "%s: Invalid Tx preset index\n",
+				   __func__);
 		return;
 		return;
 	}
 	}
 
 
@@ -900,7 +892,7 @@ static int tune_qsfp(struct hfi1_pportdata *ppd,
 	case 0xD: /* fallthrough */
 	case 0xD: /* fallthrough */
 	case 0xF:
 	case 0xF:
 	default:
 	default:
-		dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n",
+		dd_dev_warn(ppd->dd, "%s: Unknown/unsupported cable\n",
 			    __func__);
 			    __func__);
 		break;
 		break;
 	}
 	}
@@ -942,7 +934,7 @@ void tune_serdes(struct hfi1_pportdata *ppd)
 	case PORT_TYPE_DISCONNECTED:
 	case PORT_TYPE_DISCONNECTED:
 		ppd->offline_disabled_reason =
 		ppd->offline_disabled_reason =
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED);
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED);
-		dd_dev_info(dd, "%s: Port disconnected, disabling port\n",
+		dd_dev_warn(dd, "%s: Port disconnected, disabling port\n",
 			    __func__);
 			    __func__);
 		goto bail;
 		goto bail;
 	case PORT_TYPE_FIXED:
 	case PORT_TYPE_FIXED:
@@ -1027,7 +1019,7 @@ void tune_serdes(struct hfi1_pportdata *ppd)
 		}
 		}
 		break;
 		break;
 	default:
 	default:
-		dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
+		dd_dev_warn(ppd->dd, "%s: Unknown port type\n", __func__);
 		ppd->port_type = PORT_TYPE_UNKNOWN;
 		ppd->port_type = PORT_TYPE_UNKNOWN;
 		tuning_method = OPA_UNKNOWN_TUNING;
 		tuning_method = OPA_UNKNOWN_TUNING;
 		total_atten = 0;
 		total_atten = 0;

+ 11 - 10
drivers/infiniband/hw/hfi1/qp.c

@@ -68,17 +68,12 @@ static int iowait_sleep(
 	struct sdma_engine *sde,
 	struct sdma_engine *sde,
 	struct iowait *wait,
 	struct iowait *wait,
 	struct sdma_txreq *stx,
 	struct sdma_txreq *stx,
-	unsigned seq);
+	unsigned int seq,
+	bool pkts_sent);
 static void iowait_wakeup(struct iowait *wait, int reason);
 static void iowait_wakeup(struct iowait *wait, int reason);
 static void iowait_sdma_drained(struct iowait *wait);
 static void iowait_sdma_drained(struct iowait *wait);
 static void qp_pio_drain(struct rvt_qp *qp);
 static void qp_pio_drain(struct rvt_qp *qp);
 
 
-static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
-			      struct rvt_qpn_map *map, unsigned off)
-{
-	return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
-}
-
 const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
 const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
 [IB_WR_RDMA_WRITE] = {
 [IB_WR_RDMA_WRITE] = {
 	.length = sizeof(struct ib_rdma_wr),
 	.length = sizeof(struct ib_rdma_wr),
@@ -377,7 +372,8 @@ static int iowait_sleep(
 	struct sdma_engine *sde,
 	struct sdma_engine *sde,
 	struct iowait *wait,
 	struct iowait *wait,
 	struct sdma_txreq *stx,
 	struct sdma_txreq *stx,
-	unsigned seq)
+	uint seq,
+	bool pkts_sent)
 {
 {
 	struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
 	struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
 	struct rvt_qp *qp;
 	struct rvt_qp *qp;
@@ -408,7 +404,8 @@ static int iowait_sleep(
 
 
 			ibp->rvp.n_dmawait++;
 			ibp->rvp.n_dmawait++;
 			qp->s_flags |= RVT_S_WAIT_DMA_DESC;
 			qp->s_flags |= RVT_S_WAIT_DMA_DESC;
-			list_add_tail(&priv->s_iowait.list, &sde->dmawait);
+			iowait_queue(pkts_sent, &priv->s_iowait,
+				     &sde->dmawait);
 			priv->s_iowait.lock = &dev->iowait_lock;
 			priv->s_iowait.lock = &dev->iowait_lock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
 			rvt_get_qp(qp);
 			rvt_get_qp(qp);
@@ -607,7 +604,7 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
 	wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 	wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 	send_context = qp_to_send_context(qp, priv->s_sc);
 	send_context = qp_to_send_context(qp, priv->s_sc);
 	seq_printf(s,
 	seq_printf(s,
-		   "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n",
+		   "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n",
 		   iter->n,
 		   iter->n,
 		   qp_idle(qp) ? "I" : "B",
 		   qp_idle(qp) ? "I" : "B",
 		   qp->ibqp.qp_num,
 		   qp->ibqp.qp_num,
@@ -630,6 +627,10 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
 		   qp->s_last, qp->s_acked, qp->s_cur,
 		   qp->s_last, qp->s_acked, qp->s_cur,
 		   qp->s_tail, qp->s_head, qp->s_size,
 		   qp->s_tail, qp->s_head, qp->s_size,
 		   qp->s_avail,
 		   qp->s_avail,
+		   /* ack_queue ring pointers, size */
+		   qp->s_tail_ack_queue, qp->r_head_ack_queue,
+		   HFI1_MAX_RDMA_ATOMIC,
+		   /* remote QP info  */
 		   qp->remote_qpn,
 		   qp->remote_qpn,
 		   rdma_ah_get_dlid(&qp->remote_ah_attr),
 		   rdma_ah_get_dlid(&qp->remote_ah_attr),
 		   rdma_ah_get_sl(&qp->remote_ah_attr),
 		   rdma_ah_get_sl(&qp->remote_ah_attr),

+ 15 - 27
drivers/infiniband/hw/hfi1/rc.c

@@ -765,7 +765,7 @@ void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
 		ohdr->u.aeth = rvt_compute_aeth(qp);
 		ohdr->u.aeth = rvt_compute_aeth(qp);
 	sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
 	sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
 	/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
 	/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-	pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
+	pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 	lrh0 |= (sc5 & 0xf) << 12 | (rdma_ah_get_sl(&qp->remote_ah_attr)
 	lrh0 |= (sc5 & 0xf) << 12 | (rdma_ah_get_sl(&qp->remote_ah_attr)
 				     & 0xf) << 4;
 				     & 0xf) << 4;
 	hdr.lrh[0] = cpu_to_be16(lrh0);
 	hdr.lrh[0] = cpu_to_be16(lrh0);
@@ -798,7 +798,8 @@ void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
 		goto queue_ack;
 		goto queue_ack;
 	}
 	}
 
 
-	trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
+	trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
+			       &hdr, ib_is_sc5(sc5));
 
 
 	/* write the pbc and data */
 	/* write the pbc and data */
 	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
 	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
@@ -1009,7 +1010,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
 		return;
 		return;
 	}
 	}
 
 
-	psn = be32_to_cpu(ohdr->bth[2]);
+	psn = ib_bth_get_psn(ohdr);
 	reset_sending_psn(qp, psn);
 	reset_sending_psn(qp, psn);
 
 
 	/*
 	/*
@@ -1915,17 +1916,16 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
 void hfi1_rc_rcv(struct hfi1_packet *packet)
 void hfi1_rc_rcv(struct hfi1_packet *packet)
 {
 {
 	struct hfi1_ctxtdata *rcd = packet->rcd;
 	struct hfi1_ctxtdata *rcd = packet->rcd;
-	struct ib_header *hdr = packet->hdr;
-	u32 rcv_flags = packet->rcv_flags;
 	void *data = packet->ebuf;
 	void *data = packet->ebuf;
 	u32 tlen = packet->tlen;
 	u32 tlen = packet->tlen;
 	struct rvt_qp *qp = packet->qp;
 	struct rvt_qp *qp = packet->qp;
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct ib_other_headers *ohdr = packet->ohdr;
 	struct ib_other_headers *ohdr = packet->ohdr;
-	u32 bth0, opcode;
+	u32 bth0;
+	u32 opcode = packet->opcode;
 	u32 hdrsize = packet->hlen;
 	u32 hdrsize = packet->hlen;
 	u32 psn;
 	u32 psn;
-	u32 pad;
+	u32 pad = packet->pad;
 	struct ib_wc wc;
 	struct ib_wc wc;
 	u32 pmtu = qp->pmtu;
 	u32 pmtu = qp->pmtu;
 	int diff;
 	int diff;
@@ -1937,14 +1937,13 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
 	u32 rkey;
 	u32 rkey;
 
 
 	lockdep_assert_held(&qp->r_lock);
 	lockdep_assert_held(&qp->r_lock);
+
 	bth0 = be32_to_cpu(ohdr->bth[0]);
 	bth0 = be32_to_cpu(ohdr->bth[0]);
-	if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
+	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;
 		return;
 
 
 	is_fecn = process_ecn(qp, packet, false);
 	is_fecn = process_ecn(qp, packet, false);
-
-	psn = be32_to_cpu(ohdr->bth[2]);
-	opcode = ib_bth_get_opcode(ohdr);
+	psn = ib_bth_get_psn(ohdr);
 
 
 	/*
 	/*
 	 * Process responses (ACKs) before anything else.  Note that the
 	 * Process responses (ACKs) before anything else.  Note that the
@@ -2074,8 +2073,6 @@ no_immediate_data:
 		wc.wc_flags = 0;
 		wc.wc_flags = 0;
 		wc.ex.imm_data = 0;
 		wc.ex.imm_data = 0;
 send_last:
 send_last:
-		/* Get the number of bytes the message was padded by. */
-		pad = ib_bth_get_pad(ohdr);
 		/* Check for invalid length. */
 		/* Check for invalid length. */
 		/* LAST len should be >= 1 */
 		/* LAST len should be >= 1 */
 		if (unlikely(tlen < (hdrsize + pad + 4)))
 		if (unlikely(tlen < (hdrsize + pad + 4)))
@@ -2368,28 +2365,19 @@ send_ack:
 
 
 void hfi1_rc_hdrerr(
 void hfi1_rc_hdrerr(
 	struct hfi1_ctxtdata *rcd,
 	struct hfi1_ctxtdata *rcd,
-	struct ib_header *hdr,
-	u32 rcv_flags,
+	struct hfi1_packet *packet,
 	struct rvt_qp *qp)
 	struct rvt_qp *qp)
 {
 {
-	int has_grh = rcv_flags & HFI1_HAS_GRH;
-	struct ib_other_headers *ohdr;
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	int diff;
 	int diff;
 	u32 opcode;
 	u32 opcode;
-	u32 psn, bth0;
-
-	/* Check for GRH */
-	ohdr = &hdr->u.oth;
-	if (has_grh)
-		ohdr = &hdr->u.l.oth;
+	u32 psn;
 
 
-	bth0 = be32_to_cpu(ohdr->bth[0]);
-	if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
+	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;
 		return;
 
 
-	psn = be32_to_cpu(ohdr->bth[2]);
-	opcode = ib_bth_get_opcode(ohdr);
+	psn = ib_bth_get_psn(packet->ohdr);
+	opcode = ib_bth_get_opcode(packet->ohdr);
 
 
 	/* Only deal with RDMA Writes for now */
 	/* Only deal with RDMA Writes for now */
 	if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
 	if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {

+ 50 - 50
drivers/infiniband/hw/hfi1/ruc.c

@@ -74,8 +74,10 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
 		if (wqe->sg_list[i].length == 0)
 		if (wqe->sg_list[i].length == 0)
 			continue;
 			continue;
 		/* Check LKEY */
 		/* Check LKEY */
-		if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
-				 &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+		ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+				  NULL, &wqe->sg_list[i],
+				  IB_ACCESS_LOCAL_WRITE);
+		if (unlikely(ret <= 0))
 			goto bad_lkey;
 			goto bad_lkey;
 		qp->r_len += wqe->sg_list[i].length;
 		qp->r_len += wqe->sg_list[i].length;
 		j++;
 		j++;
@@ -214,100 +216,95 @@ static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
  *
  *
  * The s_lock will be acquired around the hfi1_migrate_qp() call.
  * The s_lock will be acquired around the hfi1_migrate_qp() call.
  */
  */
-int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct ib_header *hdr,
-		       int has_grh, struct rvt_qp *qp, u32 bth0)
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet)
 {
 {
 	__be64 guid;
 	__be64 guid;
 	unsigned long flags;
 	unsigned long flags;
+	struct rvt_qp *qp = packet->qp;
 	u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
 	u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
-
-	if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
-		if (!has_grh) {
+	u32 dlid = packet->dlid;
+	u32 slid = packet->slid;
+	u32 sl = packet->sl;
+	int migrated;
+	u32 bth0, bth1;
+
+	bth0 = be32_to_cpu(packet->ohdr->bth[0]);
+	bth1 = be32_to_cpu(packet->ohdr->bth[1]);
+	migrated = bth0 & IB_BTH_MIG_REQ;
+
+	if (qp->s_mig_state == IB_MIG_ARMED && migrated) {
+		if (!packet->grh) {
 			if (rdma_ah_get_ah_flags(&qp->alt_ah_attr) &
 			if (rdma_ah_get_ah_flags(&qp->alt_ah_attr) &
 			    IB_AH_GRH)
 			    IB_AH_GRH)
-				goto err;
+				return 1;
 		} else {
 		} else {
 			const struct ib_global_route *grh;
 			const struct ib_global_route *grh;
 
 
 			if (!(rdma_ah_get_ah_flags(&qp->alt_ah_attr) &
 			if (!(rdma_ah_get_ah_flags(&qp->alt_ah_attr) &
 			      IB_AH_GRH))
 			      IB_AH_GRH))
-				goto err;
+				return 1;
 			grh = rdma_ah_read_grh(&qp->alt_ah_attr);
 			grh = rdma_ah_read_grh(&qp->alt_ah_attr);
 			guid = get_sguid(ibp, grh->sgid_index);
 			guid = get_sguid(ibp, grh->sgid_index);
-			if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
+			if (!gid_ok(&packet->grh->dgid, ibp->rvp.gid_prefix,
 				    guid))
 				    guid))
-				goto err;
+				return 1;
 			if (!gid_ok(
 			if (!gid_ok(
-				&hdr->u.l.grh.sgid,
+				&packet->grh->sgid,
 				grh->dgid.global.subnet_prefix,
 				grh->dgid.global.subnet_prefix,
 				grh->dgid.global.interface_id))
 				grh->dgid.global.interface_id))
-				goto err;
+				return 1;
 		}
 		}
-		if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, sc5,
-					    ib_get_slid(hdr)))) {
-			hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-				       (u16)bth0,
-				       ib_get_sl(hdr),
-				       0, qp->ibqp.qp_num,
-				       ib_get_slid(hdr),
-				       ib_get_dlid(hdr));
-			goto err;
+		if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+					    sc5, slid))) {
+			hfi1_bad_pkey(ibp, (u16)bth0, sl,
+				      0, qp->ibqp.qp_num, slid, dlid);
+			return 1;
 		}
 		}
 		/* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
 		/* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
-		if (ib_get_slid(hdr) !=
-			rdma_ah_get_dlid(&qp->alt_ah_attr) ||
+		if (slid != rdma_ah_get_dlid(&qp->alt_ah_attr) ||
 		    ppd_from_ibp(ibp)->port !=
 		    ppd_from_ibp(ibp)->port !=
 			rdma_ah_get_port_num(&qp->alt_ah_attr))
 			rdma_ah_get_port_num(&qp->alt_ah_attr))
-			goto err;
+			return 1;
 		spin_lock_irqsave(&qp->s_lock, flags);
 		spin_lock_irqsave(&qp->s_lock, flags);
 		hfi1_migrate_qp(qp);
 		hfi1_migrate_qp(qp);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 	} else {
 	} else {
-		if (!has_grh) {
+		if (!packet->grh) {
 			if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) &
 			if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) &
 						 IB_AH_GRH)
 						 IB_AH_GRH)
-				goto err;
+				return 1;
 		} else {
 		} else {
 			const struct ib_global_route *grh;
 			const struct ib_global_route *grh;
 
 
 			if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) &
 			if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) &
 						   IB_AH_GRH))
 						   IB_AH_GRH))
-				goto err;
+				return 1;
 			grh = rdma_ah_read_grh(&qp->remote_ah_attr);
 			grh = rdma_ah_read_grh(&qp->remote_ah_attr);
 			guid = get_sguid(ibp, grh->sgid_index);
 			guid = get_sguid(ibp, grh->sgid_index);
-			if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
+			if (!gid_ok(&packet->grh->dgid, ibp->rvp.gid_prefix,
 				    guid))
 				    guid))
-				goto err;
+				return 1;
 			if (!gid_ok(
 			if (!gid_ok(
-			     &hdr->u.l.grh.sgid,
+			     &packet->grh->sgid,
 			     grh->dgid.global.subnet_prefix,
 			     grh->dgid.global.subnet_prefix,
 			     grh->dgid.global.interface_id))
 			     grh->dgid.global.interface_id))
-				goto err;
+				return 1;
 		}
 		}
-		if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, sc5,
-					    ib_get_slid(hdr)))) {
-			hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-				       (u16)bth0,
-				       ib_get_sl(hdr),
-				       0, qp->ibqp.qp_num,
-				       ib_get_slid(hdr),
-				       ib_get_dlid(hdr));
-			goto err;
+		if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+					    sc5, slid))) {
+			hfi1_bad_pkey(ibp, (u16)bth0, sl,
+				      0, qp->ibqp.qp_num, slid, dlid);
+			return 1;
 		}
 		}
 		/* Validate the SLID. See Ch. 9.6.1.5 */
 		/* Validate the SLID. See Ch. 9.6.1.5 */
-		if (ib_get_slid(hdr) !=
-			rdma_ah_get_dlid(&qp->remote_ah_attr) ||
+		if ((slid != rdma_ah_get_dlid(&qp->remote_ah_attr)) ||
 		    ppd_from_ibp(ibp)->port != qp->port_num)
 		    ppd_from_ibp(ibp)->port != qp->port_num)
-			goto err;
-		if (qp->s_mig_state == IB_MIG_REARM &&
-		    !(bth0 & IB_BTH_MIG_REQ))
+			return 1;
+		if (qp->s_mig_state == IB_MIG_REARM && !migrated)
 			qp->s_mig_state = IB_MIG_ARMED;
 			qp->s_mig_state = IB_MIG_ARMED;
 	}
 	}
 
 
 	return 0;
 	return 0;
-
-err:
-	return 1;
 }
 }
 
 
 /**
 /**
@@ -816,6 +813,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
 static bool schedule_send_yield(struct rvt_qp *qp,
 static bool schedule_send_yield(struct rvt_qp *qp,
 				struct hfi1_pkt_state *ps)
 				struct hfi1_pkt_state *ps)
 {
 {
+	ps->pkts_sent = true;
+
 	if (unlikely(time_after(jiffies, ps->timeout))) {
 	if (unlikely(time_after(jiffies, ps->timeout))) {
 		if (!ps->in_thread ||
 		if (!ps->in_thread ||
 		    workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
 		    workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
@@ -912,6 +911,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 	ps.timeout = jiffies + ps.timeout_int;
 	ps.timeout = jiffies + ps.timeout_int;
 	ps.cpu = priv->s_sde ? priv->s_sde->cpu :
 	ps.cpu = priv->s_sde ? priv->s_sde->cpu :
 			cpumask_first(cpumask_of_node(ps.ppd->dd->node));
 			cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+	ps.pkts_sent = false;
 
 
 	/* insure a pre-built packet is handled  */
 	/* insure a pre-built packet is handled  */
 	ps.s_txreq = get_waiting_verbs_txreq(qp);
 	ps.s_txreq = get_waiting_verbs_txreq(qp);
@@ -934,7 +934,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
 			spin_lock_irqsave(&qp->s_lock, ps.flags);
 			spin_lock_irqsave(&qp->s_lock, ps.flags);
 		}
 		}
 	} while (make_req(qp, &ps));
 	} while (make_req(qp, &ps));
-
+	iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
 	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 }
 }
 
 

+ 27 - 15
drivers/infiniband/hw/hfi1/sdma.c

@@ -246,7 +246,7 @@ static void __sdma_process_event(
 	enum sdma_events event);
 	enum sdma_events event);
 static void dump_sdma_state(struct sdma_engine *sde);
 static void dump_sdma_state(struct sdma_engine *sde);
 static void sdma_make_progress(struct sdma_engine *sde, u64 status);
 static void sdma_make_progress(struct sdma_engine *sde, u64 status);
-static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
+static void sdma_desc_avail(struct sdma_engine *sde, uint avail);
 static void sdma_flush_descq(struct sdma_engine *sde);
 static void sdma_flush_descq(struct sdma_engine *sde);
 
 
 /**
 /**
@@ -325,7 +325,7 @@ static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
 			/* timed out - bounce the link */
 			/* timed out - bounce the link */
 			dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
 			dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
 				   __func__, sde->this_idx, (u32)reg);
 				   __func__, sde->this_idx, (u32)reg);
-			queue_work(dd->pport->hfi1_wq,
+			queue_work(dd->pport->link_wq,
 				   &dd->pport->link_bounce_work);
 				   &dd->pport->link_bounce_work);
 			break;
 			break;
 		}
 		}
@@ -1340,10 +1340,8 @@ static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
  * @dd: hfi1_devdata
  * @dd: hfi1_devdata
  * @port: port number (currently only zero)
  * @port: port number (currently only zero)
  *
  *
- * sdma_init initializes the specified number of engines.
- *
- * The code initializes each sde, its csrs.  Interrupts
- * are not required to be enabled.
+ * Initializes each sde and its csrs.
+ * Interrupts are not required to be enabled.
  *
  *
  * Returns:
  * Returns:
  * 0 - success, -errno on failure
  * 0 - success, -errno on failure
@@ -1764,13 +1762,14 @@ retry:
  *
  *
  * This is called with head_lock held.
  * This is called with head_lock held.
  */
  */
-static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
+static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
 {
 {
 	struct iowait *wait, *nw;
 	struct iowait *wait, *nw;
 	struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
 	struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
-	unsigned i, n = 0, seq;
+	uint i, n = 0, seq, max_idx = 0;
 	struct sdma_txreq *stx;
 	struct sdma_txreq *stx;
 	struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
 	struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
+	u8 max_starved_cnt = 0;
 
 
 #ifdef CONFIG_SDMA_VERBOSITY
 #ifdef CONFIG_SDMA_VERBOSITY
 	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
 	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
@@ -1805,6 +1804,9 @@ static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
 				if (num_desc > avail)
 				if (num_desc > avail)
 					break;
 					break;
 				avail -= num_desc;
 				avail -= num_desc;
+				/* Find the most starved wait member */
+				iowait_starve_find_max(wait, &max_starved_cnt,
+						       n, &max_idx);
 				list_del_init(&wait->list);
 				list_del_init(&wait->list);
 				waits[n++] = wait;
 				waits[n++] = wait;
 			}
 			}
@@ -1813,8 +1815,13 @@ static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
 		}
 		}
 	} while (read_seqretry(&dev->iowait_lock, seq));
 	} while (read_seqretry(&dev->iowait_lock, seq));
 
 
+	/* Schedule the most starved one first */
+	if (n)
+		waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON);
+
 	for (i = 0; i < n; i++)
 	for (i = 0; i < n; i++)
-		waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
+		if (i != max_idx)
+			waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
 }
 }
 
 
 /* head_lock must be held */
 /* head_lock must be held */
@@ -2351,7 +2358,8 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
 static int sdma_check_progress(
 static int sdma_check_progress(
 	struct sdma_engine *sde,
 	struct sdma_engine *sde,
 	struct iowait *wait,
 	struct iowait *wait,
-	struct sdma_txreq *tx)
+	struct sdma_txreq *tx,
+	bool pkts_sent)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -2364,7 +2372,7 @@ static int sdma_check_progress(
 
 
 		seq = raw_seqcount_begin(
 		seq = raw_seqcount_begin(
 			(const seqcount_t *)&sde->head_lock.seqcount);
 			(const seqcount_t *)&sde->head_lock.seqcount);
-		ret = wait->sleep(sde, wait, tx, seq);
+		ret = wait->sleep(sde, wait, tx, seq, pkts_sent);
 		if (ret == -EAGAIN)
 		if (ret == -EAGAIN)
 			sde->desc_avail = sdma_descq_freecnt(sde);
 			sde->desc_avail = sdma_descq_freecnt(sde);
 	} else {
 	} else {
@@ -2378,6 +2386,7 @@ static int sdma_check_progress(
  * @sde: sdma engine to use
  * @sde: sdma engine to use
  * @wait: wait structure to use when full (may be NULL)
  * @wait: wait structure to use when full (may be NULL)
  * @tx: sdma_txreq to submit
  * @tx: sdma_txreq to submit
+ * @pkts_sent: has any packet been sent yet?
  *
  *
  * The call submits the tx into the ring.  If a iowait structure is non-NULL
  * The call submits the tx into the ring.  If a iowait structure is non-NULL
  * the packet will be queued to the list in wait.
  * the packet will be queued to the list in wait.
@@ -2389,7 +2398,8 @@ static int sdma_check_progress(
  */
  */
 int sdma_send_txreq(struct sdma_engine *sde,
 int sdma_send_txreq(struct sdma_engine *sde,
 		    struct iowait *wait,
 		    struct iowait *wait,
-		    struct sdma_txreq *tx)
+		    struct sdma_txreq *tx,
+		    bool pkts_sent)
 {
 {
 	int ret = 0;
 	int ret = 0;
 	u16 tail;
 	u16 tail;
@@ -2431,7 +2441,7 @@ unlock_noconn:
 	ret = -ECOMM;
 	ret = -ECOMM;
 	goto unlock;
 	goto unlock;
 nodesc:
 nodesc:
-	ret = sdma_check_progress(sde, wait, tx);
+	ret = sdma_check_progress(sde, wait, tx, pkts_sent);
 	if (ret == -EAGAIN) {
 	if (ret == -EAGAIN) {
 		ret = 0;
 		ret = 0;
 		goto retry;
 		goto retry;
@@ -2500,8 +2510,10 @@ retry:
 	}
 	}
 update_tail:
 update_tail:
 	total_count = submit_count + flush_count;
 	total_count = submit_count + flush_count;
-	if (wait)
+	if (wait) {
 		iowait_sdma_add(wait, total_count);
 		iowait_sdma_add(wait, total_count);
+		iowait_starve_clear(submit_count > 0, wait);
+	}
 	if (tail != INVALID_TAIL)
 	if (tail != INVALID_TAIL)
 		sdma_update_tail(sde, tail);
 		sdma_update_tail(sde, tail);
 	spin_unlock_irqrestore(&sde->tail_lock, flags);
 	spin_unlock_irqrestore(&sde->tail_lock, flags);
@@ -2529,7 +2541,7 @@ unlock_noconn:
 	ret = -ECOMM;
 	ret = -ECOMM;
 	goto update_tail;
 	goto update_tail;
 nodesc:
 nodesc:
-	ret = sdma_check_progress(sde, wait, tx);
+	ret = sdma_check_progress(sde, wait, tx, submit_count > 0);
 	if (ret == -EAGAIN) {
 	if (ret == -EAGAIN) {
 		ret = 0;
 		ret = 0;
 		goto retry;
 		goto retry;

+ 2 - 1
drivers/infiniband/hw/hfi1/sdma.h

@@ -852,7 +852,8 @@ struct iowait;
 
 
 int sdma_send_txreq(struct sdma_engine *sde,
 int sdma_send_txreq(struct sdma_engine *sde,
 		    struct iowait *wait,
 		    struct iowait *wait,
-		    struct sdma_txreq *tx);
+		    struct sdma_txreq *tx,
+		    bool pkts_sent);
 int sdma_send_txlist(struct sdma_engine *sde,
 int sdma_send_txlist(struct sdma_engine *sde,
 		     struct iowait *wait,
 		     struct iowait *wait,
 		     struct list_head *tx_list,
 		     struct list_head *tx_list,

+ 50 - 8
drivers/infiniband/hw/hfi1/trace.c

@@ -47,7 +47,7 @@
 #define CREATE_TRACE_POINTS
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 #include "trace.h"
 
 
-u8 ibhdr_exhdr_len(struct ib_header *hdr)
+u8 hfi1_trace_ib_hdr_len(struct ib_header *hdr)
 {
 {
 	struct ib_other_headers *ohdr;
 	struct ib_other_headers *ohdr;
 	u8 opcode;
 	u8 opcode;
@@ -61,13 +61,18 @@ u8 ibhdr_exhdr_len(struct ib_header *hdr)
 	       0 : hdr_len_by_opcode[opcode] - (12 + 8);
 	       0 : hdr_len_by_opcode[opcode] - (12 + 8);
 }
 }
 
 
-#define IMM_PRN  "imm %d"
-#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
-#define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x"
-#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
-#define IETH_PRN "ieth rkey 0x%.8x"
-#define ATOMICACKETH_PRN "origdata %llx"
-#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %llx cdata %llx"
+const char *hfi1_trace_get_packet_str(struct hfi1_packet *packet)
+{
+	return "IB";
+}
+
+#define IMM_PRN  "imm:%d"
+#define RETH_PRN "reth vaddr:0x%.16llx rkey:0x%.8x dlen:0x%.8x"
+#define AETH_PRN "aeth syn:0x%.2x %s msn:0x%.8x"
+#define DETH_PRN "deth qkey:0x%.8x sqpn:0x%.6x"
+#define IETH_PRN "ieth rkey:0x%.8x"
+#define ATOMICACKETH_PRN "origdata:%llx"
+#define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx"
 
 
 #define OP(transport, op) IB_OPCODE_## transport ## _ ## op
 #define OP(transport, op) IB_OPCODE_## transport ## _ ## op
 
 
@@ -84,6 +89,43 @@ static const char *parse_syndrome(u8 syndrome)
 	return "";
 	return "";
 }
 }
 
 
+void hfi1_trace_parse_bth(struct ib_other_headers *ohdr,
+			  u8 *ack, u8 *becn, u8 *fecn, u8 *mig,
+			  u8 *se, u8 *pad, u8 *opcode, u8 *tver,
+			  u16 *pkey, u32 *psn, u32 *qpn)
+{
+	*ack = ib_bth_get_ackreq(ohdr);
+	*becn = ib_bth_get_becn(ohdr);
+	*fecn = ib_bth_get_fecn(ohdr);
+	*mig = ib_bth_get_migreq(ohdr);
+	*se = ib_bth_get_se(ohdr);
+	*pad = ib_bth_get_pad(ohdr);
+	*opcode = ib_bth_get_opcode(ohdr);
+	*tver = ib_bth_get_tver(ohdr);
+	*pkey = ib_bth_get_pkey(ohdr);
+	*psn = ib_bth_get_psn(ohdr);
+	*qpn = ib_bth_get_qpn(ohdr);
+}
+
+void hfi1_trace_parse_9b_hdr(struct ib_header *hdr, bool sc5,
+			     struct ib_other_headers **ohdr,
+			     u8 *lnh, u8 *lver, u8 *sl, u8 *sc,
+			     u16 *len, u32 *dlid, u32 *slid)
+{
+	*lnh = ib_get_lnh(hdr);
+	*lver = ib_get_lver(hdr);
+	*sl = ib_get_sl(hdr);
+	*sc = ib_get_sc(hdr) | (sc5 << 4);
+	*len = ib_get_len(hdr);
+	*dlid = ib_get_dlid(hdr);
+	*slid = ib_get_slid(hdr);
+
+	if (*lnh == HFI1_LRH_BTH)
+		*ohdr = &hdr->u.oth;
+	else
+		*ohdr = &hdr->u.l.oth;
+}
+
 const char *parse_everbs_hdrs(
 const char *parse_everbs_hdrs(
 	struct trace_seq *p,
 	struct trace_seq *p,
 	u8 opcode,
 	u8 opcode,

+ 215 - 107
drivers/infiniband/hw/hfi1/trace_ibhdrs.h

@@ -55,8 +55,57 @@
 #undef TRACE_SYSTEM
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM hfi1_ibhdrs
 #define TRACE_SYSTEM hfi1_ibhdrs
 
 
-u8 ibhdr_exhdr_len(struct ib_header *hdr);
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
+#define show_ib_opcode(opcode)                             \
+__print_symbolic(opcode,                                   \
+	ib_opcode_name(RC_SEND_FIRST),                     \
+	ib_opcode_name(RC_SEND_MIDDLE),                    \
+	ib_opcode_name(RC_SEND_LAST),                      \
+	ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
+	ib_opcode_name(RC_SEND_ONLY),                      \
+	ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
+	ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
+	ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
+	ib_opcode_name(RC_RDMA_WRITE_LAST),                \
+	ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+	ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
+	ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+	ib_opcode_name(RC_RDMA_READ_REQUEST),              \
+	ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
+	ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
+	ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
+	ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
+	ib_opcode_name(RC_ACKNOWLEDGE),                    \
+	ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
+	ib_opcode_name(RC_COMPARE_SWAP),                   \
+	ib_opcode_name(RC_FETCH_ADD),                      \
+	ib_opcode_name(UC_SEND_FIRST),                     \
+	ib_opcode_name(UC_SEND_MIDDLE),                    \
+	ib_opcode_name(UC_SEND_LAST),                      \
+	ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
+	ib_opcode_name(UC_SEND_ONLY),                      \
+	ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
+	ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
+	ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
+	ib_opcode_name(UC_RDMA_WRITE_LAST),                \
+	ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+	ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
+	ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+	ib_opcode_name(UD_SEND_ONLY),                      \
+	ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
+	ib_opcode_name(CNP))
+
 const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
 const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
+u8 hfi1_trace_ib_hdr_len(struct ib_header *hdr);
+const char *hfi1_trace_get_packet_str(struct hfi1_packet *packet);
+void hfi1_trace_parse_bth(struct ib_other_headers *ohdr,
+			  u8 *ack, u8 *becn, u8 *fecn, u8 *mig,
+			  u8 *se, u8 *pad, u8 *opcode, u8 *tver,
+			  u16 *pkey, u32 *psn, u32 *qpn);
+void hfi1_trace_parse_9b_hdr(struct ib_header *hdr, bool sc5,
+			     struct ib_other_headers **ohdr,
+			     u8 *lnh, u8 *lver, u8 *sl, u8 *sc,
+			     u16 *len, u32 *dlid, u32 *slid);
 
 
 #define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
 #define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
 
 
@@ -66,139 +115,198 @@ __print_symbolic(lrh,                    \
 	lrh_name(LRH_BTH),               \
 	lrh_name(LRH_BTH),               \
 	lrh_name(LRH_GRH))
 	lrh_name(LRH_GRH))
 
 
-#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define LRH_PRN "len:%d sc:%d dlid:0x%.4x slid:0x%.4x"
+#define LRH_9B_PRN "lnh:%d,%s lver:%d sl:%d "
 #define BTH_PRN \
 #define BTH_PRN \
-	"op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
-	"f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
-#define EHDR_PRN "%s"
+	"op:0x%.2x,%s se:%d m:%d pad:%d tver:%d pkey:0x%.4x " \
+	"f:%d b:%d qpn:0x%.6x a:%d psn:0x%.8x"
+#define EHDR_PRN "hlen:%d %s"
 
 
-DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template,
 		    TP_PROTO(struct hfi1_devdata *dd,
 		    TP_PROTO(struct hfi1_devdata *dd,
-			     struct ib_header *hdr),
-		    TP_ARGS(dd, hdr),
+			     struct hfi1_packet *packet,
+			     bool sc5),
+		    TP_ARGS(dd, packet, sc5),
 		    TP_STRUCT__entry(
 		    TP_STRUCT__entry(
 			DD_DEV_ENTRY(dd)
 			DD_DEV_ENTRY(dd)
-			/* LRH */
-			__field(u8, vl)
+			__field(u8, lnh)
 			__field(u8, lver)
 			__field(u8, lver)
 			__field(u8, sl)
 			__field(u8, sl)
+			__field(u16, len)
+			__field(u32, dlid)
+			__field(u8, sc)
+			__field(u32, slid)
+			__field(u8, opcode)
+			__field(u8, se)
+			__field(u8, mig)
+			__field(u8, pad)
+			__field(u8, tver)
+			__field(u16, pkey)
+			__field(u8, fecn)
+			__field(u8, becn)
+			__field(u32, qpn)
+			__field(u8, ack)
+			__field(u32, psn)
+			/* extended headers */
+			__dynamic_array(u8, ehdrs,
+					hfi1_trace_ib_hdr_len(packet->hdr))
+			),
+		    TP_fast_assign(
+			   struct ib_other_headers *ohdr;
+
+			   DD_DEV_ASSIGN(dd);
+
+			   hfi1_trace_parse_9b_hdr(packet->hdr, sc5,
+						   &ohdr,
+						   &__entry->lnh,
+						   &__entry->lver,
+						   &__entry->sl,
+						   &__entry->sc,
+						   &__entry->len,
+						   &__entry->dlid,
+						   &__entry->slid);
+
+			  hfi1_trace_parse_bth(ohdr, &__entry->ack,
+					       &__entry->becn, &__entry->fecn,
+					       &__entry->mig, &__entry->se,
+					       &__entry->pad, &__entry->opcode,
+					       &__entry->tver, &__entry->pkey,
+					       &__entry->psn, &__entry->qpn);
+			  /* extended headers */
+			  memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
+				 __get_dynamic_array_len(ehdrs));
+			 ),
+		    TP_printk("[%s] (IB) " LRH_PRN " " LRH_9B_PRN " "
+			      BTH_PRN " " EHDR_PRN,
+			      __get_str(dev),
+			      __entry->len,
+			      __entry->sc,
+			      __entry->dlid,
+			      __entry->slid,
+			      __entry->lnh, show_lnh(__entry->lnh),
+			      __entry->lver,
+			      __entry->sl,
+			      /* BTH */
+			      __entry->opcode, show_ib_opcode(__entry->opcode),
+			      __entry->se,
+			      __entry->mig,
+			      __entry->pad,
+			      __entry->tver,
+			      __entry->pkey,
+			      __entry->fecn,
+			      __entry->becn,
+			      __entry->qpn,
+			      __entry->ack,
+			      __entry->psn,
+			      /* extended headers */
+			      __get_dynamic_array_len(ehdrs),
+			      __parse_ib_ehdrs(
+					__entry->opcode,
+					(void *)__get_dynamic_array(ehdrs))
+			     )
+);
+
+DEFINE_EVENT(hfi1_input_ibhdr_template, input_ibhdr,
+	     TP_PROTO(struct hfi1_devdata *dd,
+		      struct hfi1_packet *packet, bool sc5),
+	     TP_ARGS(dd, packet, sc5));
+
+DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template,
+		    TP_PROTO(struct hfi1_devdata *dd,
+			     struct ib_header *hdr,
+			     bool sc5),
+		    TP_ARGS(dd, hdr, sc5),
+		    TP_STRUCT__entry(
+			DD_DEV_ENTRY(dd)
 			__field(u8, lnh)
 			__field(u8, lnh)
-			__field(u16, dlid)
+			__field(u8, lver)
+			__field(u8, sl)
 			__field(u16, len)
 			__field(u16, len)
-			__field(u16, slid)
-			/* BTH */
+			__field(u32, dlid)
+			__field(u8, sc)
+			__field(u32, slid)
 			__field(u8, opcode)
 			__field(u8, opcode)
 			__field(u8, se)
 			__field(u8, se)
-			__field(u8, m)
+			__field(u8, mig)
 			__field(u8, pad)
 			__field(u8, pad)
 			__field(u8, tver)
 			__field(u8, tver)
 			__field(u16, pkey)
 			__field(u16, pkey)
-			__field(u8, f)
-			__field(u8, b)
+			__field(u8, fecn)
+			__field(u8, becn)
 			__field(u32, qpn)
 			__field(u32, qpn)
-			__field(u8, a)
+			__field(u8, ack)
 			__field(u32, psn)
 			__field(u32, psn)
 			/* extended headers */
 			/* extended headers */
-			__dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+			__dynamic_array(u8, ehdrs,
+					hfi1_trace_ib_hdr_len(hdr))
 			),
 			),
-		      TP_fast_assign(
+		    TP_fast_assign(
 			struct ib_other_headers *ohdr;
 			struct ib_other_headers *ohdr;
 
 
 			DD_DEV_ASSIGN(dd);
 			DD_DEV_ASSIGN(dd);
-			/* LRH */
-			__entry->vl =
-			(u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
-			__entry->lver =
-			(u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
-			__entry->sl =
-			(u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-			__entry->lnh =
-			(u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-			__entry->dlid =
-			be16_to_cpu(hdr->lrh[1]);
-			/* allow for larger len */
-			__entry->len =
-			be16_to_cpu(hdr->lrh[2]);
-			__entry->slid =
-			be16_to_cpu(hdr->lrh[3]);
-			/* BTH */
-			if (__entry->lnh == HFI1_LRH_BTH)
-			ohdr = &hdr->u.oth;
-			else
-			ohdr = &hdr->u.l.oth;
-			__entry->opcode =
-			(be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-			__entry->se =
-			(be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
-			__entry->m =
-			(be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
-			__entry->pad =
-			(be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-			__entry->tver =
-			(be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
-			__entry->pkey =
-			be32_to_cpu(ohdr->bth[0]) & 0xffff;
-			__entry->f =
-			(be32_to_cpu(ohdr->bth[1]) >> IB_FECN_SHIFT) &
-			IB_FECN_MASK;
-			__entry->b =
-			(be32_to_cpu(ohdr->bth[1]) >> IB_BECN_SHIFT) &
-			IB_BECN_MASK;
-			__entry->qpn =
-			be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-			__entry->a =
-			(be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
-			/* allow for larger PSN */
-			__entry->psn =
-			be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
+
+			hfi1_trace_parse_9b_hdr(hdr, sc5,
+						&ohdr, &__entry->lnh,
+						&__entry->lver, &__entry->sl,
+						&__entry->sc, &__entry->len,
+						&__entry->dlid, &__entry->slid);
+
+			hfi1_trace_parse_bth(ohdr, &__entry->ack,
+					     &__entry->becn, &__entry->fecn,
+					     &__entry->mig, &__entry->se,
+					     &__entry->pad, &__entry->opcode,
+					     &__entry->tver, &__entry->pkey,
+					     &__entry->psn, &__entry->qpn);
+
 			/* extended headers */
 			/* extended headers */
-			memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
-			       ibhdr_exhdr_len(hdr));
-			),
-		TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
-			  __get_str(dev),
-			  /* LRH */
-			  __entry->vl,
-			  __entry->lver,
-			  __entry->sl,
-			  __entry->lnh, show_lnh(__entry->lnh),
-			  __entry->dlid,
-			  __entry->len,
-			  __entry->slid,
-			  /* BTH */
-			  __entry->opcode, show_ib_opcode(__entry->opcode),
-			  __entry->se,
-			  __entry->m,
-			  __entry->pad,
-			  __entry->tver,
-			  __entry->pkey,
-			  __entry->f,
-			  __entry->b,
-			  __entry->qpn,
-			  __entry->a,
-			  __entry->psn,
-			  /* extended headers */
-			  __parse_ib_ehdrs(
-				__entry->opcode,
-				(void *)__get_dynamic_array(ehdrs))
-			)
+			memcpy(__get_dynamic_array(ehdrs),
+			       &ohdr->u, __get_dynamic_array_len(ehdrs));
+		    ),
+		    TP_printk("[%s] (IB) " LRH_PRN " " LRH_9B_PRN " "
+			      BTH_PRN " " EHDR_PRN,
+			      __get_str(dev),
+			      __entry->len,
+			      __entry->sc,
+			      __entry->dlid,
+			      __entry->slid,
+			      __entry->lnh, show_lnh(__entry->lnh),
+			      __entry->lver,
+			      __entry->sl,
+			      /* BTH */
+			      __entry->opcode, show_ib_opcode(__entry->opcode),
+			      __entry->se,
+			      __entry->mig,
+			      __entry->pad,
+			      __entry->tver,
+			      __entry->pkey,
+			      __entry->fecn,
+			      __entry->becn,
+			      __entry->qpn,
+			      __entry->ack,
+			      __entry->psn,
+			      /* extended headers */
+			      __get_dynamic_array_len(ehdrs),
+			      __parse_ib_ehdrs(
+					__entry->opcode,
+					(void *)__get_dynamic_array(ehdrs))
+			     )
 );
 );
 
 
-DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
-	     TP_PROTO(struct hfi1_devdata *dd, struct ib_header *hdr),
-	     TP_ARGS(dd, hdr));
+DEFINE_EVENT(hfi1_output_ibhdr_template, pio_output_ibhdr,
+	     TP_PROTO(struct hfi1_devdata *dd,
+		      struct ib_header *hdr, bool sc5),
+	     TP_ARGS(dd, hdr, sc5));
 
 
-DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
-	     TP_PROTO(struct hfi1_devdata *dd, struct ib_header *hdr),
-	     TP_ARGS(dd, hdr));
+DEFINE_EVENT(hfi1_output_ibhdr_template, ack_output_ibhdr,
+	     TP_PROTO(struct hfi1_devdata *dd,
+		      struct ib_header *hdr, bool sc5),
+	     TP_ARGS(dd, hdr, sc5));
 
 
-DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
-	     TP_PROTO(struct hfi1_devdata *dd, struct ib_header *hdr),
-	     TP_ARGS(dd, hdr));
+DEFINE_EVENT(hfi1_output_ibhdr_template, sdma_output_ibhdr,
+	     TP_PROTO(struct hfi1_devdata *dd,
+		      struct ib_header *hdr, bool sc5),
+	     TP_ARGS(dd, hdr, sc5));
 
 
-DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
-	     TP_PROTO(struct hfi1_devdata *dd, struct ib_header *hdr),
-	     TP_ARGS(dd, hdr));
 
 
 #endif /* __HFI1_TRACE_IBHDRS_H */
 #endif /* __HFI1_TRACE_IBHDRS_H */
 
 

+ 20 - 0
drivers/infiniband/hw/hfi1/trace_misc.h

@@ -72,6 +72,26 @@ TRACE_EVENT(hfi1_interrupt,
 		      __entry->src)
 		      __entry->src)
 );
 );
 
 
+DECLARE_EVENT_CLASS(
+	hfi1_csr_template,
+	TP_PROTO(void __iomem *addr, u64 value),
+	TP_ARGS(addr, value),
+	TP_STRUCT__entry(
+		__field(void __iomem *, addr)
+		__field(u64, value)
+	),
+	TP_fast_assign(
+		__entry->addr = addr;
+		__entry->value = value;
+	),
+	TP_printk("addr %p value %llx", __entry->addr, __entry->value)
+);
+
+DEFINE_EVENT(
+	hfi1_csr_template, hfi1_write_rcvarray,
+	TP_PROTO(void __iomem *addr, u64 value),
+	TP_ARGS(addr, value));
+
 #ifdef CONFIG_FAULT_INJECTION
 #ifdef CONFIG_FAULT_INJECTION
 TRACE_EVENT(hfi1_fault_opcode,
 TRACE_EVENT(hfi1_fault_opcode,
 	    TP_PROTO(struct rvt_qp *qp, u8 opcode),
 	    TP_PROTO(struct rvt_qp *qp, u8 opcode),

+ 58 - 34
drivers/infiniband/hw/hfi1/trace_rx.h

@@ -52,9 +52,25 @@
 
 
 #include "hfi.h"
 #include "hfi.h"
 
 
+#define tidtype_name(type) { PT_##type, #type }
+#define show_tidtype(type)                   \
+__print_symbolic(type,                       \
+	tidtype_name(EXPECTED),              \
+	tidtype_name(EAGER),                 \
+	tidtype_name(INVALID))               \
+
 #undef TRACE_SYSTEM
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM hfi1_rx
 #define TRACE_SYSTEM hfi1_rx
 
 
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype)                  \
+__print_symbolic(etype,                         \
+	packettype_name(EXPECTED),              \
+	packettype_name(EAGER),                 \
+	packettype_name(IB),                    \
+	packettype_name(ERROR),                 \
+	packettype_name(BYPASS))
+
 TRACE_EVENT(hfi1_rcvhdr,
 TRACE_EVENT(hfi1_rcvhdr,
 	    TP_PROTO(struct hfi1_devdata *dd,
 	    TP_PROTO(struct hfi1_devdata *dd,
 		     u32 ctxt,
 		     u32 ctxt,
@@ -98,7 +114,7 @@ TRACE_EVENT(hfi1_rcvhdr,
 );
 );
 
 
 TRACE_EVENT(hfi1_receive_interrupt,
 TRACE_EVENT(hfi1_receive_interrupt,
-	    TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+	    TP_PROTO(struct hfi1_devdata *dd, u16 ctxt),
 	    TP_ARGS(dd, ctxt),
 	    TP_ARGS(dd, ctxt),
 	    TP_STRUCT__entry(DD_DEV_ENTRY(dd)
 	    TP_STRUCT__entry(DD_DEV_ENTRY(dd)
 			     __field(u32, ctxt)
 			     __field(u32, ctxt)
@@ -129,7 +145,8 @@ TRACE_EVENT(hfi1_receive_interrupt,
 		      )
 		      )
 );
 );
 
 
-TRACE_EVENT(hfi1_exp_tid_reg,
+DECLARE_EVENT_CLASS(
+	    hfi1_exp_tid_reg_unreg,
 	    TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
 	    TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
 		     u32 npages, unsigned long va, unsigned long pa,
 		     u32 npages, unsigned long va, unsigned long pa,
 		     dma_addr_t dma),
 		     dma_addr_t dma),
@@ -163,38 +180,45 @@ TRACE_EVENT(hfi1_exp_tid_reg,
 		      )
 		      )
 	);
 	);
 
 
-TRACE_EVENT(hfi1_exp_tid_unreg,
-	    TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
-		     unsigned long va, unsigned long pa, dma_addr_t dma),
-	    TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-	    TP_STRUCT__entry(
-			     __field(unsigned int, ctxt)
-			     __field(u16, subctxt)
-			     __field(u32, rarr)
-			     __field(u32, npages)
-			     __field(unsigned long, va)
-			     __field(unsigned long, pa)
-			     __field(dma_addr_t, dma)
-			     ),
-	    TP_fast_assign(
-			   __entry->ctxt = ctxt;
-			   __entry->subctxt = subctxt;
-			   __entry->rarr = rarr;
-			   __entry->npages = npages;
-			   __entry->va = va;
-			   __entry->pa = pa;
-			   __entry->dma = dma;
-			   ),
-	    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-		      __entry->ctxt,
-		      __entry->subctxt,
-		      __entry->rarr,
-		      __entry->npages,
-		      __entry->pa,
-		      __entry->va,
-		      __entry->dma
-		      )
-	);
+DEFINE_EVENT(
+	hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+		 unsigned long va, unsigned long pa, dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
+
+DEFINE_EVENT(
+	hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
+	TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+		 unsigned long va, unsigned long pa, dma_addr_t dma),
+	TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
+
+TRACE_EVENT(
+	hfi1_put_tid,
+	TP_PROTO(struct hfi1_devdata *dd,
+		 u32 index, u32 type, unsigned long pa, u16 order),
+	TP_ARGS(dd, index, type, pa, order),
+	TP_STRUCT__entry(
+		DD_DEV_ENTRY(dd)
+		__field(unsigned long, pa);
+		__field(u32, index);
+		__field(u32, type);
+		__field(u16, order);
+	),
+	TP_fast_assign(
+		DD_DEV_ASSIGN(dd);
+		__entry->pa = pa;
+		__entry->index = index;
+		__entry->type = type;
+		__entry->order = order;
+	),
+	TP_printk("[%s] type %s pa %lx index %u order %u",
+		  __get_str(dev),
+		  show_tidtype(__entry->type),
+		  __entry->pa,
+		  __entry->index,
+		  __entry->order
+	)
+);
 
 
 TRACE_EVENT(hfi1_exp_tid_inval,
 TRACE_EVENT(hfi1_exp_tid_inval,
 	    TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
 	    TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,

+ 4 - 14
drivers/infiniband/hw/hfi1/uc.c

@@ -297,31 +297,25 @@ bail_no_tx:
 void hfi1_uc_rcv(struct hfi1_packet *packet)
 void hfi1_uc_rcv(struct hfi1_packet *packet)
 {
 {
 	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
 	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
-	struct ib_header *hdr = packet->hdr;
-	u32 rcv_flags = packet->rcv_flags;
 	void *data = packet->ebuf;
 	void *data = packet->ebuf;
 	u32 tlen = packet->tlen;
 	u32 tlen = packet->tlen;
 	struct rvt_qp *qp = packet->qp;
 	struct rvt_qp *qp = packet->qp;
 	struct ib_other_headers *ohdr = packet->ohdr;
 	struct ib_other_headers *ohdr = packet->ohdr;
-	u32 bth0, opcode;
+	u32 opcode = packet->opcode;
 	u32 hdrsize = packet->hlen;
 	u32 hdrsize = packet->hlen;
 	u32 psn;
 	u32 psn;
-	u32 pad;
+	u32 pad = packet->pad;
 	struct ib_wc wc;
 	struct ib_wc wc;
 	u32 pmtu = qp->pmtu;
 	u32 pmtu = qp->pmtu;
 	struct ib_reth *reth;
 	struct ib_reth *reth;
-	int has_grh = rcv_flags & HFI1_HAS_GRH;
 	int ret;
 	int ret;
 
 
-	bth0 = be32_to_cpu(ohdr->bth[0]);
-	if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
+	if (hfi1_ruc_check_hdr(ibp, packet))
 		return;
 		return;
 
 
 	process_ecn(qp, packet, true);
 	process_ecn(qp, packet, true);
 
 
-	psn = be32_to_cpu(ohdr->bth[2]);
-	opcode = ib_bth_get_opcode(ohdr);
-
+	psn = ib_bth_get_psn(ohdr);
 	/* Compare the PSN verses the expected PSN. */
 	/* Compare the PSN verses the expected PSN. */
 	if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
 	if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
 		/*
 		/*
@@ -432,8 +426,6 @@ no_immediate_data:
 		wc.ex.imm_data = 0;
 		wc.ex.imm_data = 0;
 		wc.wc_flags = 0;
 		wc.wc_flags = 0;
 send_last:
 send_last:
-		/* Get the number of bytes the message was padded by. */
-		pad = ib_bth_get_pad(ohdr);
 		/* Check for invalid length. */
 		/* Check for invalid length. */
 		/* LAST len should be >= 1 */
 		/* LAST len should be >= 1 */
 		if (unlikely(tlen < (hdrsize + pad + 4)))
 		if (unlikely(tlen < (hdrsize + pad + 4)))
@@ -527,8 +519,6 @@ rdma_first:
 rdma_last_imm:
 rdma_last_imm:
 		wc.wc_flags = IB_WC_WITH_IMM;
 		wc.wc_flags = IB_WC_WITH_IMM;
 
 
-		/* Get the number of bytes the message was padded by. */
-		pad = ib_bth_get_pad(ohdr);
 		/* Check for invalid length. */
 		/* Check for invalid length. */
 		/* LAST len should be >= 1 */
 		/* LAST len should be >= 1 */
 		if (unlikely(tlen < (hdrsize + pad + 4)))
 		if (unlikely(tlen < (hdrsize + pad + 4)))

+ 23 - 41
drivers/infiniband/hw/hfi1/ud.c

@@ -110,10 +110,10 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 				   ((1 << ppd->lmc) - 1));
 				   ((1 << ppd->lmc) - 1));
 		if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
 		if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
 						qp->s_pkey_index, slid))) {
 						qp->s_pkey_index, slid))) {
-			hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, pkey,
-				       rdma_ah_get_sl(ah_attr),
-				       sqp->ibqp.qp_num, qp->ibqp.qp_num,
-				       slid, rdma_ah_get_dlid(ah_attr));
+			hfi1_bad_pkey(ibp, pkey,
+				      rdma_ah_get_sl(ah_attr),
+				      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+				      slid, rdma_ah_get_dlid(ah_attr));
 			goto drop;
 			goto drop;
 		}
 		}
 	}
 	}
@@ -128,18 +128,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
 
 
 		qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
 		qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
 			sqp->qkey : swqe->ud_wr.remote_qkey;
 			sqp->qkey : swqe->ud_wr.remote_qkey;
-		if (unlikely(qkey != qp->qkey)) {
-			u16 lid;
-
-			lid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) &
-					  ((1 << ppd->lmc) - 1));
-			hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
-				       rdma_ah_get_sl(ah_attr),
-				       sqp->ibqp.qp_num, qp->ibqp.qp_num,
-				       lid,
-				       rdma_ah_get_dlid(ah_attr));
-			goto drop;
-		}
+		if (unlikely(qkey != qp->qkey))
+			goto drop; /* silently drop per IBTA spec */
 	}
 	}
 
 
 	/*
 	/*
@@ -549,7 +539,7 @@ void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
 	hdr.lrh[3] = cpu_to_be16(slid);
 	hdr.lrh[3] = cpu_to_be16(slid);
 
 
 	plen = 2 /* PBC */ + hwords;
 	plen = 2 /* PBC */ + hwords;
-	pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+	pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 	vl = sc_to_vlt(ppd->dd, sc5);
 	vl = sc_to_vlt(ppd->dd, sc5);
 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
 	if (ctxt) {
 	if (ctxt) {
@@ -668,36 +658,31 @@ static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
 void hfi1_ud_rcv(struct hfi1_packet *packet)
 void hfi1_ud_rcv(struct hfi1_packet *packet)
 {
 {
 	struct ib_other_headers *ohdr = packet->ohdr;
 	struct ib_other_headers *ohdr = packet->ohdr;
-	int opcode;
 	u32 hdrsize = packet->hlen;
 	u32 hdrsize = packet->hlen;
 	struct ib_wc wc;
 	struct ib_wc wc;
 	u32 qkey;
 	u32 qkey;
 	u32 src_qp;
 	u32 src_qp;
-	u16 dlid, pkey;
+	u16 pkey;
 	int mgmt_pkey_idx = -1;
 	int mgmt_pkey_idx = -1;
 	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
 	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 	struct ib_header *hdr = packet->hdr;
 	struct ib_header *hdr = packet->hdr;
-	u32 rcv_flags = packet->rcv_flags;
 	void *data = packet->ebuf;
 	void *data = packet->ebuf;
 	u32 tlen = packet->tlen;
 	u32 tlen = packet->tlen;
 	struct rvt_qp *qp = packet->qp;
 	struct rvt_qp *qp = packet->qp;
-	bool has_grh = rcv_flags & HFI1_HAS_GRH;
 	u8 sc5 = hfi1_9B_get_sc5(hdr, packet->rhf);
 	u8 sc5 = hfi1_9B_get_sc5(hdr, packet->rhf);
 	u32 bth1;
 	u32 bth1;
-	u8 sl_from_sc, sl;
-	u16 slid;
-	u8 extra_bytes;
+	u8 sl_from_sc;
+	u8 extra_bytes = packet->pad;
+	u8 opcode = packet->opcode;
+	u8 sl = packet->sl;
+	u32 dlid = packet->dlid;
+	u32 slid = packet->slid;
 
 
-	qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-	src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
-	dlid = ib_get_dlid(hdr);
 	bth1 = be32_to_cpu(ohdr->bth[1]);
 	bth1 = be32_to_cpu(ohdr->bth[1]);
-	slid = ib_get_slid(hdr);
+	qkey = ib_get_qkey(ohdr);
+	src_qp = ib_get_sqpn(ohdr);
 	pkey = ib_bth_get_pkey(ohdr);
 	pkey = ib_bth_get_pkey(ohdr);
-	opcode = ib_bth_get_opcode(ohdr);
-	sl = ib_get_sl(hdr);
-	extra_bytes = ib_bth_get_pad(ohdr);
 	extra_bytes += (SIZE_OF_CRC << 2);
 	extra_bytes += (SIZE_OF_CRC << 2);
 	sl_from_sc = ibp->sc_to_sl[sc5];
 	sl_from_sc = ibp->sc_to_sl[sc5];
 
 
@@ -727,10 +712,10 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
 				 * for invalid pkeys is optional according to
 				 * for invalid pkeys is optional according to
 				 * IB spec (release 1.3, section 10.9.4)
 				 * IB spec (release 1.3, section 10.9.4)
 				 */
 				 */
-				hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-					       pkey, sl,
-					       src_qp, qp->ibqp.qp_num,
-					       slid, dlid);
+				hfi1_bad_pkey(ibp,
+					      pkey, sl,
+					      src_qp, qp->ibqp.qp_num,
+					      slid, dlid);
 				return;
 				return;
 			}
 			}
 		} else {
 		} else {
@@ -739,12 +724,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
 			if (mgmt_pkey_idx < 0)
 			if (mgmt_pkey_idx < 0)
 				goto drop;
 				goto drop;
 		}
 		}
-		if (unlikely(qkey != qp->qkey)) {
-			hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, sl,
-				       src_qp, qp->ibqp.qp_num,
-				       slid, dlid);
+		if (unlikely(qkey != qp->qkey)) /* Silent drop */
 			return;
 			return;
-		}
+
 		/* Drop invalid MAD packets (see 13.5.3.1). */
 		/* Drop invalid MAD packets (see 13.5.3.1). */
 		if (unlikely(qp->ibqp.qp_num == 1 &&
 		if (unlikely(qp->ibqp.qp_num == 1 &&
 			     (tlen > 2048 || (sc5 == 0xF))))
 			     (tlen > 2048 || (sc5 == 0xF))))
@@ -811,7 +793,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
 		qp->r_flags |= RVT_R_REUSE_SGE;
 		qp->r_flags |= RVT_R_REUSE_SGE;
 		goto drop;
 		goto drop;
 	}
 	}
-	if (has_grh) {
+	if (packet->grh) {
 		hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
 		hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
 			      sizeof(struct ib_grh), true, false);
 			      sizeof(struct ib_grh), true, false);
 		wc.wc_flags |= IB_WC_GRH;
 		wc.wc_flags |= IB_WC_GRH;

+ 3 - 125
drivers/infiniband/hw/hfi1/user_exp_rcv.c

@@ -51,14 +51,6 @@
 #include "trace.h"
 #include "trace.h"
 #include "mmu_rb.h"
 #include "mmu_rb.h"
 
 
-struct tid_group {
-	struct list_head list;
-	u32 base;
-	u8 size;
-	u8 used;
-	u8 map;
-};
-
 struct tid_rb_node {
 struct tid_rb_node {
 	struct mmu_rb_node mmu;
 	struct mmu_rb_node mmu;
 	unsigned long phys;
 	unsigned long phys;
@@ -75,8 +67,6 @@ struct tid_pageset {
 	u16 count;
 	u16 count;
 };
 };
 
 
-#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
-
 #define num_user_pages(vaddr, len)				       \
 #define num_user_pages(vaddr, len)				       \
 	(1 + (((((unsigned long)(vaddr) +			       \
 	(1 + (((((unsigned long)(vaddr) +			       \
 		 (unsigned long)(len) - 1) & PAGE_MASK) -	       \
 		 (unsigned long)(len) - 1) & PAGE_MASK) -	       \
@@ -109,96 +99,14 @@ static struct mmu_rb_ops tid_rb_ops = {
 	.invalidate = tid_rb_invalidate
 	.invalidate = tid_rb_invalidate
 };
 };
 
 
-static inline u32 rcventry2tidinfo(u32 rcventry)
-{
-	u32 pair = rcventry & ~0x1;
-
-	return EXP_TID_SET(IDX, pair >> 1) |
-		EXP_TID_SET(CTRL, 1 << (rcventry - pair));
-}
-
-static inline void exp_tid_group_init(struct exp_tid_set *set)
-{
-	INIT_LIST_HEAD(&set->list);
-	set->count = 0;
-}
-
-static inline void tid_group_remove(struct tid_group *grp,
-				    struct exp_tid_set *set)
-{
-	list_del_init(&grp->list);
-	set->count--;
-}
-
-static inline void tid_group_add_tail(struct tid_group *grp,
-				      struct exp_tid_set *set)
-{
-	list_add_tail(&grp->list, &set->list);
-	set->count++;
-}
-
-static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
-{
-	struct tid_group *grp =
-		list_first_entry(&set->list, struct tid_group, list);
-	list_del_init(&grp->list);
-	set->count--;
-	return grp;
-}
-
-static inline void tid_group_move(struct tid_group *group,
-				  struct exp_tid_set *s1,
-				  struct exp_tid_set *s2)
-{
-	tid_group_remove(group, s1);
-	tid_group_add_tail(group, s2);
-}
-
-int hfi1_user_exp_rcv_grp_init(struct hfi1_filedata *fd)
-{
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
-	struct hfi1_devdata *dd = fd->dd;
-	u32 tidbase;
-	u32 i;
-	struct tid_group *grp, *gptr;
-
-	exp_tid_group_init(&uctxt->tid_group_list);
-	exp_tid_group_init(&uctxt->tid_used_list);
-	exp_tid_group_init(&uctxt->tid_full_list);
-
-	tidbase = uctxt->expected_base;
-	for (i = 0; i < uctxt->expected_count /
-		     dd->rcv_entries.group_size; i++) {
-		grp = kzalloc(sizeof(*grp), GFP_KERNEL);
-		if (!grp)
-			goto grp_failed;
-
-		grp->size = dd->rcv_entries.group_size;
-		grp->base = tidbase;
-		tid_group_add_tail(grp, &uctxt->tid_group_list);
-		tidbase += dd->rcv_entries.group_size;
-	}
-
-	return 0;
-
-grp_failed:
-	list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
-				 list) {
-		list_del_init(&grp->list);
-		kfree(grp);
-	}
-
-	return -ENOMEM;
-}
-
 /*
 /*
  * Initialize context and file private data needed for Expected
  * Initialize context and file private data needed for Expected
  * receive caching. This needs to be done after the context has
  * receive caching. This needs to be done after the context has
  * been configured with the eager/expected RcvEntry counts.
  * been configured with the eager/expected RcvEntry counts.
  */
  */
-int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd)
+int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
+			   struct hfi1_ctxtdata *uctxt)
 {
 {
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	struct hfi1_devdata *dd = uctxt->dd;
 	int ret = 0;
 	int ret = 0;
 
 
@@ -266,18 +174,6 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd)
 	return ret;
 	return ret;
 }
 }
 
 
-void hfi1_user_exp_rcv_grp_free(struct hfi1_ctxtdata *uctxt)
-{
-	struct tid_group *grp, *gptr;
-
-	list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
-				 list) {
-		list_del_init(&grp->list);
-		kfree(grp);
-	}
-	hfi1_clear_tids(uctxt);
-}
-
 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 {
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
@@ -302,23 +198,6 @@ void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 	fd->entry_to_rb = NULL;
 	fd->entry_to_rb = NULL;
 }
 }
 
 
-/*
- * Write an "empty" RcvArray entry.
- * This function exists so the TID registaration code can use it
- * to write to unused/unneeded entries and still take advantage
- * of the WC performance improvements. The HFI will ignore this
- * write to the RcvArray entry.
- */
-static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
-{
-	/*
-	 * Doing the WC fill writes only makes sense if the device is
-	 * present and the RcvArray has been mapped as WC memory.
-	 */
-	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
-		writeq(0, dd->rcvarray_wc + (index * 8));
-}
-
 /*
 /*
  * RcvArray entry allocation for Expected Receives is done by the
  * RcvArray entry allocation for Expected Receives is done by the
  * following algorithm:
  * following algorithm:
@@ -935,12 +814,11 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 				 node->npages, node->mmu.addr, node->phys,
 				 node->npages, node->mmu.addr, node->phys,
 				 node->dma_addr);
 				 node->dma_addr);
 
 
-	hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
 	/*
 	/*
 	 * Make sure device has seen the write before we unpin the
 	 * Make sure device has seen the write before we unpin the
 	 * pages.
 	 * pages.
 	 */
 	 */
-	flush_wc();
+	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
 
 
 	pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
 	pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
 			 PCI_DMA_FROMDEVICE);
 			 PCI_DMA_FROMDEVICE);

+ 3 - 23
drivers/infiniband/hw/hfi1/user_exp_rcv.h

@@ -49,30 +49,10 @@
 
 
 #include "hfi.h"
 #include "hfi.h"
 
 
-#define EXP_TID_TIDLEN_MASK   0x7FFULL
-#define EXP_TID_TIDLEN_SHIFT  0
-#define EXP_TID_TIDCTRL_MASK  0x3ULL
-#define EXP_TID_TIDCTRL_SHIFT 20
-#define EXP_TID_TIDIDX_MASK   0x3FFULL
-#define EXP_TID_TIDIDX_SHIFT  22
-#define EXP_TID_GET(tid, field)	\
-	(((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+#include "exp_rcv.h"
 
 
-#define EXP_TID_SET(field, value)			\
-	(((value) & EXP_TID_TID##field##_MASK) <<	\
-	 EXP_TID_TID##field##_SHIFT)
-#define EXP_TID_CLEAR(tid, field) ({					\
-		(tid) &= ~(EXP_TID_TID##field##_MASK <<			\
-			   EXP_TID_TID##field##_SHIFT);			\
-		})
-#define EXP_TID_RESET(tid, field, value) do {				\
-		EXP_TID_CLEAR(tid, field);				\
-		(tid) |= EXP_TID_SET(field, (value));			\
-	} while (0)
-
-void hfi1_user_exp_rcv_grp_free(struct hfi1_ctxtdata *uctxt);
-int hfi1_user_exp_rcv_grp_init(struct hfi1_filedata *fd);
-int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd);
+int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
+			   struct hfi1_ctxtdata *uctxt);
 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd);
 void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd);
 int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 			    struct hfi1_tid_info *tinfo);
 			    struct hfi1_tid_info *tinfo);

+ 102 - 128
drivers/infiniband/hw/hfi1/user_sdma.c

@@ -94,43 +94,13 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
 /* Number of BTH.PSN bits used for sequence number in expected rcvs */
 /* Number of BTH.PSN bits used for sequence number in expected rcvs */
 #define BTH_SEQ_MASK 0x7ffull
 #define BTH_SEQ_MASK 0x7ffull
 
 
-/*
- * Define fields in the KDETH header so we can update the header
- * template.
- */
-#define KDETH_OFFSET_SHIFT        0
-#define KDETH_OFFSET_MASK         0x7fff
-#define KDETH_OM_SHIFT            15
-#define KDETH_OM_MASK             0x1
-#define KDETH_TID_SHIFT           16
-#define KDETH_TID_MASK            0x3ff
-#define KDETH_TIDCTRL_SHIFT       26
-#define KDETH_TIDCTRL_MASK        0x3
-#define KDETH_INTR_SHIFT          28
-#define KDETH_INTR_MASK           0x1
-#define KDETH_SH_SHIFT            29
-#define KDETH_SH_MASK             0x1
-#define KDETH_HCRC_UPPER_SHIFT    16
-#define KDETH_HCRC_UPPER_MASK     0xff
-#define KDETH_HCRC_LOWER_SHIFT    24
-#define KDETH_HCRC_LOWER_MASK     0xff
-
 #define AHG_KDETH_INTR_SHIFT 12
 #define AHG_KDETH_INTR_SHIFT 12
 #define AHG_KDETH_SH_SHIFT   13
 #define AHG_KDETH_SH_SHIFT   13
+#define AHG_KDETH_ARRAY_SIZE  9
 
 
 #define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
 #define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
 #define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
 #define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
 
 
-#define KDETH_GET(val, field)						\
-	(((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
-#define KDETH_SET(dw, field, val) do {					\
-		u32 dwval = le32_to_cpu(dw);				\
-		dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
-		dwval |= (((val) & KDETH_##field##_MASK) << \
-			  KDETH_##field##_SHIFT);			\
-		dw = cpu_to_le32(dwval);				\
-	} while (0)
-
 #define AHG_HEADER_SET(arr, idx, dw, bit, width, value)			\
 #define AHG_HEADER_SET(arr, idx, dw, bit, width, value)			\
 	do {								\
 	do {								\
 		if ((idx) < ARRAY_SIZE((arr)))				\
 		if ((idx) < ARRAY_SIZE((arr)))				\
@@ -141,23 +111,10 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
 			return -ERANGE;					\
 			return -ERANGE;					\
 	} while (0)
 	} while (0)
 
 
-/* KDETH OM multipliers and switch over point */
-#define KDETH_OM_SMALL     4
-#define KDETH_OM_SMALL_SHIFT     2
-#define KDETH_OM_LARGE     64
-#define KDETH_OM_LARGE_SHIFT     6
-#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
-
 /* Tx request flag bits */
 /* Tx request flag bits */
 #define TXREQ_FLAGS_REQ_ACK   BIT(0)      /* Set the ACK bit in the header */
 #define TXREQ_FLAGS_REQ_ACK   BIT(0)      /* Set the ACK bit in the header */
 #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */
 #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */
 
 
-/* SDMA request flag bits */
-#define SDMA_REQ_FOR_THREAD 1
-#define SDMA_REQ_SEND_DONE  2
-#define SDMA_REQ_HAS_ERROR  3
-#define SDMA_REQ_DONE_ERROR 4
-
 #define SDMA_PKT_Q_INACTIVE BIT(0)
 #define SDMA_PKT_Q_INACTIVE BIT(0)
 #define SDMA_PKT_Q_ACTIVE   BIT(1)
 #define SDMA_PKT_Q_ACTIVE   BIT(1)
 #define SDMA_PKT_Q_DEFERRED BIT(2)
 #define SDMA_PKT_Q_DEFERRED BIT(2)
@@ -204,25 +161,41 @@ struct evict_data {
 };
 };
 
 
 struct user_sdma_request {
 struct user_sdma_request {
-	struct sdma_req_info info;
-	struct hfi1_user_sdma_pkt_q *pq;
-	struct hfi1_user_sdma_comp_q *cq;
 	/* This is the original header from user space */
 	/* This is the original header from user space */
 	struct hfi1_pkt_header hdr;
 	struct hfi1_pkt_header hdr;
+
+	/* Read mostly fields */
+	struct hfi1_user_sdma_pkt_q *pq ____cacheline_aligned_in_smp;
+	struct hfi1_user_sdma_comp_q *cq;
 	/*
 	/*
 	 * Pointer to the SDMA engine for this request.
 	 * Pointer to the SDMA engine for this request.
 	 * Since different request could be on different VLs,
 	 * Since different request could be on different VLs,
 	 * each request will need it's own engine pointer.
 	 * each request will need it's own engine pointer.
 	 */
 	 */
 	struct sdma_engine *sde;
 	struct sdma_engine *sde;
-	s8 ahg_idx;
-	u32 ahg[9];
+	struct sdma_req_info info;
+	/* TID array values copied from the tid_iov vector */
+	u32 *tids;
+	/* total length of the data in the request */
+	u32 data_len;
+	/* number of elements copied to the tids array */
+	u16 n_tids;
 	/*
 	/*
-	 * KDETH.Offset (Eager) field
-	 * We need to remember the initial value so the headers
-	 * can be updated properly.
+	 * We copy the iovs for this request (based on
+	 * info.iovcnt). These are only the data vectors
 	 */
 	 */
-	u32 koffset;
+	u8 data_iovs;
+	s8 ahg_idx;
+
+	/* Writeable fields shared with interrupt */
+	u64 seqcomp ____cacheline_aligned_in_smp;
+	u64 seqsubmitted;
+	/* status of the last txreq completed */
+	int status;
+
+	/* Send side fields */
+	struct list_head txps ____cacheline_aligned_in_smp;
+	u64 seqnum;
 	/*
 	/*
 	 * KDETH.OFFSET (TID) field
 	 * KDETH.OFFSET (TID) field
 	 * The offset can cover multiple packets, depending on the
 	 * The offset can cover multiple packets, depending on the
@@ -230,29 +203,21 @@ struct user_sdma_request {
 	 */
 	 */
 	u32 tidoffset;
 	u32 tidoffset;
 	/*
 	/*
-	 * We copy the iovs for this request (based on
-	 * info.iovcnt). These are only the data vectors
+	 * KDETH.Offset (Eager) field
+	 * We need to remember the initial value so the headers
+	 * can be updated properly.
 	 */
 	 */
-	unsigned data_iovs;
-	/* total length of the data in the request */
-	u32 data_len;
+	u32 koffset;
+	u32 sent;
+	/* TID index copied from the tid_iov vector */
+	u16 tididx;
 	/* progress index moving along the iovs array */
 	/* progress index moving along the iovs array */
-	unsigned iov_idx;
+	u8 iov_idx;
+	u8 done;
+	u8 has_error;
+
 	struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
 	struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
-	/* number of elements copied to the tids array */
-	u16 n_tids;
-	/* TID array values copied from the tid_iov vector */
-	u32 *tids;
-	u16 tididx;
-	u32 sent;
-	u64 seqnum;
-	u64 seqcomp;
-	u64 seqsubmitted;
-	struct list_head txps;
-	unsigned long flags;
-	/* status of the last txreq completed */
-	int status;
-};
+} ____cacheline_aligned_in_smp;
 
 
 /*
 /*
  * A single txreq could span up to 3 physical pages when the MTU
  * A single txreq could span up to 3 physical pages when the MTU
@@ -307,7 +272,8 @@ static int defer_packet_queue(
 	struct sdma_engine *sde,
 	struct sdma_engine *sde,
 	struct iowait *wait,
 	struct iowait *wait,
 	struct sdma_txreq *txreq,
 	struct sdma_txreq *txreq,
-	unsigned int seq);
+	uint seq,
+	bool pkts_sent);
 static void activate_packet_queue(struct iowait *wait, int reason);
 static void activate_packet_queue(struct iowait *wait, int reason);
 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
 			   unsigned long len);
 			   unsigned long len);
@@ -329,7 +295,8 @@ static int defer_packet_queue(
 	struct sdma_engine *sde,
 	struct sdma_engine *sde,
 	struct iowait *wait,
 	struct iowait *wait,
 	struct sdma_txreq *txreq,
 	struct sdma_txreq *txreq,
-	unsigned seq)
+	uint seq,
+	bool pkts_sent)
 {
 {
 	struct hfi1_user_sdma_pkt_q *pq =
 	struct hfi1_user_sdma_pkt_q *pq =
 		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
@@ -349,7 +316,7 @@ static int defer_packet_queue(
 	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
 	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
 	write_seqlock(&dev->iowait_lock);
 	write_seqlock(&dev->iowait_lock);
 	if (list_empty(&pq->busy.list))
 	if (list_empty(&pq->busy.list))
-		list_add_tail(&pq->busy.list, &sde->dmawait);
+		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
 	write_sequnlock(&dev->iowait_lock);
 	write_sequnlock(&dev->iowait_lock);
 	return -EBUSY;
 	return -EBUSY;
 eagain:
 eagain:
@@ -379,7 +346,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 	struct hfi1_devdata *dd;
 	struct hfi1_devdata *dd;
 	struct hfi1_user_sdma_comp_q *cq;
 	struct hfi1_user_sdma_comp_q *cq;
 	struct hfi1_user_sdma_pkt_q *pq;
 	struct hfi1_user_sdma_pkt_q *pq;
-	unsigned long flags;
 
 
 	if (!uctxt || !fd)
 	if (!uctxt || !fd)
 		return -EBADF;
 		return -EBADF;
@@ -393,7 +359,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 	if (!pq)
 	if (!pq)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
-	INIT_LIST_HEAD(&pq->list);
 	pq->dd = dd;
 	pq->dd = dd;
 	pq->ctxt = uctxt->ctxt;
 	pq->ctxt = uctxt->ctxt;
 	pq->subctxt = fd->subctxt;
 	pq->subctxt = fd->subctxt;
@@ -454,10 +419,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 	fd->pq = pq;
 	fd->pq = pq;
 	fd->cq = cq;
 	fd->cq = cq;
 
 
-	spin_lock_irqsave(&uctxt->sdma_qlock, flags);
-	list_add(&pq->list, &uctxt->sdma_queues);
-	spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
-
 	return 0;
 	return 0;
 
 
 pq_mmu_fail:
 pq_mmu_fail:
@@ -476,11 +437,10 @@ pq_reqs_nomem:
 	return ret;
 	return ret;
 }
 }
 
 
-int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
+			       struct hfi1_ctxtdata *uctxt)
 {
 {
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_user_sdma_pkt_q *pq;
 	struct hfi1_user_sdma_pkt_q *pq;
-	unsigned long flags;
 
 
 	hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
 	hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
 		  uctxt->ctxt, fd->subctxt);
 		  uctxt->ctxt, fd->subctxt);
@@ -488,10 +448,6 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
 	if (pq) {
 	if (pq) {
 		if (pq->handler)
 		if (pq->handler)
 			hfi1_mmu_rb_unregister(pq->handler);
 			hfi1_mmu_rb_unregister(pq->handler);
-		spin_lock_irqsave(&uctxt->sdma_qlock, flags);
-		if (!list_empty(&pq->list))
-			list_del_init(&pq->list);
-		spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
 		iowait_sdma_drain(&pq->busy);
 		iowait_sdma_drain(&pq->busy);
 		/* Wait until all requests have been freed. */
 		/* Wait until all requests have been freed. */
 		wait_event_interruptible(
 		wait_event_interruptible(
@@ -607,12 +563,20 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 	hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
 	hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
 		  uctxt->ctxt, fd->subctxt, info.comp_idx);
 		  uctxt->ctxt, fd->subctxt, info.comp_idx);
 	req = pq->reqs + info.comp_idx;
 	req = pq->reqs + info.comp_idx;
-	memset(req, 0, sizeof(*req));
 	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
+	req->data_len  = 0;
 	req->pq = pq;
 	req->pq = pq;
 	req->cq = cq;
 	req->cq = cq;
 	req->status = -1;
 	req->status = -1;
 	req->ahg_idx = -1;
 	req->ahg_idx = -1;
+	req->iov_idx = 0;
+	req->sent = 0;
+	req->seqnum = 0;
+	req->seqcomp = 0;
+	req->seqsubmitted = 0;
+	req->tids = NULL;
+	req->done = 0;
+	req->has_error = 0;
 	INIT_LIST_HEAD(&req->txps);
 	INIT_LIST_HEAD(&req->txps);
 
 
 	memcpy(&req->info, &info, sizeof(info));
 	memcpy(&req->info, &info, sizeof(info));
@@ -701,12 +665,14 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 
 
 	/* Save all the IO vector structures */
 	/* Save all the IO vector structures */
 	for (i = 0; i < req->data_iovs; i++) {
 	for (i = 0; i < req->data_iovs; i++) {
+		req->iovs[i].offset = 0;
 		INIT_LIST_HEAD(&req->iovs[i].list);
 		INIT_LIST_HEAD(&req->iovs[i].list);
 		memcpy(&req->iovs[i].iov,
 		memcpy(&req->iovs[i].iov,
 		       iovec + idx++,
 		       iovec + idx++,
 		       sizeof(req->iovs[i].iov));
 		       sizeof(req->iovs[i].iov));
 		ret = pin_vector_pages(req, &req->iovs[i]);
 		ret = pin_vector_pages(req, &req->iovs[i]);
 		if (ret) {
 		if (ret) {
+			req->data_iovs = i;
 			req->status = ret;
 			req->status = ret;
 			goto free_req;
 			goto free_req;
 		}
 		}
@@ -749,6 +715,7 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 		}
 		}
 		req->tids = tmp;
 		req->tids = tmp;
 		req->n_tids = ntids;
 		req->n_tids = ntids;
+		req->tididx = 0;
 		idx++;
 		idx++;
 	}
 	}
 
 
@@ -791,12 +758,12 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 	 * request have been submitted to the SDMA engine. However, it
 	 * request have been submitted to the SDMA engine. However, it
 	 * will not wait for send completions.
 	 * will not wait for send completions.
 	 */
 	 */
-	while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+	while (req->seqsubmitted != req->info.npkts) {
 		ret = user_sdma_send_pkts(req, pcount);
 		ret = user_sdma_send_pkts(req, pcount);
 		if (ret < 0) {
 		if (ret < 0) {
 			if (ret != -EBUSY) {
 			if (ret != -EBUSY) {
 				req->status = ret;
 				req->status = ret;
-				set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+				WRITE_ONCE(req->has_error, 1);
 				if (ACCESS_ONCE(req->seqcomp) ==
 				if (ACCESS_ONCE(req->seqcomp) ==
 				    req->seqsubmitted - 1)
 				    req->seqsubmitted - 1)
 					goto free_req;
 					goto free_req;
@@ -898,10 +865,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 	pq = req->pq;
 	pq = req->pq;
 
 
 	/* If tx completion has reported an error, we are done. */
 	/* If tx completion has reported an error, we are done. */
-	if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
-		set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+	if (READ_ONCE(req->has_error))
 		return -EFAULT;
 		return -EFAULT;
-	}
 
 
 	/*
 	/*
 	 * Check if we might have sent the entire request already
 	 * Check if we might have sent the entire request already
@@ -924,10 +889,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 		 * with errors. If so, we are not going to process any
 		 * with errors. If so, we are not going to process any
 		 * more packets from this request.
 		 * more packets from this request.
 		 */
 		 */
-		if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
-			set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+		if (READ_ONCE(req->has_error))
 			return -EFAULT;
 			return -EFAULT;
-		}
 
 
 		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 		if (!tx)
 		if (!tx)
@@ -1024,11 +987,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 							       datalen);
 							       datalen);
 				if (changes < 0)
 				if (changes < 0)
 					goto free_tx;
 					goto free_tx;
-				sdma_txinit_ahg(&tx->txreq,
-						SDMA_TXREQ_F_USE_AHG,
-						datalen, req->ahg_idx, changes,
-						req->ahg, sizeof(req->hdr),
-						user_sdma_txreq_cb);
 			}
 			}
 		} else {
 		} else {
 			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
@@ -1105,7 +1063,7 @@ dosend:
 	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
 	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
 	req->seqsubmitted += count;
 	req->seqsubmitted += count;
 	if (req->seqsubmitted == req->info.npkts) {
 	if (req->seqsubmitted == req->info.npkts) {
-		set_bit(SDMA_REQ_SEND_DONE, &req->flags);
+		WRITE_ONCE(req->done, 1);
 		/*
 		/*
 		 * The txreq has already been submitted to the HW queue
 		 * The txreq has already been submitted to the HW queue
 		 * so we can free the AHG entry now. Corruption will not
 		 * so we can free the AHG entry now. Corruption will not
@@ -1155,14 +1113,23 @@ static int pin_vector_pages(struct user_sdma_request *req,
 	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 	struct sdma_mmu_node *node = NULL;
 	struct sdma_mmu_node *node = NULL;
 	struct mmu_rb_node *rb_node;
 	struct mmu_rb_node *rb_node;
-
-	rb_node = hfi1_mmu_rb_extract(pq->handler,
-				      (unsigned long)iovec->iov.iov_base,
-				      iovec->iov.iov_len);
-	if (rb_node)
+	bool extracted;
+
+	extracted =
+		hfi1_mmu_rb_remove_unless_exact(pq->handler,
+						(unsigned long)
+						iovec->iov.iov_base,
+						iovec->iov.iov_len, &rb_node);
+	if (rb_node) {
 		node = container_of(rb_node, struct sdma_mmu_node, rb);
 		node = container_of(rb_node, struct sdma_mmu_node, rb);
-	else
-		rb_node = NULL;
+		if (!extracted) {
+			atomic_inc(&node->refcount);
+			iovec->pages = node->pages;
+			iovec->npages = node->npages;
+			iovec->node = node;
+			return 0;
+		}
+	}
 
 
 	if (!node) {
 	if (!node) {
 		node = kzalloc(sizeof(*node), GFP_KERNEL);
 		node = kzalloc(sizeof(*node), GFP_KERNEL);
@@ -1423,21 +1390,22 @@ done:
 }
 }
 
 
 static int set_txreq_header_ahg(struct user_sdma_request *req,
 static int set_txreq_header_ahg(struct user_sdma_request *req,
-				struct user_sdma_txreq *tx, u32 len)
+				struct user_sdma_txreq *tx, u32 datalen)
 {
 {
+	u32 ahg[AHG_KDETH_ARRAY_SIZE];
 	int diff = 0;
 	int diff = 0;
 	u8 omfactor; /* KDETH.OM */
 	u8 omfactor; /* KDETH.OM */
 	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 	struct hfi1_pkt_header *hdr = &req->hdr;
 	struct hfi1_pkt_header *hdr = &req->hdr;
 	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
 	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
-	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len));
+	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
 
 
 	if (PBC2LRH(pbclen) != lrhlen) {
 	if (PBC2LRH(pbclen) != lrhlen) {
 		/* PBC.PbcLengthDWs */
 		/* PBC.PbcLengthDWs */
-		AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
+		AHG_HEADER_SET(ahg, diff, 0, 0, 12,
 			       cpu_to_le16(LRH2PBC(lrhlen)));
 			       cpu_to_le16(LRH2PBC(lrhlen)));
 		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
 		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
-		AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
+		AHG_HEADER_SET(ahg, diff, 3, 0, 16,
 			       cpu_to_be16(lrhlen >> 2));
 			       cpu_to_be16(lrhlen >> 2));
 	}
 	}
 
 
@@ -1449,13 +1417,12 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
 		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
 	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 		val32 |= 1UL << 31;
 		val32 |= 1UL << 31;
-	AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
-	AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
+	AHG_HEADER_SET(ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
+	AHG_HEADER_SET(ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
 	/* KDETH.Offset */
 	/* KDETH.Offset */
-	AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
+	AHG_HEADER_SET(ahg, diff, 15, 0, 16,
 		       cpu_to_le16(req->koffset & 0xffff));
 		       cpu_to_le16(req->koffset & 0xffff));
-	AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
-		       cpu_to_le16(req->koffset >> 16));
+	AHG_HEADER_SET(ahg, diff, 15, 16, 16, cpu_to_le16(req->koffset >> 16));
 	if (req_opcode(req->info.ctrl) == EXPECTED) {
 	if (req_opcode(req->info.ctrl) == EXPECTED) {
 		__le16 val;
 		__le16 val;
 
 
@@ -1473,9 +1440,8 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 			 * we have to check again.
 			 * we have to check again.
 			 */
 			 */
 			if (++req->tididx > req->n_tids - 1 ||
 			if (++req->tididx > req->n_tids - 1 ||
-			    !req->tids[req->tididx]) {
+			    !req->tids[req->tididx])
 				return -EINVAL;
 				return -EINVAL;
-			}
 			tidval = req->tids[req->tididx];
 			tidval = req->tids[req->tididx];
 		}
 		}
 		omfactor = ((EXP_TID_GET(tidval, LEN) *
 		omfactor = ((EXP_TID_GET(tidval, LEN) *
@@ -1483,7 +1449,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
 				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
 				 KDETH_OM_SMALL_SHIFT;
 				 KDETH_OM_SMALL_SHIFT;
 		/* KDETH.OM and KDETH.OFFSET (TID) */
 		/* KDETH.OM and KDETH.OFFSET (TID) */
-		AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
+		AHG_HEADER_SET(ahg, diff, 7, 0, 16,
 			       ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
 			       ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
 				((req->tidoffset >> omfactor)
 				((req->tidoffset >> omfactor)
 				 & 0x7fff)));
 				 & 0x7fff)));
@@ -1503,12 +1469,20 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 					     AHG_KDETH_INTR_SHIFT));
 					     AHG_KDETH_INTR_SHIFT));
 		}
 		}
 
 
-		AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
+		AHG_HEADER_SET(ahg, diff, 7, 16, 14, val);
 	}
 	}
+	if (diff < 0)
+		return diff;
 
 
 	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
 	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
 					req->info.comp_idx, req->sde->this_idx,
 					req->info.comp_idx, req->sde->this_idx,
-					req->ahg_idx, req->ahg, diff, tidval);
+					req->ahg_idx, ahg, diff, tidval);
+	sdma_txinit_ahg(&tx->txreq,
+			SDMA_TXREQ_F_USE_AHG,
+			datalen, req->ahg_idx, diff,
+			ahg, sizeof(req->hdr),
+			user_sdma_txreq_cb);
+
 	return diff;
 	return diff;
 }
 }
 
 
@@ -1537,7 +1511,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
 	if (status != SDMA_TXREQ_S_OK) {
 	if (status != SDMA_TXREQ_S_OK) {
 		SDMA_DBG(req, "SDMA completion with error %d",
 		SDMA_DBG(req, "SDMA completion with error %d",
 			 status);
 			 status);
-		set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
+		WRITE_ONCE(req->has_error, 1);
 	}
 	}
 
 
 	req->seqcomp = tx->seqnum;
 	req->seqcomp = tx->seqnum;
@@ -1556,8 +1530,8 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
 		if (status != SDMA_TXREQ_S_OK)
 		if (status != SDMA_TXREQ_S_OK)
 			req->status = status;
 			req->status = status;
 		if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
 		if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
-		    (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
-		     test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
+		    (READ_ONCE(req->done) ||
+		     READ_ONCE(req->has_error))) {
 			user_sdma_free_request(req, false);
 			user_sdma_free_request(req, false);
 			pq_update(pq);
 			pq_update(pq);
 			set_comp_state(pq, cq, idx, ERROR, req->status);
 			set_comp_state(pq, cq, idx, ERROR, req->status);

+ 3 - 3
drivers/infiniband/hw/hfi1/user_sdma.h

@@ -56,8 +56,7 @@
 extern uint extended_psn;
 extern uint extended_psn;
 
 
 struct hfi1_user_sdma_pkt_q {
 struct hfi1_user_sdma_pkt_q {
-	struct list_head list;
-	unsigned ctxt;
+	u16 ctxt;
 	u16 subctxt;
 	u16 subctxt;
 	u16 n_max_reqs;
 	u16 n_max_reqs;
 	atomic_t n_reqs;
 	atomic_t n_reqs;
@@ -82,7 +81,8 @@ struct hfi1_user_sdma_comp_q {
 
 
 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 				struct hfi1_filedata *fd);
 				struct hfi1_filedata *fd);
-int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd);
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
+			       struct hfi1_ctxtdata *uctxt);
 int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 				   struct iovec *iovec, unsigned long dim,
 				   struct iovec *iovec, unsigned long dim,
 				   unsigned long *count);
 				   unsigned long *count);

+ 59 - 63
drivers/infiniband/hw/hfi1/verbs.c

@@ -508,13 +508,14 @@ again:
 /*
 /*
  * Make sure the QP is ready and able to accept the given opcode.
  * Make sure the QP is ready and able to accept the given opcode.
  */
  */
-static inline opcode_handler qp_ok(int opcode, struct hfi1_packet *packet)
+static inline opcode_handler qp_ok(struct hfi1_packet *packet)
 {
 {
 	if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 	if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 		return NULL;
 		return NULL;
-	if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
-	    (opcode == IB_OPCODE_CNP))
-		return opcode_handler_tbl[opcode];
+	if (((packet->opcode & RVT_OPCODE_QP_MASK) ==
+	     packet->qp->allowed_ops) ||
+	    (packet->opcode == IB_OPCODE_CNP))
+		return opcode_handler_tbl[packet->opcode];
 
 
 	return NULL;
 	return NULL;
 }
 }
@@ -548,69 +549,34 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
 	return pbc;
 	return pbc;
 }
 }
 
 
-/**
- * hfi1_ib_rcv - process an incoming packet
- * @packet: data packet information
- *
- * This is called to process an incoming packet at interrupt level.
- *
- * Tlen is the length of the header + data + CRC in bytes.
- */
-void hfi1_ib_rcv(struct hfi1_packet *packet)
+static inline void hfi1_handle_packet(struct hfi1_packet *packet,
+				      bool is_mcast)
 {
 {
+	u32 qp_num;
 	struct hfi1_ctxtdata *rcd = packet->rcd;
 	struct hfi1_ctxtdata *rcd = packet->rcd;
-	struct ib_header *hdr = packet->hdr;
-	u32 tlen = packet->tlen;
 	struct hfi1_pportdata *ppd = rcd->ppd;
 	struct hfi1_pportdata *ppd = rcd->ppd;
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
 	struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
 	opcode_handler packet_handler;
 	opcode_handler packet_handler;
 	unsigned long flags;
 	unsigned long flags;
-	u32 qp_num;
-	int lnh;
-	u8 opcode;
-	u16 lid;
-
-	/* Check for GRH */
-	lnh = ib_get_lnh(hdr);
-	if (lnh == HFI1_LRH_BTH) {
-		packet->ohdr = &hdr->u.oth;
-	} else if (lnh == HFI1_LRH_GRH) {
-		u32 vtf;
-
-		packet->ohdr = &hdr->u.l.oth;
-		if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
-			goto drop;
-		vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
-		if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
-			goto drop;
-		packet->rcv_flags |= HFI1_HAS_GRH;
-	} else {
-		goto drop;
-	}
 
 
-	trace_input_ibhdr(rcd->dd, hdr);
+	inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]);
 
 
-	opcode = ib_bth_get_opcode(packet->ohdr);
-	inc_opstats(tlen, &rcd->opstats->stats[opcode]);
-
-	/* Get the destination QP number. */
-	qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK;
-	lid = ib_get_dlid(hdr);
-	if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
-		     (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) {
+	if (unlikely(is_mcast)) {
 		struct rvt_mcast *mcast;
 		struct rvt_mcast *mcast;
 		struct rvt_mcast_qp *p;
 		struct rvt_mcast_qp *p;
 
 
-		if (lnh != HFI1_LRH_GRH)
+		if (!packet->grh)
 			goto drop;
 			goto drop;
-		mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid, lid);
+		mcast = rvt_mcast_find(&ibp->rvp,
+				       &packet->grh->dgid,
+				       packet->dlid);
 		if (!mcast)
 		if (!mcast)
 			goto drop;
 			goto drop;
 		list_for_each_entry_rcu(p, &mcast->qp_list, list) {
 		list_for_each_entry_rcu(p, &mcast->qp_list, list) {
 			packet->qp = p->qp;
 			packet->qp = p->qp;
 			spin_lock_irqsave(&packet->qp->r_lock, flags);
 			spin_lock_irqsave(&packet->qp->r_lock, flags);
-			packet_handler = qp_ok(opcode, packet);
+			packet_handler = qp_ok(packet);
 			if (likely(packet_handler))
 			if (likely(packet_handler))
 				packet_handler(packet);
 				packet_handler(packet);
 			else
 			else
@@ -624,19 +590,21 @@ void hfi1_ib_rcv(struct hfi1_packet *packet)
 		if (atomic_dec_return(&mcast->refcount) <= 1)
 		if (atomic_dec_return(&mcast->refcount) <= 1)
 			wake_up(&mcast->wait);
 			wake_up(&mcast->wait);
 	} else {
 	} else {
+		/* Get the destination QP number. */
+		qp_num = ib_bth_get_qpn(packet->ohdr);
 		rcu_read_lock();
 		rcu_read_lock();
 		packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
 		packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
 		if (!packet->qp) {
 		if (!packet->qp) {
 			rcu_read_unlock();
 			rcu_read_unlock();
 			goto drop;
 			goto drop;
 		}
 		}
-		if (unlikely(hfi1_dbg_fault_opcode(packet->qp, opcode,
+		if (unlikely(hfi1_dbg_fault_opcode(packet->qp, packet->opcode,
 						   true))) {
 						   true))) {
 			rcu_read_unlock();
 			rcu_read_unlock();
 			goto drop;
 			goto drop;
 		}
 		}
 		spin_lock_irqsave(&packet->qp->r_lock, flags);
 		spin_lock_irqsave(&packet->qp->r_lock, flags);
-		packet_handler = qp_ok(opcode, packet);
+		packet_handler = qp_ok(packet);
 		if (likely(packet_handler))
 		if (likely(packet_handler))
 			packet_handler(packet);
 			packet_handler(packet);
 		else
 		else
@@ -645,11 +613,29 @@ void hfi1_ib_rcv(struct hfi1_packet *packet)
 		rcu_read_unlock();
 		rcu_read_unlock();
 	}
 	}
 	return;
 	return;
-
 drop:
 drop:
 	ibp->rvp.n_pkt_drops++;
 	ibp->rvp.n_pkt_drops++;
 }
 }
 
 
+/**
+ * hfi1_ib_rcv - process an incoming packet
+ * @packet: data packet information
+ *
+ * This is called to process an incoming packet at interrupt level.
+ */
+void hfi1_ib_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	bool is_mcast = false;
+
+	if (unlikely(hfi1_check_mcast(packet->dlid)))
+		is_mcast = true;
+
+	trace_input_ibhdr(rcd->dd, packet,
+			  !!(packet->rhf & RHF_DC_INFO_SMASK));
+	hfi1_handle_packet(packet, is_mcast);
+}
+
 /*
 /*
  * This is called from a timer to check for QPs
  * This is called from a timer to check for QPs
  * which need kernel memory in order to send a packet.
  * which need kernel memory in order to send a packet.
@@ -863,7 +849,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 
 
 			/* No vl15 here */
 			/* No vl15 here */
 			/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
 			/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-			pbc |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+			pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 
 
 			if (unlikely(hfi1_dbg_fault_opcode(qp, opcode, false)))
 			if (unlikely(hfi1_dbg_fault_opcode(qp, opcode, false)))
 				pbc = hfi1_fault_tx(qp, opcode, pbc);
 				pbc = hfi1_fault_tx(qp, opcode, pbc);
@@ -878,14 +864,15 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 		if (unlikely(ret))
 		if (unlikely(ret))
 			goto bail_build;
 			goto bail_build;
 	}
 	}
-	ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq);
+	ret =  sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq,
+			       ps->pkts_sent);
 	if (unlikely(ret < 0)) {
 	if (unlikely(ret < 0)) {
 		if (ret == -ECOMM)
 		if (ret == -ECOMM)
 			goto bail_ecomm;
 			goto bail_ecomm;
 		return ret;
 		return ret;
 	}
 	}
 	trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
 	trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
-				&ps->s_txreq->phdr.hdr);
+				&ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
 	return ret;
 	return ret;
 
 
 bail_ecomm:
 bail_ecomm:
@@ -935,7 +922,8 @@ static int pio_wait(struct rvt_qp *qp,
 			dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
 			dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
 			qp->s_flags |= flag;
 			qp->s_flags |= flag;
 			was_empty = list_empty(&sc->piowait);
 			was_empty = list_empty(&sc->piowait);
-			list_add_tail(&priv->s_iowait.list, &sc->piowait);
+			iowait_queue(ps->pkts_sent, &priv->s_iowait,
+				     &sc->piowait);
 			priv->s_iowait.lock = &dev->iowait_lock;
 			priv->s_iowait.lock = &dev->iowait_lock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
 			rvt_get_qp(qp);
 			rvt_get_qp(qp);
@@ -999,7 +987,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 		u8 opcode = get_opcode(&tx->phdr.hdr);
 		u8 opcode = get_opcode(&tx->phdr.hdr);
 
 
 		/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
 		/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
-		pbc |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+		pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 		if (unlikely(hfi1_dbg_fault_opcode(qp, opcode, false)))
 		if (unlikely(hfi1_dbg_fault_opcode(qp, opcode, false)))
 			pbc = hfi1_fault_tx(qp, opcode, pbc);
 			pbc = hfi1_fault_tx(qp, opcode, pbc);
 		pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
 		pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
@@ -1058,7 +1046,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 	}
 	}
 
 
 	trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
 	trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
-			       &ps->s_txreq->phdr.hdr);
+			       &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
 
 
 pio_bail:
 pio_bail:
 	if (qp->s_wqe) {
 	if (qp->s_wqe) {
@@ -1368,7 +1356,7 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num,
 	props->lmc = ppd->lmc;
 	props->lmc = ppd->lmc;
 	/* OPA logical states match IB logical states */
 	/* OPA logical states match IB logical states */
 	props->state = driver_lstate(ppd);
 	props->state = driver_lstate(ppd);
-	props->phys_state = hfi1_ibphys_portstate(ppd);
+	props->phys_state = driver_pstate(ppd);
 	props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
 	props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
 	props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
 	props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
 	/* see rate_show() in ib core/sysfs.c */
 	/* see rate_show() in ib core/sysfs.c */
@@ -1547,13 +1535,22 @@ static void init_ibport(struct hfi1_pportdata *ppd)
 		ibp->sc_to_sl[i] = i;
 		ibp->sc_to_sl[i] = i;
 	}
 	}
 
 
+	for (i = 0; i < RVT_MAX_TRAP_LISTS ; i++)
+		INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list);
+	setup_timer(&ibp->rvp.trap_timer, hfi1_handle_trap_timer,
+		    (unsigned long)ibp);
+
 	spin_lock_init(&ibp->rvp.lock);
 	spin_lock_init(&ibp->rvp.lock);
 	/* Set the prefix to the default value (see ch. 4.1.1) */
 	/* Set the prefix to the default value (see ch. 4.1.1) */
 	ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
 	ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
 	ibp->rvp.sm_lid = 0;
 	ibp->rvp.sm_lid = 0;
-	/* Below should only set bits defined in OPA PortInfo.CapabilityMask */
+	/*
+	 * Below should only set bits defined in OPA PortInfo.CapabilityMask
+	 * and PortInfo.CapabilityMask3
+	 */
 	ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
 	ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
 		IB_PORT_CAP_MASK_NOTICE_SUP;
 		IB_PORT_CAP_MASK_NOTICE_SUP;
+	ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported;
 	ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
 	ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
 	ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
 	ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
 	ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
 	ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
@@ -1564,14 +1561,13 @@ static void init_ibport(struct hfi1_pportdata *ppd)
 	RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
 	RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
 }
 }
 
 
-static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str,
-				size_t str_len)
+static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str)
 {
 {
 	struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
 	struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
 	struct hfi1_ibdev *dev = dev_from_rdi(rdi);
 	struct hfi1_ibdev *dev = dev_from_rdi(rdi);
 	u32 ver = dd_from_dev(dev)->dc8051_ver;
 	u32 ver = dd_from_dev(dev)->dc8051_ver;
 
 
-	snprintf(str, str_len, "%u.%u.%u", dc8051_ver_maj(ver),
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver),
 		 dc8051_ver_min(ver), dc8051_ver_patch(ver));
 		 dc8051_ver_min(ver), dc8051_ver_patch(ver));
 }
 }
 
 

+ 5 - 6
drivers/infiniband/hw/hfi1/verbs.h

@@ -143,6 +143,7 @@ struct hfi1_pkt_state {
 	unsigned long timeout_int;
 	unsigned long timeout_int;
 	int cpu;
 	int cpu;
 	bool in_thread;
 	bool in_thread;
+	bool pkts_sent;
 };
 };
 
 
 #define HFI1_PSN_CREDIT  16
 #define HFI1_PSN_CREDIT  16
@@ -236,8 +237,8 @@ static inline int hfi1_send_ok(struct rvt_qp *qp)
 /*
 /*
  * This must be called with s_lock held.
  * This must be called with s_lock held.
  */
  */
-void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
-		    u32 qp1, u32 qp2, u16 lid1, u16 lid2);
+void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl,
+		   u32 qp1, u32 qp2, u16 lid1, u16 lid2);
 void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
 void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
 void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
 void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
 void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
 void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
@@ -307,8 +308,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet);
 
 
 void hfi1_rc_hdrerr(
 void hfi1_rc_hdrerr(
 	struct hfi1_ctxtdata *rcd,
 	struct hfi1_ctxtdata *rcd,
-	struct ib_header *hdr,
-	u32 rcv_flags,
+	struct hfi1_packet *packet,
 	struct rvt_qp *qp);
 	struct rvt_qp *qp);
 
 
 u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
 u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
@@ -346,8 +346,7 @@ static inline u8 get_opcode(struct ib_header *h)
 		return be32_to_cpu(h->u.l.oth.bth[0]) >> 24;
 		return be32_to_cpu(h->u.l.oth.bth[0]) >> 24;
 }
 }
 
 
-int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct ib_header *hdr,
-		       int has_grh, struct rvt_qp *qp, u32 bth0);
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet);
 
 
 u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
 u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
 		  const struct ib_global_route *grh, u32 hwords, u32 nwords);
 		  const struct ib_global_route *grh, u32 hwords, u32 nwords);

+ 1 - 0
drivers/infiniband/hw/hfi1/vnic.h

@@ -103,6 +103,7 @@ struct hfi1_vnic_sdma {
 	struct sdma_txreq stx;
 	struct sdma_txreq stx;
 	unsigned int state;
 	unsigned int state;
 	u8 q_idx;
 	u8 q_idx;
+	bool pkts_sent;
 };
 };
 
 
 /**
 /**

+ 10 - 7
drivers/infiniband/hw/hfi1/vnic_main.c

@@ -95,7 +95,7 @@ static int setup_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt)
 	if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
 	if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
 
 
-	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt);
 
 
 	uctxt->is_vnic = true;
 	uctxt->is_vnic = true;
 done:
 done:
@@ -106,7 +106,7 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd,
 			      struct hfi1_ctxtdata **vnic_ctxt)
 			      struct hfi1_ctxtdata **vnic_ctxt)
 {
 {
 	struct hfi1_ctxtdata *uctxt;
 	struct hfi1_ctxtdata *uctxt;
-	unsigned int ctxt;
+	u16 ctxt;
 	int ret;
 	int ret;
 
 
 	if (dd->flags & HFI1_FROZEN)
 	if (dd->flags & HFI1_FROZEN)
@@ -156,11 +156,11 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd,
 	return ret;
 	return ret;
 bail:
 bail:
 	/*
 	/*
-	 * hfi1_free_ctxtdata() also releases send_context
-	 * structure if uctxt->sc is not null
+	 * hfi1_rcd_put() will call hfi1_free_ctxtdata(), which will
+	 * release send_context structure if uctxt->sc is not null
 	 */
 	 */
 	dd->rcd[uctxt->ctxt] = NULL;
 	dd->rcd[uctxt->ctxt] = NULL;
-	hfi1_free_ctxtdata(dd, uctxt);
+	hfi1_rcd_put(uctxt);
 	dd_dev_dbg(dd, "vnic allocation failed. rc %d\n", ret);
 	dd_dev_dbg(dd, "vnic allocation failed. rc %d\n", ret);
 	return ret;
 	return ret;
 }
 }
@@ -186,7 +186,7 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
 		     HFI1_RCVCTRL_INTRAVAIL_DIS |
 		     HFI1_RCVCTRL_INTRAVAIL_DIS |
 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
-		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
 	/*
 	/*
 	 * VNIC contexts are allocated from user context pool.
 	 * VNIC contexts are allocated from user context pool.
 	 * Release them back to user context pool.
 	 * Release them back to user context pool.
@@ -208,7 +208,7 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
 	hfi1_clear_ctxt_pkey(dd, uctxt);
 	hfi1_clear_ctxt_pkey(dd, uctxt);
 
 
 	hfi1_stats.sps_ctxts--;
 	hfi1_stats.sps_ctxts--;
-	hfi1_free_ctxtdata(dd, uctxt);
+	hfi1_rcd_put(uctxt);
 }
 }
 
 
 void hfi1_vnic_setup(struct hfi1_devdata *dd)
 void hfi1_vnic_setup(struct hfi1_devdata *dd)
@@ -751,6 +751,7 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo)
 		rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]);
 		rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]);
 		if (rc)
 		if (rc)
 			break;
 			break;
+		hfi1_rcd_get(dd->vnic.ctxt[i]);
 		dd->vnic.ctxt[i]->vnic_q_idx = i;
 		dd->vnic.ctxt[i]->vnic_q_idx = i;
 	}
 	}
 
 
@@ -762,6 +763,7 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo)
 		 */
 		 */
 		while (i-- > dd->vnic.num_ctxt) {
 		while (i-- > dd->vnic.num_ctxt) {
 			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
 			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
+			hfi1_rcd_put(dd->vnic.ctxt[i]);
 			dd->vnic.ctxt[i] = NULL;
 			dd->vnic.ctxt[i] = NULL;
 		}
 		}
 		goto alloc_fail;
 		goto alloc_fail;
@@ -791,6 +793,7 @@ static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo)
 	if (--dd->vnic.num_vports == 0) {
 	if (--dd->vnic.num_vports == 0) {
 		for (i = 0; i < dd->vnic.num_ctxt; i++) {
 		for (i = 0; i < dd->vnic.num_ctxt; i++) {
 			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
 			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
+			hfi1_rcd_put(dd->vnic.ctxt[i]);
 			dd->vnic.ctxt[i] = NULL;
 			dd->vnic.ctxt[i] = NULL;
 		}
 		}
 		hfi1_deinit_vnic_rsm(dd);
 		hfi1_deinit_vnic_rsm(dd);

+ 11 - 3
drivers/infiniband/hw/hfi1/vnic_sdma.c

@@ -198,11 +198,16 @@ int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx,
 		goto free_desc;
 		goto free_desc;
 	tx->retry_count = 0;
 	tx->retry_count = 0;
 
 
-	ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq);
+	ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq,
+			      vnic_sdma->pkts_sent);
 	/* When -ECOMM, sdma callback will be called with ABORT status */
 	/* When -ECOMM, sdma callback will be called with ABORT status */
 	if (unlikely(ret && unlikely(ret != -ECOMM)))
 	if (unlikely(ret && unlikely(ret != -ECOMM)))
 		goto free_desc;
 		goto free_desc;
 
 
+	if (!ret) {
+		vnic_sdma->pkts_sent = true;
+		iowait_starve_clear(vnic_sdma->pkts_sent, &vnic_sdma->wait);
+	}
 	return ret;
 	return ret;
 
 
 free_desc:
 free_desc:
@@ -211,6 +216,8 @@ free_desc:
 tx_err:
 tx_err:
 	if (ret != -EBUSY)
 	if (ret != -EBUSY)
 		dev_kfree_skb_any(skb);
 		dev_kfree_skb_any(skb);
+	else
+		vnic_sdma->pkts_sent = false;
 	return ret;
 	return ret;
 }
 }
 
 
@@ -225,7 +232,8 @@ tx_err:
 static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
 static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
 				struct iowait *wait,
 				struct iowait *wait,
 				struct sdma_txreq *txreq,
 				struct sdma_txreq *txreq,
-				unsigned int seq)
+				uint seq,
+				bool pkts_sent)
 {
 {
 	struct hfi1_vnic_sdma *vnic_sdma =
 	struct hfi1_vnic_sdma *vnic_sdma =
 		container_of(wait, struct hfi1_vnic_sdma, wait);
 		container_of(wait, struct hfi1_vnic_sdma, wait);
@@ -239,7 +247,7 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
 	vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
 	vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
 	write_seqlock(&dev->iowait_lock);
 	write_seqlock(&dev->iowait_lock);
 	if (list_empty(&vnic_sdma->wait.list))
 	if (list_empty(&vnic_sdma->wait.list))
-		list_add_tail(&vnic_sdma->wait.list, &sde->dmawait);
+		iowait_queue(pkts_sent, wait, &sde->dmawait);
 	write_sequnlock(&dev->iowait_lock);
 	write_sequnlock(&dev->iowait_lock);
 	return -EBUSY;
 	return -EBUSY;
 }
 }

+ 1 - 1
drivers/infiniband/hw/hns/Kconfig

@@ -1,7 +1,7 @@
 config INFINIBAND_HNS
 config INFINIBAND_HNS
 	tristate "HNS RoCE Driver"
 	tristate "HNS RoCE Driver"
 	depends on NET_VENDOR_HISILICON
 	depends on NET_VENDOR_HISILICON
-	depends on ARM64 && HNS && HNS_DSAF && HNS_ENET
+	depends on (ARM64 || COMPILE_TEST) && HNS && HNS_DSAF && HNS_ENET
 	---help---
 	---help---
 	  This is a RoCE/RDMA driver for the Hisilicon RoCE engine. The engine
 	  This is a RoCE/RDMA driver for the Hisilicon RoCE engine. The engine
 	  is used in Hisilicon Hi1610 and more further ICT SoC.
 	  is used in Hisilicon Hi1610 and more further ICT SoC.

+ 1 - 0
drivers/infiniband/hw/hns/hns_roce_alloc.c

@@ -32,6 +32,7 @@
  */
  */
 
 
 #include <linux/platform_device.h>
 #include <linux/platform_device.h>
+#include <linux/vmalloc.h>
 #include "hns_roce_device.h"
 #include "hns_roce_device.h"
 
 
 int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj)
 int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj)

+ 1 - 0
drivers/infiniband/hw/hns/hns_roce_device.h

@@ -33,6 +33,7 @@
 #ifndef _HNS_ROCE_DEVICE_H
 #ifndef _HNS_ROCE_DEVICE_H
 #define _HNS_ROCE_DEVICE_H
 #define _HNS_ROCE_DEVICE_H
 
 
+#include <linux/io.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_verbs.h>
 
 
 #define DRV_NAME "hns_roce"
 #define DRV_NAME "hns_roce"

+ 2 - 1
drivers/infiniband/hw/hns/hns_roce_eq.c

@@ -31,6 +31,7 @@
  */
  */
 
 
 #include <linux/platform_device.h>
 #include <linux/platform_device.h>
+#include <linux/interrupt.h>
 #include "hns_roce_common.h"
 #include "hns_roce_common.h"
 #include "hns_roce_device.h"
 #include "hns_roce_device.h"
 #include "hns_roce_eq.h"
 #include "hns_roce_eq.h"
@@ -292,7 +293,7 @@ static int hns_roce_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
 			dev_warn(dev, "Unhandled event %d on EQ %d at index %u\n",
 			dev_warn(dev, "Unhandled event %d on EQ %d at index %u\n",
 				 event_type, eq->eqn, eq->cons_index);
 				 event_type, eq->eqn, eq->cons_index);
 			break;
 			break;
-		};
+		}
 
 
 		eq->cons_index++;
 		eq->cons_index++;
 		aeqes_found = 1;
 		aeqes_found = 1;

+ 1 - 2
drivers/infiniband/hw/hns/hns_roce_hw_v1.c

@@ -2023,7 +2023,6 @@ int hns_roce_v1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
 	struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
 	u32 notification_flag;
 	u32 notification_flag;
 	u32 doorbell[2];
 	u32 doorbell[2];
-	int ret = 0;
 
 
 	notification_flag = (flags & IB_CQ_SOLICITED_MASK) ==
 	notification_flag = (flags & IB_CQ_SOLICITED_MASK) ==
 			    IB_CQ_SOLICITED ? CQ_DB_REQ_NOT : CQ_DB_REQ_NOT_SOL;
 			    IB_CQ_SOLICITED ? CQ_DB_REQ_NOT : CQ_DB_REQ_NOT_SOL;
@@ -2043,7 +2042,7 @@ int hns_roce_v1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
 
 
 	hns_roce_write64_k(doorbell, hr_cq->cq_db_l);
 	hns_roce_write64_k(doorbell, hr_cq->cq_db_l);
 
 
-	return ret;
+	return 0;
 }
 }
 
 
 static int hns_roce_v1_poll_one(struct hns_roce_cq *hr_cq,
 static int hns_roce_v1_poll_one(struct hns_roce_cq *hr_cq,

+ 1 - 0
drivers/infiniband/hw/hns/hns_roce_mr.c

@@ -32,6 +32,7 @@
  */
  */
 
 
 #include <linux/platform_device.h>
 #include <linux/platform_device.h>
+#include <linux/vmalloc.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem.h>
 #include "hns_roce_device.h"
 #include "hns_roce_device.h"
 #include "hns_roce_cmd.h"
 #include "hns_roce_cmd.h"

+ 1 - 1
drivers/infiniband/hw/hns/hns_roce_qp.c

@@ -799,7 +799,7 @@ bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq,
 
 
 	cur = hr_wq->head - hr_wq->tail;
 	cur = hr_wq->head - hr_wq->tail;
 	if (likely(cur + nreq < hr_wq->max_post))
 	if (likely(cur + nreq < hr_wq->max_post))
-		return 0;
+		return false;
 
 
 	hr_cq = to_hr_cq(ib_cq);
 	hr_cq = to_hr_cq(ib_cq);
 	spin_lock(&hr_cq->lock);
 	spin_lock(&hr_cq->lock);

+ 0 - 1
drivers/infiniband/hw/i40iw/i40iw_main.c

@@ -77,7 +77,6 @@ MODULE_PARM_DESC(mpa_version, "MPA version to be used in MPA Req/Resp 1 or 2");
 MODULE_AUTHOR("Intel Corporation, <e1000-rdma@lists.sourceforge.net>");
 MODULE_AUTHOR("Intel Corporation, <e1000-rdma@lists.sourceforge.net>");
 MODULE_DESCRIPTION("Intel(R) Ethernet Connection X722 iWARP RDMA Driver");
 MODULE_DESCRIPTION("Intel(R) Ethernet Connection X722 iWARP RDMA Driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
 
 
 static struct i40e_client i40iw_client;
 static struct i40e_client i40iw_client;
 static char i40iw_client_name[I40E_CLIENT_STR_LENGTH] = "i40iw";
 static char i40iw_client_name[I40E_CLIENT_STR_LENGTH] = "i40iw";

+ 3 - 4
drivers/infiniband/hw/i40iw/i40iw_verbs.c

@@ -2584,13 +2584,12 @@ static const char * const i40iw_hw_stat_names[] = {
 		"iwRdmaInv"
 		"iwRdmaInv"
 };
 };
 
 
-static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str,
-				 size_t str_len)
+static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str)
 {
 {
 	u32 firmware_version = I40IW_FW_VERSION;
 	u32 firmware_version = I40IW_FW_VERSION;
 
 
-	snprintf(str, str_len, "%u.%u", firmware_version,
-		       (firmware_version & 0x000000ff));
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", firmware_version,
+		 (firmware_version & 0x000000ff));
 }
 }
 
 
 /**
 /**

+ 2 - 0
drivers/infiniband/hw/mlx4/cq.c

@@ -218,6 +218,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
 			goto err_mtt;
 			goto err_mtt;
 
 
 		uar = &to_mucontext(context)->uar;
 		uar = &to_mucontext(context)->uar;
+		cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS;
 	} else {
 	} else {
 		err = mlx4_db_alloc(dev->dev, &cq->db, 1);
 		err = mlx4_db_alloc(dev->dev, &cq->db, 1);
 		if (err)
 		if (err)
@@ -233,6 +234,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
 			goto err_db;
 			goto err_db;
 
 
 		uar = &dev->priv_uar;
 		uar = &dev->priv_uar;
+		cq->mcq.usage = MLX4_RES_USAGE_DRIVER;
 	}
 	}
 
 
 	if (dev->eq_table)
 	if (dev->eq_table)

+ 48 - 6
drivers/infiniband/hw/mlx4/main.c

@@ -70,7 +70,6 @@
 MODULE_AUTHOR("Roland Dreier");
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
 MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
 
 
 int mlx4_ib_sm_guid_assign = 0;
 int mlx4_ib_sm_guid_assign = 0;
 module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444);
 module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444);
@@ -81,6 +80,8 @@ static const char mlx4_ib_version[] =
 	DRV_VERSION "\n";
 	DRV_VERSION "\n";
 
 
 static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
 static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
+static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device,
+						    u8 port_num);
 
 
 static struct workqueue_struct *wq;
 static struct workqueue_struct *wq;
 
 
@@ -552,6 +553,16 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 	props->timestamp_mask = 0xFFFFFFFFFFFFULL;
 	props->timestamp_mask = 0xFFFFFFFFFFFFULL;
 	props->max_ah = INT_MAX;
 	props->max_ah = INT_MAX;
 
 
+	if ((dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) &&
+	    (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET ||
+	     mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET)) {
+		props->rss_caps.max_rwq_indirection_tables = props->max_qp;
+		props->rss_caps.max_rwq_indirection_table_size =
+			dev->dev->caps.max_rss_tbl_sz;
+		props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
+		props->max_wq_type_rq = props->max_qp;
+	}
+
 	if (!mlx4_is_slave(dev->dev))
 	if (!mlx4_is_slave(dev->dev))
 		err = mlx4_get_internal_clock_params(dev->dev, &clock_params);
 		err = mlx4_get_internal_clock_params(dev->dev, &clock_params);
 
 
@@ -563,6 +574,13 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 		}
 		}
 	}
 	}
 
 
+	if (uhw->outlen >= resp.response_length +
+	    sizeof(resp.max_inl_recv_sz)) {
+		resp.response_length += sizeof(resp.max_inl_recv_sz);
+		resp.max_inl_recv_sz  = dev->dev->caps.max_rq_sg *
+			sizeof(struct mlx4_wqe_data_seg);
+	}
+
 	if (uhw->outlen) {
 	if (uhw->outlen) {
 		err = ib_copy_to_udata(uhw, &resp, resp.response_length);
 		err = ib_copy_to_udata(uhw, &resp, resp.response_length);
 		if (err)
 		if (err)
@@ -1069,6 +1087,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
 	INIT_LIST_HEAD(&context->db_page_list);
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 	mutex_init(&context->db_page_mutex);
 
 
+	INIT_LIST_HEAD(&context->wqn_ranges_list);
+	mutex_init(&context->wqn_ranges_mutex);
+
 	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
 	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
 		err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
 		err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
 	else
 	else
@@ -2566,12 +2587,11 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
 	return 0;
 	return 0;
 }
 }
 
 
-static void get_fw_ver_str(struct ib_device *device, char *str,
-			   size_t str_len)
+static void get_fw_ver_str(struct ib_device *device, char *str)
 {
 {
 	struct mlx4_ib_dev *dev =
 	struct mlx4_ib_dev *dev =
 		container_of(device, struct mlx4_ib_dev, ib_dev);
 		container_of(device, struct mlx4_ib_dev, ib_dev);
-	snprintf(str, str_len, "%d.%d.%d",
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d",
 		 (int) (dev->dev->caps.fw_ver >> 32),
 		 (int) (dev->dev->caps.fw_ver >> 32),
 		 (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
 		 (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
 		 (int) dev->dev->caps.fw_ver & 0xffff);
 		 (int) dev->dev->caps.fw_ver & 0xffff);
@@ -2713,6 +2733,26 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.get_dev_fw_str    = get_fw_ver_str;
 	ibdev->ib_dev.get_dev_fw_str    = get_fw_ver_str;
 	ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
 	ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
 
 
+	if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) &&
+	    ((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) ==
+	    IB_LINK_LAYER_ETHERNET) ||
+	    (mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) ==
+	    IB_LINK_LAYER_ETHERNET))) {
+		ibdev->ib_dev.create_wq		= mlx4_ib_create_wq;
+		ibdev->ib_dev.modify_wq		= mlx4_ib_modify_wq;
+		ibdev->ib_dev.destroy_wq	= mlx4_ib_destroy_wq;
+		ibdev->ib_dev.create_rwq_ind_table  =
+			mlx4_ib_create_rwq_ind_table;
+		ibdev->ib_dev.destroy_rwq_ind_table =
+			mlx4_ib_destroy_rwq_ind_table;
+		ibdev->ib_dev.uverbs_ex_cmd_mask |=
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ)	  |
+			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ)	  |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ)	  |
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
+	}
+
 	if (!mlx4_is_slave(ibdev->dev)) {
 	if (!mlx4_is_slave(ibdev->dev)) {
 		ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
 		ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
 		ibdev->ib_dev.map_phys_fmr	= mlx4_ib_map_phys_fmr;
 		ibdev->ib_dev.map_phys_fmr	= mlx4_ib_map_phys_fmr;
@@ -2772,7 +2812,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		allocated = 0;
 		allocated = 0;
 		if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
 		if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
 						IB_LINK_LAYER_ETHERNET) {
 						IB_LINK_LAYER_ETHERNET) {
-			err = mlx4_counter_alloc(ibdev->dev, &counter_index);
+			err = mlx4_counter_alloc(ibdev->dev, &counter_index,
+						 MLX4_RES_USAGE_DRIVER);
 			/* if failed to allocate a new counter, use default */
 			/* if failed to allocate a new counter, use default */
 			if (err)
 			if (err)
 				counter_index =
 				counter_index =
@@ -2827,7 +2868,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
 		ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
 		err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
 		err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
 					    MLX4_IB_UC_STEER_QPN_ALIGN,
 					    MLX4_IB_UC_STEER_QPN_ALIGN,
-					    &ibdev->steer_qpn_base, 0);
+					    &ibdev->steer_qpn_base, 0,
+					    MLX4_RES_USAGE_DRIVER);
 		if (err)
 		if (err)
 			goto err_counter;
 			goto err_counter;
 
 

+ 41 - 1
drivers/infiniband/hw/mlx4/mlx4_ib.h

@@ -46,6 +46,7 @@
 
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/doorbell.h>
 #include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/qp.h>
 
 
 #define MLX4_IB_DRV_NAME	"mlx4_ib"
 #define MLX4_IB_DRV_NAME	"mlx4_ib"
 
 
@@ -88,6 +89,8 @@ struct mlx4_ib_ucontext {
 	struct list_head	db_page_list;
 	struct list_head	db_page_list;
 	struct mutex		db_page_mutex;
 	struct mutex		db_page_mutex;
 	struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
 	struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
+	struct list_head	wqn_ranges_list;
+	struct mutex		wqn_ranges_mutex; /* protect wqn_ranges_list */
 };
 };
 
 
 struct mlx4_ib_pd {
 struct mlx4_ib_pd {
@@ -289,8 +292,25 @@ struct mlx4_roce_smac_vlan_info {
 	int update_vid;
 	int update_vid;
 };
 };
 
 
+struct mlx4_wqn_range {
+	int			base_wqn;
+	int			size;
+	int			refcount;
+	bool			dirty;
+	struct list_head	list;
+};
+
+struct mlx4_ib_rss {
+	unsigned int		base_qpn_tbl_sz;
+	u8			flags;
+	u8			rss_key[MLX4_EN_RSS_KEY_SIZE];
+};
+
 struct mlx4_ib_qp {
 struct mlx4_ib_qp {
-	struct ib_qp		ibqp;
+	union {
+		struct ib_qp	ibqp;
+		struct ib_wq	ibwq;
+	};
 	struct mlx4_qp		mqp;
 	struct mlx4_qp		mqp;
 	struct mlx4_buf		buf;
 	struct mlx4_buf		buf;
 
 
@@ -318,6 +338,7 @@ struct mlx4_ib_qp {
 	u8			sq_no_prefetch;
 	u8			sq_no_prefetch;
 	u8			state;
 	u8			state;
 	int			mlx_type;
 	int			mlx_type;
+	u32			inl_recv_sz;
 	struct list_head	gid_list;
 	struct list_head	gid_list;
 	struct list_head	steering_rules;
 	struct list_head	steering_rules;
 	struct mlx4_ib_buf	*sqp_proxy_rcv;
 	struct mlx4_ib_buf	*sqp_proxy_rcv;
@@ -328,6 +349,10 @@ struct mlx4_ib_qp {
 	struct list_head	cq_recv_list;
 	struct list_head	cq_recv_list;
 	struct list_head	cq_send_list;
 	struct list_head	cq_send_list;
 	struct counter_index	*counter_index;
 	struct counter_index	*counter_index;
+	struct mlx4_wqn_range	*wqn_range;
+	/* Number of RSS QP parents that use this WQ */
+	u32			rss_usecnt;
+	struct mlx4_ib_rss	*rss_ctx;
 };
 };
 
 
 struct mlx4_ib_srq {
 struct mlx4_ib_srq {
@@ -623,6 +648,8 @@ struct mlx4_uverbs_ex_query_device_resp {
 	__u32 comp_mask;
 	__u32 comp_mask;
 	__u32 response_length;
 	__u32 response_length;
 	__u64 hca_core_clock_offset;
 	__u64 hca_core_clock_offset;
+	__u32 max_inl_recv_sz;
+	__u32 reserved;
 };
 };
 
 
 static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
 static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
@@ -890,4 +917,17 @@ void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev,
 
 
 void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port);
 void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port);
 
 
+struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
+				struct ib_wq_init_attr *init_attr,
+				struct ib_udata *udata);
+int mlx4_ib_destroy_wq(struct ib_wq *wq);
+int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+		      u32 wq_attr_mask, struct ib_udata *udata);
+
+struct ib_rwq_ind_table
+*mlx4_ib_create_rwq_ind_table(struct ib_device *device,
+			      struct ib_rwq_ind_table_init_attr *init_attr,
+			      struct ib_udata *udata);
+int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
+
 #endif /* MLX4_IB_H */
 #endif /* MLX4_IB_H */

+ 963 - 68
drivers/infiniband/hw/mlx4/qp.c

@@ -53,6 +53,7 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
 			     struct mlx4_ib_cq *recv_cq);
 			     struct mlx4_ib_cq *recv_cq);
 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
 			       struct mlx4_ib_cq *recv_cq);
 			       struct mlx4_ib_cq *recv_cq);
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state);
 
 
 enum {
 enum {
 	MLX4_IB_ACK_REQ_FREQ	= 8,
 	MLX4_IB_ACK_REQ_FREQ	= 8,
@@ -116,6 +117,11 @@ static const __be32 mlx4_ib_opcode[] = {
 	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
 	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
 };
 };
 
 
+enum mlx4_ib_source_type {
+	MLX4_IB_QP_SRC	= 0,
+	MLX4_IB_RWQ_SRC	= 1,
+};
+
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
 {
 {
 	return container_of(mqp, struct mlx4_ib_sqp, qp);
 	return container_of(mqp, struct mlx4_ib_sqp, qp);
@@ -330,6 +336,12 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
 	}
 	}
 }
 }
 
 
+static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+	pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n",
+			    type, qp->qpn);
+}
+
 static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
 static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
 {
 {
 	/*
 	/*
@@ -377,7 +389,8 @@ static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
 }
 }
 
 
 static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-		       int is_user, int has_rq, struct mlx4_ib_qp *qp)
+		       int is_user, int has_rq, struct mlx4_ib_qp *qp,
+		       u32 inl_recv_sz)
 {
 {
 	/* Sanity check RQ size before proceeding */
 	/* Sanity check RQ size before proceeding */
 	if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
 	if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
@@ -385,18 +398,24 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 		return -EINVAL;
 		return -EINVAL;
 
 
 	if (!has_rq) {
 	if (!has_rq) {
-		if (cap->max_recv_wr)
+		if (cap->max_recv_wr || inl_recv_sz)
 			return -EINVAL;
 			return -EINVAL;
 
 
 		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
 		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
 	} else {
 	} else {
+		u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg *
+			sizeof(struct mlx4_wqe_data_seg);
+		u32 wqe_size;
+
 		/* HW requires >= 1 RQ entry with >= 1 gather entry */
 		/* HW requires >= 1 RQ entry with >= 1 gather entry */
-		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
+		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge ||
+				inl_recv_sz > max_inl_recv_sz))
 			return -EINVAL;
 			return -EINVAL;
 
 
 		qp->rq.wqe_cnt	 = roundup_pow_of_two(max(1U, cap->max_recv_wr));
 		qp->rq.wqe_cnt	 = roundup_pow_of_two(max(1U, cap->max_recv_wr));
 		qp->rq.max_gs	 = roundup_pow_of_two(max(1U, cap->max_recv_sge));
 		qp->rq.max_gs	 = roundup_pow_of_two(max(1U, cap->max_recv_sge));
-		qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
+		wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg);
+		qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz));
 	}
 	}
 
 
 	/* leave userspace return values as they were, so as not to break ABI */
 	/* leave userspace return values as they were, so as not to break ABI */
@@ -632,7 +651,297 @@ static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev,
 	qp->counter_index = NULL;
 	qp->counter_index = NULL;
 }
 }
 
 
+static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx,
+		      struct ib_qp_init_attr *init_attr,
+		      struct mlx4_ib_create_qp_rss *ucmd)
+{
+	rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num |
+		(init_attr->rwq_ind_tbl->log_ind_tbl_size << 24);
+
+	if ((ucmd->rx_hash_function == MLX4_IB_RX_HASH_FUNC_TOEPLITZ) &&
+	    (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) {
+		memcpy(rss_ctx->rss_key, ucmd->rx_hash_key,
+		       MLX4_EN_RSS_KEY_SIZE);
+	} else {
+		pr_debug("RX Hash function is not supported\n");
+		return (-EOPNOTSUPP);
+	}
+
+	if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) &&
+	    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) {
+		rss_ctx->flags = MLX4_RSS_IPV4;
+	} else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) ||
+		   (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) {
+		pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n");
+		return (-EOPNOTSUPP);
+	}
+
+	if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) &&
+	    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) {
+		rss_ctx->flags |= MLX4_RSS_IPV6;
+	} else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) ||
+		   (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) {
+		pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n");
+		return (-EOPNOTSUPP);
+	}
+
+	if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) &&
+	    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) {
+		if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) {
+			pr_debug("RX Hash fields_mask for UDP is not supported\n");
+			return (-EOPNOTSUPP);
+		}
+
+		if (rss_ctx->flags & MLX4_RSS_IPV4) {
+			rss_ctx->flags |= MLX4_RSS_UDP_IPV4;
+		} else if (rss_ctx->flags & MLX4_RSS_IPV6) {
+			rss_ctx->flags |= MLX4_RSS_UDP_IPV6;
+		} else {
+			pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n");
+			return (-EOPNOTSUPP);
+		}
+	} else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) ||
+		   (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) {
+		pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n");
+		return (-EOPNOTSUPP);
+	}
+
+	if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) &&
+	    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) {
+		if (rss_ctx->flags & MLX4_RSS_IPV4) {
+			rss_ctx->flags |= MLX4_RSS_TCP_IPV4;
+		} else if (rss_ctx->flags & MLX4_RSS_IPV6) {
+			rss_ctx->flags |= MLX4_RSS_TCP_IPV6;
+		} else {
+			pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n");
+			return (-EOPNOTSUPP);
+		}
+
+	} else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) ||
+		   (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) {
+		pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n");
+		return (-EOPNOTSUPP);
+	}
+
+	return 0;
+}
+
+static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_pd *ibpd,
+			 struct ib_qp_init_attr *init_attr,
+			 struct mlx4_ib_create_qp_rss *ucmd,
+			 struct mlx4_ib_qp *qp)
+{
+	int qpn;
+	int err;
+
+	qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
+
+	err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage);
+	if (err)
+		return err;
+
+	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
+	if (err)
+		goto err_qpn;
+
+	mutex_init(&qp->mutex);
+
+	INIT_LIST_HEAD(&qp->gid_list);
+	INIT_LIST_HEAD(&qp->steering_rules);
+
+	qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_ETHERTYPE;
+	qp->state = IB_QPS_RESET;
+
+	/* Set dummy send resources to be compatible with HV and PRM */
+	qp->sq_no_prefetch = 1;
+	qp->sq.wqe_cnt = 1;
+	qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
+	qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE;
+	qp->mtt = (to_mqp(
+		   (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt;
+
+	qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL);
+	if (!qp->rss_ctx) {
+		err = -ENOMEM;
+		goto err_qp_alloc;
+	}
+
+	err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd);
+	if (err)
+		goto err;
+
+	return 0;
+
+err:
+	kfree(qp->rss_ctx);
+
+err_qp_alloc:
+	mlx4_qp_remove(dev->dev, &qp->mqp);
+	mlx4_qp_free(dev->dev, &qp->mqp);
+
+err_qpn:
+	mlx4_qp_release_range(dev->dev, qpn, 1);
+	return err;
+}
+
+static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd,
+					    struct ib_qp_init_attr *init_attr,
+					    struct ib_udata *udata)
+{
+	struct mlx4_ib_qp *qp;
+	struct mlx4_ib_create_qp_rss ucmd = {};
+	size_t required_cmd_sz;
+	int err;
+
+	if (!udata) {
+		pr_debug("RSS QP with NULL udata\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (udata->outlen)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	required_cmd_sz = offsetof(typeof(ucmd), reserved1) +
+					sizeof(ucmd.reserved1);
+	if (udata->inlen < required_cmd_sz) {
+		pr_debug("invalid inlen\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
+		pr_debug("copy failed\n");
+		return ERR_PTR(-EFAULT);
+	}
+
+	if (ucmd.comp_mask || ucmd.reserved1)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (udata->inlen > sizeof(ucmd) &&
+	    !ib_is_udata_cleared(udata, sizeof(ucmd),
+				 udata->inlen - sizeof(ucmd))) {
+		pr_debug("inlen is not supported\n");
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
+		pr_debug("RSS QP with unsupported QP type %d\n",
+			 init_attr->qp_type);
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	if (init_attr->create_flags) {
+		pr_debug("RSS QP doesn't support create flags\n");
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	if (init_attr->send_cq || init_attr->cap.max_send_wr) {
+		pr_debug("RSS QP with unsupported send attributes\n");
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp)
+		return ERR_PTR(-ENOMEM);
+
+	qp->pri.vid = 0xFFFF;
+	qp->alt.vid = 0xFFFF;
+
+	err = create_qp_rss(to_mdev(pd->device), pd, init_attr, &ucmd, qp);
+	if (err) {
+		kfree(qp);
+		return ERR_PTR(err);
+	}
+
+	qp->ibqp.qp_num = qp->mqp.qpn;
+
+	return &qp->ibqp;
+}
+
+/*
+ * This function allocates a WQN from a range which is consecutive and aligned
+ * to its size. In case the range is full, then it creates a new range and
+ * allocates WQN from it. The new range will be used for following allocations.
+ */
+static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context,
+			     struct mlx4_ib_qp *qp, int range_size, int *wqn)
+{
+	struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device);
+	struct mlx4_wqn_range *range;
+	int err = 0;
+
+	mutex_lock(&context->wqn_ranges_mutex);
+
+	range = list_first_entry_or_null(&context->wqn_ranges_list,
+					 struct mlx4_wqn_range, list);
+
+	if (!range || (range->refcount == range->size) || range->dirty) {
+		range = kzalloc(sizeof(*range), GFP_KERNEL);
+		if (!range) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = mlx4_qp_reserve_range(dev->dev, range_size,
+					    range_size, &range->base_wqn, 0,
+					    qp->mqp.usage);
+		if (err) {
+			kfree(range);
+			goto out;
+		}
+
+		range->size = range_size;
+		list_add(&range->list, &context->wqn_ranges_list);
+	} else if (range_size != 1) {
+		/*
+		 * Requesting a new range (>1) while the last range is still
+		 * open is not valid.
+		 */
+		err = -EINVAL;
+		goto out;
+	}
+
+	qp->wqn_range = range;
+
+	*wqn = range->base_wqn + range->refcount;
+
+	range->refcount++;
+
+out:
+	mutex_unlock(&context->wqn_ranges_mutex);
+
+	return err;
+}
+
+static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context,
+				struct mlx4_ib_qp *qp, bool dirty_release)
+{
+	struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device);
+	struct mlx4_wqn_range *range;
+
+	mutex_lock(&context->wqn_ranges_mutex);
+
+	range = qp->wqn_range;
+
+	range->refcount--;
+	if (!range->refcount) {
+		mlx4_qp_release_range(dev->dev, range->base_wqn,
+				      range->size);
+		list_del(&range->list);
+		kfree(range);
+	} else if (dirty_release) {
+	/*
+	 * A range in which one of the WQNs has been destroyed cannot be
+	 * reused for further WQN allocations.
+	 * The next created WQ will allocate a new range.
+	 */
+		range->dirty = 1;
+	}
+
+	mutex_unlock(&context->wqn_ranges_mutex);
+}
+
 static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+			    enum mlx4_ib_source_type src,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, int sqpn,
 			    struct ib_udata *udata, int sqpn,
 			    struct mlx4_ib_qp **caller_qp)
 			    struct mlx4_ib_qp **caller_qp)
@@ -645,6 +954,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
 	struct mlx4_ib_cq *mcq;
 	struct mlx4_ib_cq *mcq;
 	unsigned long flags;
 	unsigned long flags;
+	int range_size = 0;
 
 
 	/* When tunneling special qps, we use a plain UD qp */
 	/* When tunneling special qps, we use a plain UD qp */
 	if (sqpn) {
 	if (sqpn) {
@@ -719,26 +1029,71 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
 		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
 
-	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp);
-	if (err)
-		goto err;
 
 
 	if (pd->uobject) {
 	if (pd->uobject) {
-		struct mlx4_ib_create_qp ucmd;
+		union {
+			struct mlx4_ib_create_qp qp;
+			struct mlx4_ib_create_wq wq;
+		} ucmd;
+		size_t copy_len;
 
 
-		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+		copy_len = (src == MLX4_IB_QP_SRC) ?
+			   sizeof(struct mlx4_ib_create_qp) :
+			   min(sizeof(struct mlx4_ib_create_wq), udata->inlen);
+
+		if (ib_copy_from_udata(&ucmd, udata, copy_len)) {
 			err = -EFAULT;
 			err = -EFAULT;
 			goto err;
 			goto err;
 		}
 		}
 
 
-		qp->sq_no_prefetch = ucmd.sq_no_prefetch;
+		if (src == MLX4_IB_RWQ_SRC) {
+			if (ucmd.wq.comp_mask || ucmd.wq.reserved1 ||
+			    ucmd.wq.reserved[0] || ucmd.wq.reserved[1] ||
+			    ucmd.wq.reserved[2]) {
+				pr_debug("user command isn't supported\n");
+				err = -EOPNOTSUPP;
+				goto err;
+			}
+
+			if (ucmd.wq.log_range_size >
+			    ilog2(dev->dev->caps.max_rss_tbl_sz)) {
+				pr_debug("WQN range size must be equal or smaller than %d\n",
+					 dev->dev->caps.max_rss_tbl_sz);
+				err = -EOPNOTSUPP;
+				goto err;
+			}
+			range_size = 1 << ucmd.wq.log_range_size;
+		} else {
+			qp->inl_recv_sz = ucmd.qp.inl_recv_sz;
+		}
 
 
-		err = set_user_sq_size(dev, qp, &ucmd);
+		err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+				  qp_has_rq(init_attr), qp, qp->inl_recv_sz);
 		if (err)
 		if (err)
 			goto err;
 			goto err;
 
 
-		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
-				       qp->buf_size, 0, 0);
+		if (src == MLX4_IB_QP_SRC) {
+			qp->sq_no_prefetch = ucmd.qp.sq_no_prefetch;
+
+			err = set_user_sq_size(dev, qp,
+					       (struct mlx4_ib_create_qp *)
+					       &ucmd);
+			if (err)
+				goto err;
+		} else {
+			qp->sq_no_prefetch = 1;
+			qp->sq.wqe_cnt = 1;
+			qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
+			/* The allocated buffer is expected to have at least
+			 * that SQ size.
+			 */
+			qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+				(qp->sq.wqe_cnt << qp->sq.wqe_shift);
+		}
+
+		qp->umem = ib_umem_get(pd->uobject->context,
+				(src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr :
+				ucmd.wq.buf_addr, qp->buf_size, 0, 0);
 		if (IS_ERR(qp->umem)) {
 		if (IS_ERR(qp->umem)) {
 			err = PTR_ERR(qp->umem);
 			err = PTR_ERR(qp->umem);
 			goto err;
 			goto err;
@@ -755,11 +1110,18 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 
 
 		if (qp_has_rq(init_attr)) {
 		if (qp_has_rq(init_attr)) {
 			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
 			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
-						  ucmd.db_addr, &qp->db);
+				(src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr :
+				ucmd.wq.db_addr, &qp->db);
 			if (err)
 			if (err)
 				goto err_mtt;
 				goto err_mtt;
 		}
 		}
+		qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
 	} else {
 	} else {
+		err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+				  qp_has_rq(init_attr), qp, 0);
+		if (err)
+			goto err;
+
 		qp->sq_no_prefetch = 0;
 		qp->sq_no_prefetch = 0;
 
 
 		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
 		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
@@ -826,6 +1188,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			err = -ENOMEM;
 			err = -ENOMEM;
 			goto err_wrid;
 			goto err_wrid;
 		}
 		}
+		qp->mqp.usage = MLX4_RES_USAGE_DRIVER;
 	}
 	}
 
 
 	if (sqpn) {
 	if (sqpn) {
@@ -836,6 +1199,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 				goto err_wrid;
 				goto err_wrid;
 			}
 			}
 		}
 		}
+	} else if (src == MLX4_IB_RWQ_SRC) {
+		err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp,
+					range_size, &qpn);
+		if (err)
+			goto err_wrid;
 	} else {
 	} else {
 		/* Raw packet QPNs may not have bits 6,7 set in their qp_num;
 		/* Raw packet QPNs may not have bits 6,7 set in their qp_num;
 		 * otherwise, the WQE BlueFlame setup flow wrongly causes
 		 * otherwise, the WQE BlueFlame setup flow wrongly causes
@@ -845,13 +1213,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 						    (init_attr->cap.max_send_wr ?
 						    (init_attr->cap.max_send_wr ?
 						     MLX4_RESERVE_ETH_BF_QP : 0) |
 						     MLX4_RESERVE_ETH_BF_QP : 0) |
 						    (init_attr->cap.max_recv_wr ?
 						    (init_attr->cap.max_recv_wr ?
-						     MLX4_RESERVE_A0_QP : 0));
+						     MLX4_RESERVE_A0_QP : 0),
+						    qp->mqp.usage);
 		else
 		else
 			if (qp->flags & MLX4_IB_QP_NETIF)
 			if (qp->flags & MLX4_IB_QP_NETIF)
 				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
 				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
 			else
 			else
 				err = mlx4_qp_reserve_range(dev->dev, 1, 1,
 				err = mlx4_qp_reserve_range(dev->dev, 1, 1,
-							    &qpn, 0);
+							    &qpn, 0, qp->mqp.usage);
 		if (err)
 		if (err)
 			goto err_proxy;
 			goto err_proxy;
 	}
 	}
@@ -873,7 +1242,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 	 */
 	 */
 	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
 	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
 
 
-	qp->mqp.event = mlx4_ib_qp_event;
+	qp->mqp.event = (src == MLX4_IB_QP_SRC) ? mlx4_ib_qp_event :
+						  mlx4_ib_wq_event;
+
 	if (!*caller_qp)
 	if (!*caller_qp)
 		*caller_qp = qp;
 		*caller_qp = qp;
 
 
@@ -900,6 +1271,9 @@ err_qpn:
 	if (!sqpn) {
 	if (!sqpn) {
 		if (qp->flags & MLX4_IB_QP_NETIF)
 		if (qp->flags & MLX4_IB_QP_NETIF)
 			mlx4_ib_steer_qp_free(dev, qpn, 1);
 			mlx4_ib_steer_qp_free(dev, qpn, 1);
+		else if (src == MLX4_IB_RWQ_SRC)
+			mlx4_ib_release_wqn(to_mucontext(pd->uobject->context),
+					    qp, 0);
 		else
 		else
 			mlx4_qp_release_range(dev->dev, qpn, 1);
 			mlx4_qp_release_range(dev->dev, qpn, 1);
 	}
 	}
@@ -998,7 +1372,7 @@ static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp)
 		return to_mpd(qp->ibqp.pd);
 		return to_mpd(qp->ibqp.pd);
 }
 }
 
 
-static void get_cqs(struct mlx4_ib_qp *qp,
+static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src,
 		    struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq)
 		    struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq)
 {
 {
 	switch (qp->ibqp.qp_type) {
 	switch (qp->ibqp.qp_type) {
@@ -1011,14 +1385,46 @@ static void get_cqs(struct mlx4_ib_qp *qp,
 		*recv_cq = *send_cq;
 		*recv_cq = *send_cq;
 		break;
 		break;
 	default:
 	default:
-		*send_cq = to_mcq(qp->ibqp.send_cq);
-		*recv_cq = to_mcq(qp->ibqp.recv_cq);
+		*recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) :
+						     to_mcq(qp->ibwq.cq);
+		*send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) :
+						     *recv_cq;
 		break;
 		break;
 	}
 	}
 }
 }
 
 
+static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	if (qp->state != IB_QPS_RESET) {
+		int i;
+
+		for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size);
+		     i++) {
+			struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i];
+			struct mlx4_ib_qp *wq =	to_mqp((struct ib_qp *)ibwq);
+
+			mutex_lock(&wq->mutex);
+
+			wq->rss_usecnt--;
+
+			mutex_unlock(&wq->mutex);
+		}
+
+		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+			pr_warn("modify QP %06x to RESET failed.\n",
+				qp->mqp.qpn);
+	}
+
+	mlx4_qp_remove(dev->dev, &qp->mqp);
+	mlx4_qp_free(dev->dev, &qp->mqp);
+	mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+	del_gid_entries(qp);
+	kfree(qp->rss_ctx);
+}
+
 static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
-			      int is_user)
+			      enum mlx4_ib_source_type src, int is_user)
 {
 {
 	struct mlx4_ib_cq *send_cq, *recv_cq;
 	struct mlx4_ib_cq *send_cq, *recv_cq;
 	unsigned long flags;
 	unsigned long flags;
@@ -1051,7 +1457,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 		}
 		}
 	}
 	}
 
 
-	get_cqs(qp, &send_cq, &recv_cq);
+	get_cqs(qp, src, &send_cq, &recv_cq);
 
 
 	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
 	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
 	mlx4_ib_lock_cqs(send_cq, recv_cq);
 	mlx4_ib_lock_cqs(send_cq, recv_cq);
@@ -1077,6 +1483,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 	if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) {
 	if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) {
 		if (qp->flags & MLX4_IB_QP_NETIF)
 		if (qp->flags & MLX4_IB_QP_NETIF)
 			mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
 			mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
+		else if (src == MLX4_IB_RWQ_SRC)
+			mlx4_ib_release_wqn(to_mucontext(
+					    qp->ibwq.uobject->context), qp, 1);
 		else
 		else
 			mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
 			mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
 	}
 	}
@@ -1084,9 +1493,12 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
 
 	if (is_user) {
 	if (is_user) {
-		if (qp->rq.wqe_cnt)
-			mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
-					      &qp->db);
+		if (qp->rq.wqe_cnt) {
+			struct mlx4_ib_ucontext *mcontext = !src ?
+				to_mucontext(qp->ibqp.uobject->context) :
+				to_mucontext(qp->ibwq.uobject->context);
+			mlx4_ib_db_unmap_user(mcontext, &qp->db);
+		}
 		ib_umem_release(qp->umem);
 		ib_umem_release(qp->umem);
 	} else {
 	} else {
 		kvfree(qp->sq.wrid);
 		kvfree(qp->sq.wrid);
@@ -1128,6 +1540,9 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
 	int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
 	int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
 	u16 xrcdn = 0;
 	u16 xrcdn = 0;
 
 
+	if (init_attr->rwq_ind_tbl)
+		return _mlx4_ib_create_qp_rss(pd, init_attr, udata);
+
 	/*
 	/*
 	 * We only support LSO, vendor flag1, and multicast loopback blocking,
 	 * We only support LSO, vendor flag1, and multicast loopback blocking,
 	 * and only for kernel UD QPs.
 	 * and only for kernel UD QPs.
@@ -1182,8 +1597,8 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
 		/* fall through */
 		/* fall through */
 	case IB_QPT_UD:
 	case IB_QPT_UD:
 	{
 	{
-		err = create_qp_common(to_mdev(pd->device), pd, init_attr,
-				       udata, 0, &qp);
+		err = create_qp_common(to_mdev(pd->device), pd,	MLX4_IB_QP_SRC,
+				       init_attr, udata, 0, &qp);
 		if (err) {
 		if (err) {
 			kfree(qp);
 			kfree(qp);
 			return ERR_PTR(err);
 			return ERR_PTR(err);
@@ -1203,7 +1618,9 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
 		if (udata)
 		if (udata)
 			return ERR_PTR(-EINVAL);
 			return ERR_PTR(-EINVAL);
 		if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
 		if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
-			int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0);
+			int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev,
+							1, 1, &sqpn, 0,
+							MLX4_RES_USAGE_DRIVER);
 
 
 			if (res)
 			if (res)
 				return ERR_PTR(res);
 				return ERR_PTR(res);
@@ -1211,8 +1628,8 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
 			sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
 			sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
 		}
 		}
 
 
-		err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,
-				       sqpn, &qp);
+		err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC,
+				       init_attr, udata, sqpn, &qp);
 		if (err)
 		if (err)
 			return ERR_PTR(err);
 			return ERR_PTR(err);
 
 
@@ -1267,7 +1684,6 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
 {
 {
 	struct mlx4_ib_dev *dev = to_mdev(qp->device);
 	struct mlx4_ib_dev *dev = to_mdev(qp->device);
 	struct mlx4_ib_qp *mqp = to_mqp(qp);
 	struct mlx4_ib_qp *mqp = to_mqp(qp);
-	struct mlx4_ib_pd *pd;
 
 
 	if (is_qp0(dev, mqp))
 	if (is_qp0(dev, mqp))
 		mlx4_CLOSE_PORT(dev->dev, mqp->port);
 		mlx4_CLOSE_PORT(dev->dev, mqp->port);
@@ -1282,8 +1698,14 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
 	if (mqp->counter_index)
 	if (mqp->counter_index)
 		mlx4_ib_free_qp_counter(dev, mqp);
 		mlx4_ib_free_qp_counter(dev, mqp);
 
 
-	pd = get_pd(mqp);
-	destroy_qp_common(dev, mqp, !!pd->ibpd.uobject);
+	if (qp->rwq_ind_tbl) {
+		destroy_qp_rss(dev, mqp);
+	} else {
+		struct mlx4_ib_pd *pd;
+
+		pd = get_pd(mqp);
+		destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject);
+	}
 
 
 	if (is_sqp(dev, mqp))
 	if (is_sqp(dev, mqp))
 		kfree(to_msqp(mqp));
 		kfree(to_msqp(mqp));
@@ -1566,7 +1988,7 @@ static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 	    !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK))
 	    !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK))
 		return 0;
 		return 0;
 
 
-	err = mlx4_counter_alloc(dev->dev, &tmp_idx);
+	err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER);
 	if (err)
 	if (err)
 		return err;
 		return err;
 
 
@@ -1606,12 +2028,119 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
 	}
 	}
 }
 }
 
 
-static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+/*
+ * Go over all of the RSS QP's children (WQs) and bring each WQ's HW state in
+ * line with its logical state if this RSS QP is the first RSS QP associated
+ * with the WQ.
+ *
+ * @ind_tbl:  indirection table whose WQs are brought up
+ * @port_num: port every WQ in the table gets bound to
+ *
+ * On success each WQ's rss_usecnt has been incremented.  If a WQ fails, the
+ * WQs handled before it are rolled back (usecnt dropped and, for a sole
+ * user in the RDY state, the HW state reset) and the error is returned.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num)
+{
+	int i;
+	/* Must start at 0: when no WQ needs a HW transition and no port
+	 * conflict is found, nothing below assigns err, yet it is both
+	 * tested and returned.
+	 */
+	int err = 0;
+
+	for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
+		struct ib_wq *ibwq = ind_tbl->ind_tbl[i];
+		struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
+
+		mutex_lock(&wq->mutex);
+
+		/* mlx4_ib restriction:
+		 * A WQ is associated with a port according to the RSS QP it
+		 * is associated with.
+		 * In case the WQ is already associated with a different port
+		 * by another RSS QP, return a failure.
+		 */
+		if ((wq->rss_usecnt > 0) && (wq->port != port_num)) {
+			err = -EINVAL;
+			mutex_unlock(&wq->mutex);
+			break;
+		}
+		wq->port = port_num;
+		/* First RSS user of a logically-ready WQ: apply RDY to HW */
+		if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) {
+			err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY);
+			if (err) {
+				mutex_unlock(&wq->mutex);
+				break;
+			}
+		}
+		wq->rss_usecnt++;
+
+		mutex_unlock(&wq->mutex);
+	}
+
+	/* Unwind the WQs [0, i) that were fully processed before the failure */
+	if (i && err) {
+		int j;
+
+		for (j = (i - 1); j >= 0; j--) {
+			struct ib_wq *ibwq = ind_tbl->ind_tbl[j];
+			struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
+
+			mutex_lock(&wq->mutex);
+
+			if ((wq->rss_usecnt == 1) &&
+			    (ibwq->state == IB_WQS_RDY))
+				if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET))
+					pr_warn("failed to reverse WQN=0x%06x\n",
+						ibwq->wq_num);
+			wq->rss_usecnt--;
+
+			mutex_unlock(&wq->mutex);
+		}
+	}
+
+	return err;
+}
+
+/*
+ * Drop one RSS reference from every WQ of the indirection table.  A WQ that
+ * is down to its last RSS user and is logically RDY gets its HW state moved
+ * back to RESET first; a failure of that step is only logged.
+ */
+static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl)
+{
+	int nr_wqs = 1 << ind_tbl->log_ind_tbl_size;
+	int idx;
+
+	for (idx = 0; idx < nr_wqs; idx++) {
+		struct ib_wq *cur_wq = ind_tbl->ind_tbl[idx];
+		struct mlx4_ib_qp *mqp = to_mqp((struct ib_qp *)cur_wq);
+
+		mutex_lock(&mqp->mutex);
+
+		/* Only the last user tears the HW state down */
+		if (mqp->rss_usecnt == 1 &&
+		    cur_wq->state == IB_WQS_RDY &&
+		    _mlx4_ib_modify_wq(cur_wq, IB_WQS_RESET))
+			pr_warn("failed to reverse WQN=%x\n",
+				cur_wq->wq_num);
+
+		mqp->rss_usecnt--;
+
+		mutex_unlock(&mqp->mutex);
+	}
+}
+
+/*
+ * Fill in the RSS context carried inside the QP context, located
+ * MLX4_RSS_OFFSET_IN_QPC_PRI_PATH past the start of its pri_path field,
+ * from the QP's software RSS settings (qp->rss_ctx).
+ */
+static void fill_qp_rss_context(struct mlx4_qp_context *context,
+				struct mlx4_ib_qp *qp)
+{
+	const size_t rss_off = offsetof(struct mlx4_qp_context, pri_path) +
+			       MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
+	struct mlx4_rss_context *rss = (void *)context + rss_off;
+
+	/* Currently support just toeplitz */
+	rss->hash_fn = MLX4_RSS_HASH_TOP;
+	rss->flags = qp->rss_ctx->flags;
+	memcpy(rss->rss_key, qp->rss_ctx->rss_key, MLX4_EN_RSS_KEY_SIZE);
+
+	rss->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz);
+	/* The default QPN is the low 24 bits of base_qpn_tbl_sz */
+	rss->default_qpn =
+		cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff);
+
+	/* UDP hashing reuses the default QPN as its base */
+	if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6))
+		rss->base_qpn_udp = rss->default_qpn;
+}
+
+static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
 			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
 {
 {
-	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
-	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct ib_uobject *ibuobject;
+	struct ib_srq  *ibsrq;
+	struct ib_rwq_ind_table *rwq_ind_tbl;
+	enum ib_qp_type qp_type;
+	struct mlx4_ib_dev *dev;
+	struct mlx4_ib_qp *qp;
 	struct mlx4_ib_pd *pd;
 	struct mlx4_ib_pd *pd;
 	struct mlx4_ib_cq *send_cq, *recv_cq;
 	struct mlx4_ib_cq *send_cq, *recv_cq;
 	struct mlx4_qp_context *context;
 	struct mlx4_qp_context *context;
@@ -1621,6 +2150,30 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	int err = -EINVAL;
 	int err = -EINVAL;
 	int counter_index;
 	int counter_index;
 
 
+	if (src_type == MLX4_IB_RWQ_SRC) {
+		struct ib_wq *ibwq;
+
+		ibwq	    = (struct ib_wq *)src;
+		ibuobject   = ibwq->uobject;
+		ibsrq	    = NULL;
+		rwq_ind_tbl = NULL;
+		qp_type     = IB_QPT_RAW_PACKET;
+		qp	    = to_mqp((struct ib_qp *)ibwq);
+		dev	    = to_mdev(ibwq->device);
+		pd	    = to_mpd(ibwq->pd);
+	} else {
+		struct ib_qp *ibqp;
+
+		ibqp	    = (struct ib_qp *)src;
+		ibuobject   = ibqp->uobject;
+		ibsrq	    = ibqp->srq;
+		rwq_ind_tbl = ibqp->rwq_ind_tbl;
+		qp_type     = ibqp->qp_type;
+		qp	    = to_mqp(ibqp);
+		dev	    = to_mdev(ibqp->device);
+		pd	    = get_pd(qp);
+	}
+
 	/* APM is not supported under RoCE */
 	/* APM is not supported under RoCE */
 	if (attr_mask & IB_QP_ALT_PATH &&
 	if (attr_mask & IB_QP_ALT_PATH &&
 	    rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
 	    rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
@@ -1634,6 +2187,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
 	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
 				     (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
 				     (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
 
 
+	if (rwq_ind_tbl) {
+		fill_qp_rss_context(context, qp);
+		context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET);
+	}
+
 	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
 	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
 		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
 		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
 	else {
 	else {
@@ -1651,11 +2209,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		}
 		}
 	}
 	}
 
 
-	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
+	if (qp->inl_recv_sz)
+		context->param3 |= cpu_to_be32(1 << 25);
+
+	if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI)
 		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
 		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
-	else if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+	else if (qp_type == IB_QPT_RAW_PACKET)
 		context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
 		context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
-	else if (ibqp->qp_type == IB_QPT_UD) {
+	else if (qp_type == IB_QPT_UD) {
 		if (qp->flags & MLX4_IB_QP_LSO)
 		if (qp->flags & MLX4_IB_QP_LSO)
 			context->mtu_msgmax = (IB_MTU_4096 << 5) |
 			context->mtu_msgmax = (IB_MTU_4096 << 5) |
 					      ilog2(dev->dev->caps.max_gso_sz);
 					      ilog2(dev->dev->caps.max_gso_sz);
@@ -1671,9 +2232,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			ilog2(dev->dev->caps.max_msg_sz);
 			ilog2(dev->dev->caps.max_msg_sz);
 	}
 	}
 
 
-	if (qp->rq.wqe_cnt)
-		context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
-	context->rq_size_stride |= qp->rq.wqe_shift - 4;
+	if (!rwq_ind_tbl) { /* PRM RSS receive side should be left zeros */
+		if (qp->rq.wqe_cnt)
+			context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
+		context->rq_size_stride |= qp->rq.wqe_shift - 4;
+	}
 
 
 	if (qp->sq.wqe_cnt)
 	if (qp->sq.wqe_cnt)
 		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
 		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
@@ -1685,14 +2248,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
 		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
 		context->xrcd = cpu_to_be32((u32) qp->xrcdn);
 		context->xrcd = cpu_to_be32((u32) qp->xrcdn);
-		if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+		if (qp_type == IB_QPT_RAW_PACKET)
 			context->param3 |= cpu_to_be32(1 << 30);
 			context->param3 |= cpu_to_be32(1 << 30);
 	}
 	}
 
 
-	if (qp->ibqp.uobject)
+	if (ibuobject)
 		context->usr_page = cpu_to_be32(
 		context->usr_page = cpu_to_be32(
 			mlx4_to_hw_uar_index(dev->dev,
 			mlx4_to_hw_uar_index(dev->dev,
-					     to_mucontext(ibqp->uobject->context)->uar.index));
+					     to_mucontext(ibuobject->context)
+					     ->uar.index));
 	else
 	else
 		context->usr_page = cpu_to_be32(
 		context->usr_page = cpu_to_be32(
 			mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
 			mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
@@ -1736,7 +2300,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			steer_qp = 1;
 			steer_qp = 1;
 		}
 		}
 
 
-		if (ibqp->qp_type == IB_QPT_GSI) {
+		if (qp_type == IB_QPT_GSI) {
 			enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
 			enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
 				IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE;
 				IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE;
 			u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
 			u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
@@ -1753,7 +2317,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	}
 	}
 
 
 	if (attr_mask & IB_QP_AV) {
 	if (attr_mask & IB_QP_AV) {
-		u8 port_num = mlx4_is_bonded(to_mdev(ibqp->device)->dev) ? 1 :
+		u8 port_num = mlx4_is_bonded(dev->dev) ? 1 :
 			attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
 			attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
 		union ib_gid gid;
 		union ib_gid gid;
 		struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB};
 		struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB};
@@ -1768,7 +2332,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			int index =
 			int index =
 				rdma_ah_read_grh(&attr->ah_attr)->sgid_index;
 				rdma_ah_read_grh(&attr->ah_attr)->sgid_index;
 
 
-			status = ib_get_cached_gid(ibqp->device, port_num,
+			status = ib_get_cached_gid(&dev->ib_dev, port_num,
 						   index, &gid, &gid_attr);
 						   index, &gid, &gid_attr);
 			if (!status && !memcmp(&gid, &zgid, sizeof(gid)))
 			if (!status && !memcmp(&gid, &zgid, sizeof(gid)))
 				status = -ENOENT;
 				status = -ENOENT;
@@ -1825,15 +2389,20 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
 		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
 	}
 	}
 
 
-	pd = get_pd(qp);
-	get_cqs(qp, &send_cq, &recv_cq);
-	context->pd       = cpu_to_be32(pd->pdn);
+	context->pd = cpu_to_be32(pd->pdn);
+
+	if (!rwq_ind_tbl) {
+		get_cqs(qp, src_type, &send_cq, &recv_cq);
+	} else { /* Set dummy CQs to be compatible with HV and PRM */
+		send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq);
+		recv_cq = send_cq;
+	}
 	context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
 	context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
 	context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
 	context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
 	context->params1  = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
 	context->params1  = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
 
 
 	/* Set "fast registration enabled" for all kernel QPs */
 	/* Set "fast registration enabled" for all kernel QPs */
-	if (!qp->ibqp.uobject)
+	if (!ibuobject)
 		context->params1 |= cpu_to_be32(1 << 11);
 		context->params1 |= cpu_to_be32(1 << 11);
 
 
 	if (attr_mask & IB_QP_RNR_RETRY) {
 	if (attr_mask & IB_QP_RNR_RETRY) {
@@ -1868,7 +2437,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
 		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
 	}
 	}
 
 
-	if (ibqp->srq)
+	if (ibsrq)
 		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
 		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
 
 
 	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
 	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
@@ -1899,17 +2468,19 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		optpar |= MLX4_QP_OPTPAR_Q_KEY;
 		optpar |= MLX4_QP_OPTPAR_Q_KEY;
 	}
 	}
 
 
-	if (ibqp->srq)
-		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
+	if (ibsrq)
+		context->srqn = cpu_to_be32(1 << 24 |
+					    to_msrq(ibsrq)->msrq.srqn);
 
 
-	if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+	if (qp->rq.wqe_cnt &&
+	    cur_state == IB_QPS_RESET &&
+	    new_state == IB_QPS_INIT)
 		context->db_rec_addr = cpu_to_be64(qp->db.dma);
 		context->db_rec_addr = cpu_to_be64(qp->db.dma);
 
 
 	if (cur_state == IB_QPS_INIT &&
 	if (cur_state == IB_QPS_INIT &&
 	    new_state == IB_QPS_RTR  &&
 	    new_state == IB_QPS_RTR  &&
-	    (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
-	     ibqp->qp_type == IB_QPT_UD ||
-	     ibqp->qp_type == IB_QPT_RAW_PACKET)) {
+	    (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI ||
+	     qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) {
 		context->pri_path.sched_queue = (qp->port - 1) << 6;
 		context->pri_path.sched_queue = (qp->port - 1) << 6;
 		if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
 		if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
 		    qp->mlx4_ib_qp_type &
 		    qp->mlx4_ib_qp_type &
@@ -1942,7 +2513,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		}
 		}
 	}
 	}
 
 
-	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+	if (qp_type == IB_QPT_RAW_PACKET) {
 		context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
 		context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
 					MLX4_IB_LINK_TYPE_ETH;
 					MLX4_IB_LINK_TYPE_ETH;
 		if (dev->dev->caps.tunnel_offload_mode ==  MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
 		if (dev->dev->caps.tunnel_offload_mode ==  MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
@@ -1952,7 +2523,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		}
 		}
 	}
 	}
 
 
-	if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
+	if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
 		int is_eth = rdma_port_get_link_layer(
 		int is_eth = rdma_port_get_link_layer(
 				&dev->ib_dev, qp->port) ==
 				&dev->ib_dev, qp->port) ==
 				IB_LINK_LAYER_ETHERNET;
 				IB_LINK_LAYER_ETHERNET;
@@ -1962,14 +2533,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		}
 		}
 	}
 	}
 
 
-
 	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
 	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
 	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
 	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
 		sqd_event = 1;
 		sqd_event = 1;
 	else
 	else
 		sqd_event = 0;
 		sqd_event = 0;
 
 
-	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+	if (!ibuobject &&
+	    cur_state == IB_QPS_RESET &&
+	    new_state == IB_QPS_INIT)
 		context->rlkey_roce_mode |= (1 << 4);
 		context->rlkey_roce_mode |= (1 << 4);
 
 
 	/*
 	/*
@@ -1978,7 +2550,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	 * headroom is stamped so that the hardware doesn't start
 	 * headroom is stamped so that the hardware doesn't start
 	 * processing stale work requests.
 	 * processing stale work requests.
 	 */
 	 */
-	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+	if (!ibuobject &&
+	    cur_state == IB_QPS_RESET &&
+	    new_state == IB_QPS_INIT) {
 		struct mlx4_wqe_ctrl_seg *ctrl;
 		struct mlx4_wqe_ctrl_seg *ctrl;
 		int i;
 		int i;
 
 
@@ -2035,9 +2609,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	 * entries and reinitialize the QP.
 	 * entries and reinitialize the QP.
 	 */
 	 */
 	if (new_state == IB_QPS_RESET) {
 	if (new_state == IB_QPS_RESET) {
-		if (!ibqp->uobject) {
+		if (!ibuobject) {
 			mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
 			mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
-					 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+					 ibsrq ? to_msrq(ibsrq) : NULL);
 			if (send_cq != recv_cq)
 			if (send_cq != recv_cq)
 				mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
 				mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
 
 
@@ -2148,6 +2722,11 @@ out:
 	return err;
 	return err;
 }
 }
 
 
+/* Attribute-mask bits accepted when modifying an RSS QP (a QP that has an
+ * rwq_ind_tbl); _mlx4_ib_modify_qp() rejects any other bit with -EOPNOTSUPP.
+ */
+enum {
+	MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE	|
+					      IB_QP_PORT),
+};
+
 static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 			      int attr_mask, struct ib_udata *udata)
 			      int attr_mask, struct ib_udata *udata)
 {
 {
@@ -2178,6 +2757,27 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		goto out;
 		goto out;
 	}
 	}
 
 
+	if (ibqp->rwq_ind_tbl) {
+		if (!(((cur_state == IB_QPS_RESET) &&
+		       (new_state == IB_QPS_INIT)) ||
+		      ((cur_state == IB_QPS_INIT)  &&
+		       (new_state == IB_QPS_RTR)))) {
+			pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n",
+				 ibqp->qp_num, cur_state, new_state);
+
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+
+		if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) {
+			pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n",
+				 ibqp->qp_num, attr_mask, cur_state, new_state);
+
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+	}
+
 	if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) {
 	if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) {
 		if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) {
 		if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) {
 			if ((ibqp->qp_type == IB_QPT_RC) ||
 			if ((ibqp->qp_type == IB_QPT_RC) ||
@@ -2242,7 +2842,17 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		goto out;
 		goto out;
 	}
 	}
 
 
-	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+	if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) {
+		err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num);
+		if (err)
+			goto out;
+	}
+
+	err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask,
+				  cur_state, new_state);
+
+	if (ibqp->rwq_ind_tbl && err)
+		bring_down_rss_rwqs(ibqp->rwq_ind_tbl);
 
 
 	if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
 	if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
 		attr->port_num = 1;
 		attr->port_num = 1;
@@ -3432,6 +4042,9 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
 	int mlx4_state;
 	int mlx4_state;
 	int err = 0;
 	int err = 0;
 
 
+	if (ibqp->rwq_ind_tbl)
+		return -EOPNOTSUPP;
+
 	mutex_lock(&qp->mutex);
 	mutex_lock(&qp->mutex);
 
 
 	if (qp->state == IB_QPS_RESET) {
 	if (qp->state == IB_QPS_RESET) {
@@ -3527,3 +4140,285 @@ out:
 	return err;
 	return err;
 }
 }
 
 
+/*
+ * Create a receive work queue for a user-space consumer.
+ *
+ * The WQ is backed by an mlx4_ib_qp that is set up as a RAW_PACKET QP via
+ * create_qp_common(); the WQ's CQ also serves as a dummy send CQ.  Only
+ * IB_WQT_RQ without create flags is accepted, and the call requires udata
+ * and a PD that has a uobject.
+ *
+ * Returns the ib_wq embedded in the new QP, in the RESET state, or an
+ * ERR_PTR() on failure.
+ */
+struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
+				struct ib_wq_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct mlx4_ib_create_wq ucmd;
+	struct ib_qp_init_attr qp_attr;
+	struct mlx4_ib_dev *mdev;
+	struct mlx4_ib_qp *wq_qp;
+	int min_cmd_sz;
+	int ret;
+
+	if (!udata || !pd->uobject)
+		return ERR_PTR(-EINVAL);
+
+	/* The command must reach at least through the 'reserved' field */
+	min_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+	if (udata->inlen < min_cmd_sz) {
+		pr_debug("invalid inlen\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Bytes past the known command layout must all be zero */
+	if (udata->inlen > sizeof(ucmd) &&
+	    !ib_is_udata_cleared(udata, sizeof(ucmd),
+				 udata->inlen - sizeof(ucmd))) {
+		pr_debug("inlen is not supported\n");
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	if (udata->outlen)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (init_attr->wq_type != IB_WQT_RQ) {
+		pr_debug("unsupported wq type %d\n", init_attr->wq_type);
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	if (init_attr->create_flags) {
+		pr_debug("unsupported create_flags %u\n",
+			 init_attr->create_flags);
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	wq_qp = kzalloc(sizeof(*wq_qp), GFP_KERNEL);
+	if (!wq_qp)
+		return ERR_PTR(-ENOMEM);
+
+	wq_qp->pri.vid = 0xFFFF;
+	wq_qp->alt.vid = 0xFFFF;
+
+	/* Describe the WQ as a receive-only RAW_PACKET QP */
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_type = IB_QPT_RAW_PACKET;
+	qp_attr.qp_context = init_attr->wq_context;
+	qp_attr.cap.max_recv_wr = init_attr->max_wr;
+	qp_attr.cap.max_recv_sge = init_attr->max_sge;
+	qp_attr.recv_cq = init_attr->cq;
+	qp_attr.send_cq = qp_attr.recv_cq; /* Dummy CQ */
+
+	mdev = to_mdev(pd->device);
+	ret = create_qp_common(mdev, pd, MLX4_IB_RWQ_SRC, &qp_attr,
+			       udata, 0, &wq_qp);
+	if (ret) {
+		kfree(wq_qp);
+		return ERR_PTR(ret);
+	}
+
+	wq_qp->ibwq.state = IB_WQS_RESET;
+	wq_qp->ibwq.wq_num = wq_qp->mqp.qpn;
+	wq_qp->ibwq.event_handler = init_attr->event_handler;
+
+	return &wq_qp->ibwq;
+}
+
+/* Translate a WQ state into the QP state used for the backing QP:
+ * RESET -> RESET, RDY -> RTR, anything else -> ERR.
+ */
+static int ib_wq2qp_state(enum ib_wq_state state)
+{
+	if (state == IB_WQS_RESET)
+		return IB_QPS_RESET;
+
+	if (state == IB_WQS_RDY)
+		return IB_QPS_RTR;
+
+	return IB_QPS_ERR;
+}
+
+/*
+ * Move the HW QP backing a WQ to the QP state that corresponds to
+ * @new_state (see ib_wq2qp_state()).
+ *
+ * Going to RDY takes two HW transitions: RESET->INIT (carrying the port)
+ * followed by INIT->RTR.  If the second one fails, the QP is moved back to
+ * RESET.  qp->state is updated to the HW state the QP ended up in.
+ *
+ * The callers visible in this file invoke it with the WQ's mutex held.
+ *
+ * Returns 0 on success (or when the HW is already in the requested state),
+ * otherwise the error from __mlx4_ib_modify_qp().
+ */
+static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state)
+{
+	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
+	enum ib_qp_state qp_cur_state;
+	enum ib_qp_state qp_new_state;
+	int attr_mask;
+	int err;
+
+	/* ib_qp.state represents the WQ HW state while ib_wq.state represents
+	 * the WQ logic state.
+	 */
+	qp_cur_state = qp->state;
+	qp_new_state = ib_wq2qp_state(new_state);
+
+	/* Nothing to do when the HW already is in the target state */
+	if (ib_wq2qp_state(new_state) == qp_cur_state)
+		return 0;
+
+	if (new_state == IB_WQS_RDY) {
+		struct ib_qp_attr attr = {};
+
+		/* First step towards RDY: RESET->INIT, applying the port */
+		attr.port_num = qp->port;
+		attr_mask = IB_QP_PORT;
+
+		err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr,
+					  attr_mask, IB_QPS_RESET, IB_QPS_INIT);
+		if (err) {
+			pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n",
+				 ibwq->wq_num);
+			return err;
+		}
+
+		qp_cur_state = IB_QPS_INIT;
+	}
+
+	/* Final transition into the target state; no attributes are passed */
+	attr_mask = 0;
+	err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask,
+				  qp_cur_state,  qp_new_state);
+
+	/* INIT->target failed: try to undo the RESET->INIT step from above */
+	if (err && (qp_cur_state == IB_QPS_INIT)) {
+		qp_new_state = IB_QPS_RESET;
+		if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL,
+					attr_mask, IB_QPS_INIT, IB_QPS_RESET)) {
+			pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n",
+				ibwq->wq_num);
+			/* The revert failed too, so the HW QP stays in INIT */
+			qp_new_state = IB_QPS_INIT;
+		}
+	}
+
+	/* NOTE(review): this is stored even when err is set and no revert
+	 * was attempted (qp_cur_state != INIT) - confirm that is intended.
+	 */
+	qp->state = qp_new_state;
+
+	return err;
+}
+
+/*
+ * Modify the state of a WQ on behalf of user space.
+ *
+ * The user command is validated first (minimum length, zeroed tail, no
+ * comp_mask/reserved bits, no IB_WQ_FLAGS).  The HW state is only touched
+ * when an RSS QP already uses the WQ (rss_usecnt != 0); otherwise just the
+ * logical state in ibwq->state is recorded.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
+		      u32 wq_attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_qp *mqp = to_mqp((struct ib_qp *)ibwq);
+	struct mlx4_ib_modify_wq ucmd = {};
+	enum ib_wq_state from, to;
+	size_t min_cmd_sz;
+	int ret = 0;
+
+	/* The command must reach at least through the 'reserved' field */
+	min_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+	if (udata->inlen < min_cmd_sz)
+		return -EINVAL;
+
+	/* Bytes past the known command layout must all be zero */
+	if (udata->inlen > sizeof(ucmd) &&
+	    !ib_is_udata_cleared(udata, sizeof(ucmd),
+				 udata->inlen - sizeof(ucmd)))
+		return -EOPNOTSUPP;
+
+	if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)))
+		return -EFAULT;
+
+	if (ucmd.comp_mask || ucmd.reserved)
+		return -EOPNOTSUPP;
+
+	if (wq_attr_mask & IB_WQ_FLAGS)
+		return -EOPNOTSUPP;
+
+	/* Current state: taken from the caller if given, else the WQ's own */
+	if (wq_attr_mask & IB_WQ_CUR_STATE)
+		from = wq_attr->curr_wq_state;
+	else
+		from = ibwq->state;
+
+	/* Target state: defaults to the current state */
+	if (wq_attr_mask & IB_WQ_STATE)
+		to = wq_attr->wq_state;
+	else
+		to = from;
+
+	if (from < IB_WQS_RESET || from > IB_WQS_ERR ||
+	    to < IB_WQS_RESET || to > IB_WQS_ERR)
+		return -EINVAL;
+
+	/* ERR -> RDY and RESET -> ERR are not allowed */
+	if ((to == IB_WQS_RDY && from == IB_WQS_ERR) ||
+	    (to == IB_WQS_ERR && from == IB_WQS_RESET))
+		return -EINVAL;
+
+	/* Need to protect against the parent RSS which also may modify WQ
+	 * state.
+	 */
+	mutex_lock(&mqp->mutex);
+
+	/* Can update HW state only if a RSS QP has already associated to this
+	 * WQ, so we can apply its port on the WQ.
+	 */
+	if (mqp->rss_usecnt)
+		ret = _mlx4_ib_modify_wq(ibwq, to);
+
+	if (!ret)
+		ibwq->state = to;
+
+	mutex_unlock(&mqp->mutex);
+
+	return ret;
+}
+
+/* Destroy a WQ: free its QP counter when counter_index is set, tear down
+ * the backing QP and release its memory.  Always returns 0.
+ */
+int mlx4_ib_destroy_wq(struct ib_wq *ibwq)
+{
+	struct mlx4_ib_qp *mqp = to_mqp((struct ib_qp *)ibwq);
+	struct mlx4_ib_dev *mdev = to_mdev(ibwq->device);
+
+	if (mqp->counter_index)
+		mlx4_ib_free_qp_counter(mdev, mqp);
+
+	destroy_qp_common(mdev, mqp, MLX4_IB_RWQ_SRC, 1);
+	kfree(mqp);
+
+	return 0;
+}
+
+/*
+ * Validate a request for a receive-WQ indirection table and allocate it.
+ *
+ * @device:    device whose rss_caps bound the table size
+ * @init_attr: requested table: log2 of its size and the WQ array
+ * @udata:     user command/response buffers
+ *
+ * Any input bytes supplied must be zero.  The table may hold at most
+ * max_rwq_indirection_table_size entries, the first WQN must be a multiple
+ * of the table size, and the WQNs of all entries must be consecutive.
+ *
+ * Returns the newly allocated table or an ERR_PTR() on failure.
+ */
+struct ib_rwq_ind_table
+*mlx4_ib_create_rwq_ind_table(struct ib_device *device,
+			      struct ib_rwq_ind_table_init_attr *init_attr,
+			      struct ib_udata *udata)
+{
+	struct ib_rwq_ind_table *rwq_ind_table;
+	struct mlx4_ib_create_rwq_ind_tbl_resp resp = {};
+	unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size;
+	unsigned int base_wqn;
+	size_t min_resp_len;
+	unsigned int i;	/* same signedness as ind_tbl_size */
+	int err;
+
+	if (udata->inlen > 0 &&
+	    !ib_is_udata_cleared(udata, 0,
+				 udata->inlen))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/* A response buffer is optional, but if given must fit the header */
+	min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+	if (udata->outlen && udata->outlen < min_resp_len)
+		return ERR_PTR(-EINVAL);
+
+	if (ind_tbl_size >
+	    device->attrs.rss_caps.max_rwq_indirection_table_size) {
+		/* Reports the table size itself, not its log2 */
+		pr_debug("ind_tbl_size = %u is bigger than supported = %d\n",
+			 ind_tbl_size,
+			 device->attrs.rss_caps.max_rwq_indirection_table_size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	base_wqn = init_attr->ind_tbl[0]->wq_num;
+
+	if (base_wqn % ind_tbl_size) {
+		pr_debug("WQN=0x%x isn't aligned with indirection table size\n",
+			 base_wqn);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Every following entry must carry the next WQN in sequence */
+	for (i = 1; i < ind_tbl_size; i++) {
+		if (++base_wqn != init_attr->ind_tbl[i]->wq_num) {
+			pr_debug("indirection table's WQNs aren't consecutive\n");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL);
+	if (!rwq_ind_table)
+		return ERR_PTR(-ENOMEM);
+
+	if (udata->outlen) {
+		/* The response carries nothing beyond its own length */
+		resp.response_length = offsetof(typeof(resp), response_length) +
+					sizeof(resp.response_length);
+		err = ib_copy_to_udata(udata, &resp, resp.response_length);
+		if (err)
+			goto err;
+	}
+
+	return rwq_ind_table;
+
+err:
+	kfree(rwq_ind_table);
+	return ERR_PTR(err);
+}
+
+/* Free an indirection table object such as the one allocated by
+ * mlx4_ib_create_rwq_ind_table().  Always returns 0.
+ */
+int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
+{
+	kfree(ib_rwq_ind_tbl);
+	return 0;
+}

+ 1 - 1
drivers/infiniband/hw/mlx5/Makefile

@@ -1,4 +1,4 @@
 obj-$(CONFIG_MLX5_INFINIBAND)	+= mlx5_ib.o
 obj-$(CONFIG_MLX5_INFINIBAND)	+= mlx5_ib.o
 
 
-mlx5_ib-y :=	main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o
+mlx5_ib-y :=	main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o
 mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
 mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o

+ 20 - 0
drivers/infiniband/hw/mlx5/cmd.c

@@ -57,3 +57,23 @@ int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
 	MLX5_SET(query_cong_statistics_in, in, clear, reset);
 	MLX5_SET(query_cong_statistics_in, in, clear, reset);
 	return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
 	return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
 }
 }
+
+/* Issue a QUERY_CONG_PARAMS command.  @cong_point is written into the
+ * command's cong_protocol field and the reply is placed in @out
+ * (@out_size bytes).  Returns the result of mlx5_cmd_exec().
+ */
+int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
+			       void *out, int out_size)
+{
+	u32 cmd[MLX5_ST_SZ_DW(query_cong_params_in)] = { };
+	int err;
+
+	MLX5_SET(query_cong_params_in, cmd, opcode,
+		 MLX5_CMD_OP_QUERY_CONG_PARAMS);
+	MLX5_SET(query_cong_params_in, cmd, cong_protocol, cong_point);
+
+	err = mlx5_cmd_exec(dev, cmd, sizeof(cmd), out, out_size);
+
+	return err;
+}
+
+/* Execute a caller-built command buffer @in (@in_size bytes) through
+ * mlx5_cmd_exec(); the reply goes to a local modify_cong_params_out
+ * buffer and is discarded.  Returns the result of mlx5_cmd_exec().
+ */
+int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev,
+				void *in, int in_size)
+{
+	u32 reply[MLX5_ST_SZ_DW(modify_cong_params_out)] = { };
+	int err;
+
+	err = mlx5_cmd_exec(dev, in, in_size, reply, sizeof(reply));
+
+	return err;
+}

+ 4 - 0
drivers/infiniband/hw/mlx5/cmd.h

@@ -39,4 +39,8 @@
 int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey);
 int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey);
 int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
 int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
 				bool reset, void *out, int out_size);
 				bool reset, void *out, int out_size);
+int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
+			       void *out, int out_size);
+int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev,
+				void *in, int in_size);
 #endif /* MLX5_IB_CMD_H */
 #endif /* MLX5_IB_CMD_H */

+ 421 - 0
drivers/infiniband/hw/mlx5/cong.c

@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2013-2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/debugfs.h>
+
+#include "mlx5_ib.h"
+#include "cmd.h"
+
+/*
+ * Congestion node types. The value chosen by mlx5_ib_param_to_node() is
+ * written into the cong_protocol field of the query/modify commands.
+ */
+enum mlx5_ib_cong_node_type {
+	MLX5_IB_RROCE_ECN_RP = 1,
+	MLX5_IB_RROCE_ECN_NP = 2,
+};
+
+/*
+ * debugfs file names for the congestion-control parameters.
+ * mlx5_ib_init_cong_debugfs() indexes this table with the same counter it
+ * stores as each parameter's offset, so entry order must match the
+ * MLX5_IB_DBG_CC_* values used in the switch statements below.
+ */
+static const char * const mlx5_ib_dbg_cc_name[] = {
+	"rp_clamp_tgt_rate",
+	"rp_clamp_tgt_rate_ati",
+	"rp_time_reset",
+	"rp_byte_reset",
+	"rp_threshold",
+	"rp_ai_rate",
+	"rp_hai_rate",
+	"rp_min_dec_fac",
+	"rp_min_rate",
+	"rp_rate_to_set_on_first_cnp",
+	"rp_dce_tcp_g",
+	"rp_dce_tcp_rtt",
+	"rp_rate_reduce_monitor_period",
+	"rp_initial_alpha_value",
+	"rp_gd",
+	"np_cnp_dscp",
+	"np_cnp_prio_mode",
+	"np_cnp_prio",
+};
+
+/*
+ * Per-parameter selection bits. mlx5_ib_set_cc_param_mask_val() ORs one of
+ * these into the attribute mask, which mlx5_ib_set_cc_params() then writes
+ * into the field_select part of the modify command.
+ * NOTE(review): the RP numbering skips BIT(0) and BIT(6); presumably this
+ * mirrors the device interface layout - confirm against the hardware spec.
+ */
+#define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR			BIT(1)
+#define MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR		BIT(2)
+#define MLX5_IB_RP_TIME_RESET_ATTR			BIT(3)
+#define MLX5_IB_RP_BYTE_RESET_ATTR			BIT(4)
+#define MLX5_IB_RP_THRESHOLD_ATTR			BIT(5)
+#define MLX5_IB_RP_AI_RATE_ATTR				BIT(7)
+#define MLX5_IB_RP_HAI_RATE_ATTR			BIT(8)
+#define MLX5_IB_RP_MIN_DEC_FAC_ATTR			BIT(9)
+#define MLX5_IB_RP_MIN_RATE_ATTR			BIT(10)
+#define MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR	BIT(11)
+#define MLX5_IB_RP_DCE_TCP_G_ATTR			BIT(12)
+#define MLX5_IB_RP_DCE_TCP_RTT_ATTR			BIT(13)
+#define MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR	BIT(14)
+#define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR		BIT(15)
+#define MLX5_IB_RP_GD_ATTR				BIT(16)
+
+/* Selection bits for the NP parameters (same mask field, NP node). */
+#define MLX5_IB_NP_CNP_DSCP_ATTR			BIT(3)
+#define MLX5_IB_NP_CNP_PRIO_MODE_ATTR			BIT(4)
+
+/*
+ * Map a debugfs parameter index to the congestion node it belongs to.
+ * Indices inside the RP range select the RP node; everything else is
+ * treated as an NP parameter.
+ */
+static enum mlx5_ib_cong_node_type
+mlx5_ib_param_to_node(enum mlx5_ib_dbg_cc_types param_offset)
+{
+	bool is_rp = param_offset >= MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE &&
+		     param_offset <= MLX5_IB_DBG_CC_RP_GD;
+
+	return is_rp ? MLX5_IB_RROCE_ECN_RP : MLX5_IB_RROCE_ECN_NP;
+}
+
+static u32 mlx5_get_cc_param_val(void *field, int offset)
+{
+	switch (offset) {
+	case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				clamp_tgt_rate);
+	case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				clamp_tgt_rate_after_time_inc);
+	case MLX5_IB_DBG_CC_RP_TIME_RESET:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_time_reset);
+	case MLX5_IB_DBG_CC_RP_BYTE_RESET:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_byte_reset);
+	case MLX5_IB_DBG_CC_RP_THRESHOLD:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_threshold);
+	case MLX5_IB_DBG_CC_RP_AI_RATE:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_ai_rate);
+	case MLX5_IB_DBG_CC_RP_HAI_RATE:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_hai_rate);
+	case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_min_dec_fac);
+	case MLX5_IB_DBG_CC_RP_MIN_RATE:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_min_rate);
+	case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rate_to_set_on_first_cnp);
+	case MLX5_IB_DBG_CC_RP_DCE_TCP_G:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				dce_tcp_g);
+	case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				dce_tcp_rtt);
+	case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rate_reduce_monitor_period);
+	case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				initial_alpha_value);
+	case MLX5_IB_DBG_CC_RP_GD:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_gd);
+	case MLX5_IB_DBG_CC_NP_CNP_DSCP:
+		return MLX5_GET(cong_control_r_roce_ecn_np, field,
+				cnp_dscp);
+	case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE:
+		return MLX5_GET(cong_control_r_roce_ecn_np, field,
+				cnp_prio_mode);
+	case MLX5_IB_DBG_CC_NP_CNP_PRIO:
+		return MLX5_GET(cong_control_r_roce_ecn_np, field,
+				cnp_802p_prio);
+	default:
+		return 0;
+	}
+}
+
+static void mlx5_ib_set_cc_param_mask_val(void *field, int offset,
+					  u32 var, u32 *attr_mask)
+{
+	switch (offset) {
+	case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE:
+		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 clamp_tgt_rate, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI:
+		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 clamp_tgt_rate_after_time_inc, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_TIME_RESET:
+		*attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_time_reset, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_BYTE_RESET:
+		*attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_byte_reset, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_THRESHOLD:
+		*attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_threshold, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_AI_RATE:
+		*attr_mask |= MLX5_IB_RP_AI_RATE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_ai_rate, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_HAI_RATE:
+		*attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_hai_rate, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC:
+		*attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_min_dec_fac, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_MIN_RATE:
+		*attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_min_rate, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP:
+		*attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rate_to_set_on_first_cnp, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_DCE_TCP_G:
+		*attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 dce_tcp_g, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT:
+		*attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 dce_tcp_rtt, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD:
+		*attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rate_reduce_monitor_period, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE:
+		*attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 initial_alpha_value, var);
+		break;
+	case MLX5_IB_DBG_CC_RP_GD:
+		*attr_mask |= MLX5_IB_RP_GD_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_gd, var);
+		break;
+	case MLX5_IB_DBG_CC_NP_CNP_DSCP:
+		*attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var);
+		break;
+	case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE:
+		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var);
+		break;
+	case MLX5_IB_DBG_CC_NP_CNP_PRIO:
+		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0);
+		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var);
+		break;
+	}
+}
+
+/*
+ * Query the device for the congestion parameters of the node that owns
+ * @offset and return the selected parameter through @var.
+ *
+ * Returns 0 on success, -ENOMEM if the output buffer cannot be allocated,
+ * or the error from the query command. @var is written only on success.
+ */
+static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var)
+{
+	const int out_sz = MLX5_ST_SZ_BYTES(query_cong_params_out);
+	enum mlx5_ib_cong_node_type node = mlx5_ib_param_to_node(offset);
+	void *out_buf;
+	int rc;
+
+	out_buf = kvzalloc(out_sz, GFP_KERNEL);
+	if (!out_buf)
+		return -ENOMEM;
+
+	rc = mlx5_cmd_query_cong_params(dev->mdev, node, out_buf, out_sz);
+	if (!rc) {
+		void *params = MLX5_ADDR_OF(query_cong_params_out, out_buf,
+					    congestion_parameters);
+
+		*var = mlx5_get_cc_param_val(params, offset);
+	}
+
+	kvfree(out_buf);
+	return rc;
+}
+
+/*
+ * Build and execute a MODIFY_CONG_PARAMS command that changes the single
+ * parameter selected by @offset to @var.
+ *
+ * Returns 0 on success, -ENOMEM if the input buffer cannot be allocated,
+ * or the error from the modify command.
+ */
+static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var)
+{
+	const int in_sz = MLX5_ST_SZ_BYTES(modify_cong_params_in);
+	enum mlx5_ib_cong_node_type node = mlx5_ib_param_to_node(offset);
+	u32 select_mask = 0;
+	void *params;
+	void *select;
+	void *cmd;
+	int rc;
+
+	cmd = kvzalloc(in_sz, GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	/* Command header: opcode plus the node whose parameters change. */
+	MLX5_SET(modify_cong_params_in, cmd, opcode,
+		 MLX5_CMD_OP_MODIFY_CONG_PARAMS);
+	MLX5_SET(modify_cong_params_in, cmd, cong_protocol, node);
+
+	/* Parameter value, collecting its selection bit on the way. */
+	params = MLX5_ADDR_OF(modify_cong_params_in, cmd,
+			      congestion_parameters);
+	mlx5_ib_set_cc_param_mask_val(params, offset, var, &select_mask);
+
+	/* Tell the device which parameter the command actually carries. */
+	select = MLX5_ADDR_OF(modify_cong_params_in, cmd, field_select);
+	MLX5_SET(field_select_r_roce_rp, select, field_select_r_roce_rp,
+		 select_mask);
+
+	rc = mlx5_cmd_modify_cong_params(dev->mdev, cmd, in_sz);
+
+	kvfree(cmd);
+	return rc;
+}
+
+/*
+ * debugfs write handler: parse an unsigned 32-bit number from user space
+ * (base auto-detected by kstrtou32) and program it as the congestion
+ * parameter attached to this file.
+ *
+ * Returns @count on success, -EINVAL for over-long or unparsable input,
+ * -EFAULT if the user buffer cannot be read, or the device command error.
+ */
+static ssize_t set_param(struct file *filp, const char __user *buf,
+			 size_t count, loff_t *pos)
+{
+	struct mlx5_ib_dbg_param *param = filp->private_data;
+	char text[11] = { };
+	u32 val;
+	int rc;
+
+	if (count > sizeof(text))
+		return -EINVAL;
+	if (copy_from_user(text, buf, count))
+		return -EFAULT;
+
+	/* Force termination; a write of exactly 11 bytes loses its last one. */
+	text[sizeof(text) - 1] = '\0';
+	if (kstrtou32(text, 0, &val))
+		return -EINVAL;
+
+	rc = mlx5_ib_set_cc_params(param->dev, param->offset, val);
+	if (rc)
+		return rc;
+
+	return count;
+}
+
+/*
+ * debugfs read handler: query the congestion parameter attached to this
+ * file and return it to user space as a decimal number followed by '\n'.
+ *
+ * The value is a u32, so it is printed with "%u" into a buffer large
+ * enough for "4294967295\n" plus the terminating NUL. scnprintf() returns
+ * the number of characters actually stored, and simple_read_from_buffer()
+ * honours both @count and *@pos, so neither the kernel buffer nor the
+ * user buffer can be overrun.
+ *
+ * Returns the number of bytes copied (0 at end of file), the device
+ * command error, or -EFAULT if the user buffer cannot be written.
+ */
+static ssize_t get_param(struct file *filp, char __user *buf, size_t count,
+			 loff_t *pos)
+{
+	struct mlx5_ib_dbg_param *param = filp->private_data;
+	int offset = param->offset;
+	char lbuf[12];
+	u32 var = 0;
+	int len;
+	int ret;
+
+	ret = mlx5_ib_get_cc_params(param->dev, offset, &var);
+	if (ret)
+		return ret;
+
+	len = scnprintf(lbuf, sizeof(lbuf), "%u\n", var);
+
+	return simple_read_from_buffer(buf, count, pos, lbuf, len);
+}
+
+/*
+ * File operations shared by every congestion-parameter debugfs file.
+ * simple_open makes the per-file data pointer available to set_param()
+ * and get_param() through filp->private_data.
+ */
+static const struct file_operations dbg_cc_fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= set_param,
+	.read	= get_param,
+};
+
+/*
+ * Tear down the congestion-control debugfs directory and free its
+ * bookkeeping structure. Safe to call when nothing was set up.
+ *
+ * The only precondition checked is that the structure exists: the init
+ * error path calls this after the allocation succeeded but possibly
+ * before (or because) the directory could be created, and the structure
+ * must still be freed in that case. debugfs_remove_recursive() accepts a
+ * NULL dentry, so no separate check of ->root is needed.
+ */
+void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev)
+{
+	if (!dev->dbg_cc_params)
+		return;
+
+	debugfs_remove_recursive(dev->dbg_cc_params->root);
+	kfree(dev->dbg_cc_params);
+	dev->dbg_cc_params = NULL;
+}
+
+/*
+ * Create the "cc_params" debugfs directory with one read/write file per
+ * congestion-control parameter.
+ *
+ * Nothing is created when the mlx5 debugfs root is absent or when the
+ * device lacks either the cc_query_allowed or cc_modify_allowed
+ * capability. Always returns 0: a debugfs failure is logged and undone
+ * but deliberately not reported, so it cannot fail driver load.
+ */
+int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_ib_dbg_cc_params *cc;
+	int idx;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed) ||
+	    !MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
+		return 0;
+
+	cc = kzalloc(sizeof(*cc), GFP_KERNEL);
+	if (!cc)
+		return 0;
+
+	/* Publish first so the cleanup helper can find it on failure. */
+	dev->dbg_cc_params = cc;
+
+	cc->root = debugfs_create_dir("cc_params", dev->mdev->priv.dbg_root);
+	if (!cc->root)
+		goto fail;
+
+	for (idx = 0; idx < MLX5_IB_DBG_CC_MAX; idx++) {
+		cc->params[idx].offset = idx;
+		cc->params[idx].dev = dev;
+		cc->params[idx].dentry =
+			debugfs_create_file(mlx5_ib_dbg_cc_name[idx],
+					    0600, cc->root,
+					    &cc->params[idx],
+					    &dbg_cc_fops);
+		if (!cc->params[idx].dentry)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	mlx5_ib_warn(dev, "cong debugfs failure\n");
+	mlx5_ib_cleanup_cong_debugfs(dev);
+	/*
+	 * debugfs is best effort: swallow the error rather than failing
+	 * the driver.
+	 */
+	return 0;
+}

+ 9 - 0
drivers/infiniband/hw/mlx5/ib_virt.c

@@ -96,6 +96,7 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf,
 	struct mlx5_ib_dev *dev = to_mdev(device);
 	struct mlx5_ib_dev *dev = to_mdev(device);
 	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_hca_vport_context *in;
 	struct mlx5_hca_vport_context *in;
+	struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx;
 	int err;
 	int err;
 
 
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
@@ -109,6 +110,8 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf,
 	}
 	}
 	in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY;
 	in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY;
 	err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
 	err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
+	if (!err)
+		vfs_ctx[vf].policy = in->policy;
 
 
 out:
 out:
 	kfree(in);
 	kfree(in);
@@ -151,6 +154,7 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid)
 	struct mlx5_ib_dev *dev = to_mdev(device);
 	struct mlx5_ib_dev *dev = to_mdev(device);
 	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_hca_vport_context *in;
 	struct mlx5_hca_vport_context *in;
+	struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx;
 	int err;
 	int err;
 
 
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
@@ -160,6 +164,8 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid)
 	in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID;
 	in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID;
 	in->node_guid = guid;
 	in->node_guid = guid;
 	err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
 	err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
+	if (!err)
+		vfs_ctx[vf].node_guid = guid;
 	kfree(in);
 	kfree(in);
 	return err;
 	return err;
 }
 }
@@ -169,6 +175,7 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid)
 	struct mlx5_ib_dev *dev = to_mdev(device);
 	struct mlx5_ib_dev *dev = to_mdev(device);
 	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_hca_vport_context *in;
 	struct mlx5_hca_vport_context *in;
+	struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx;
 	int err;
 	int err;
 
 
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
@@ -178,6 +185,8 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid)
 	in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID;
 	in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID;
 	in->port_guid = guid;
 	in->port_guid = guid;
 	err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
 	err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
+	if (!err)
+		vfs_ctx[vf].port_guid = guid;
 	kfree(in);
 	kfree(in);
 	return err;
 	return err;
 }
 }

+ 307 - 29
drivers/infiniband/hw/mlx5/main.c

@@ -30,6 +30,7 @@
  * SOFTWARE.
  * SOFTWARE.
  */
  */
 
 
+#include <linux/debugfs.h>
 #include <linux/highmem.h>
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/init.h>
@@ -58,6 +59,7 @@
 #include <linux/mlx5/vport.h>
 #include <linux/mlx5/vport.h>
 #include "mlx5_ib.h"
 #include "mlx5_ib.h"
 #include "cmd.h"
 #include "cmd.h"
+#include <linux/mlx5/vport.h>
 
 
 #define DRIVER_NAME "mlx5_ib"
 #define DRIVER_NAME "mlx5_ib"
 #define DRIVER_VERSION "5.0-0"
 #define DRIVER_VERSION "5.0-0"
@@ -65,7 +67,6 @@
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRIVER_VERSION);
 
 
 static char mlx5_version[] =
 static char mlx5_version[] =
 	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
 	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
@@ -97,6 +98,20 @@ mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
 	return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 	return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 }
 }
 
 
+/*
+ * Fetch the current state of @port_num via mlx5_ib_query_port().
+ * On success stores the state in @state and returns 0; on failure returns
+ * the query error and leaves @state untouched.
+ */
+static int get_port_state(struct ib_device *ibdev,
+			  u8 port_num,
+			  enum ib_port_state *state)
+{
+	struct ib_port_attr port_attr;
+	int err;
+
+	memset(&port_attr, 0, sizeof(port_attr));
+	err = mlx5_ib_query_port(ibdev, port_num, &port_attr);
+	if (err)
+		return err;
+
+	*state = port_attr.state;
+	return 0;
+}
+
 static int mlx5_netdev_event(struct notifier_block *this,
 static int mlx5_netdev_event(struct notifier_block *this,
 			     unsigned long event, void *ptr)
 			     unsigned long event, void *ptr)
 {
 {
@@ -114,6 +129,7 @@ static int mlx5_netdev_event(struct notifier_block *this,
 		write_unlock(&ibdev->roce.netdev_lock);
 		write_unlock(&ibdev->roce.netdev_lock);
 		break;
 		break;
 
 
+	case NETDEV_CHANGE:
 	case NETDEV_UP:
 	case NETDEV_UP:
 	case NETDEV_DOWN: {
 	case NETDEV_DOWN: {
 		struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
 		struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
@@ -127,10 +143,23 @@ static int mlx5_netdev_event(struct notifier_block *this,
 		if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
 		if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
 		    && ibdev->ib_active) {
 		    && ibdev->ib_active) {
 			struct ib_event ibev = { };
 			struct ib_event ibev = { };
+			enum ib_port_state port_state;
+
+			if (get_port_state(&ibdev->ib_dev, 1, &port_state))
+				return NOTIFY_DONE;
+
+			if (ibdev->roce.last_port_state == port_state)
+				return NOTIFY_DONE;
 
 
+			ibdev->roce.last_port_state = port_state;
 			ibev.device = &ibdev->ib_dev;
 			ibev.device = &ibdev->ib_dev;
-			ibev.event = (event == NETDEV_UP) ?
-				     IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+			if (port_state == IB_PORT_DOWN)
+				ibev.event = IB_EVENT_PORT_ERR;
+			else if (port_state == IB_PORT_ACTIVE)
+				ibev.event = IB_EVENT_PORT_ACTIVE;
+			else
+				return NOTIFY_DONE;
+
 			ibev.element.port_num = 1;
 			ibev.element.port_num = 1;
 			ib_dispatch_event(&ibev);
 			ib_dispatch_event(&ibev);
 		}
 		}
@@ -668,6 +697,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		props->device_cap_flags |= IB_DEVICE_UD_TSO;
 		props->device_cap_flags |= IB_DEVICE_UD_TSO;
 	}
 	}
 
 
+	if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
+	    MLX5_CAP_GEN(dev->mdev, general_notification_event))
+		props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
+
+	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
+	    MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
+		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+
 	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
 	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
 	    MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
 	    MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
 		/* Legacy bit to support old userspace libraries */
 		/* Legacy bit to support old userspace libraries */
@@ -1138,7 +1175,7 @@ static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
 	if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
 	if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, alloated %d, using %d sys pages\n",
+	mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, using %d sys pages\n",
 		    MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
 		    MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
 		    lib_uar_4k ? "yes" : "no", ref_bfregs,
 		    lib_uar_4k ? "yes" : "no", ref_bfregs,
 		    req->total_num_bfregs, *num_sys_pages);
 		    req->total_num_bfregs, *num_sys_pages);
@@ -1187,6 +1224,45 @@ static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *con
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * Allocate a transport domain for a user context and, on Ethernet ports
+ * whose firmware supports disabling local loopback, account for it:
+ * loopback is switched on when the second user transport domain appears.
+ *
+ * If switching loopback on fails, the accounting is rolled back and the
+ * freshly allocated transport domain is released, so a failing call
+ * leaves no resources behind (the caller's error path does not free it).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn)
+{
+	int err;
+
+	err = mlx5_core_alloc_transport_domain(dev->mdev, tdn);
+	if (err)
+		return err;
+
+	if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
+	    !MLX5_CAP_GEN(dev->mdev, disable_local_lb))
+		return 0;
+
+	mutex_lock(&dev->lb_mutex);
+	dev->user_td++;
+
+	if (dev->user_td == 2)
+		err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
+
+	/* Undo the count so a later allocation retries the enable. */
+	if (err)
+		dev->user_td--;
+
+	mutex_unlock(&dev->lb_mutex);
+
+	if (err)
+		mlx5_core_dealloc_transport_domain(dev->mdev, *tdn);
+
+	return err;
+}
+
+/*
+ * Release a user transport domain and, where loopback accounting applies
+ * (Ethernet port with the disable_local_lb capability), drop the count
+ * and request loopback off once fewer than two domains remain.
+ */
+static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn)
+{
+	bool tracks_lb;
+
+	mlx5_core_dealloc_transport_domain(dev->mdev, tdn);
+
+	tracks_lb = MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH &&
+		    MLX5_CAP_GEN(dev->mdev, disable_local_lb);
+	if (!tracks_lb)
+		return;
+
+	mutex_lock(&dev->lb_mutex);
+	if (--dev->user_td < 2)
+		mlx5_nic_vport_update_local_lb(dev->mdev, false);
+	mutex_unlock(&dev->lb_mutex);
+}
+
 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 						  struct ib_udata *udata)
 						  struct ib_udata *udata)
 {
 {
@@ -1295,8 +1371,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	mutex_init(&context->upd_xlt_page_mutex);
 	mutex_init(&context->upd_xlt_page_mutex);
 
 
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
-		err = mlx5_core_alloc_transport_domain(dev->mdev,
-						       &context->tdn);
+		err = mlx5_ib_alloc_transport_domain(dev, &context->tdn);
 		if (err)
 		if (err)
 			goto out_page;
 			goto out_page;
 	}
 	}
@@ -1362,7 +1437,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 
 
 out_td:
 out_td:
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
-		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
+		mlx5_ib_dealloc_transport_domain(dev, context->tdn);
 
 
 out_page:
 out_page:
 	free_page(context->upd_xlt_page);
 	free_page(context->upd_xlt_page);
@@ -1390,7 +1465,7 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 
 
 	bfregi = &context->bfregi;
 	bfregi = &context->bfregi;
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
 	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
-		mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
+		mlx5_ib_dealloc_transport_domain(dev, context->tdn);
 
 
 	free_page(context->upd_xlt_page);
 	free_page(context->upd_xlt_page);
 	deallocate_uars(dev, context);
 	deallocate_uars(dev, context);
@@ -2030,21 +2105,32 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
  */
  */
 static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
 static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
 {
 {
-	struct ib_flow_spec_eth *eth_spec;
+	union ib_flow_spec *flow_spec;
 
 
 	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
 	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
-	    ib_attr->size < sizeof(struct ib_flow_attr) +
-	    sizeof(struct ib_flow_spec_eth) ||
 	    ib_attr->num_of_specs < 1)
 	    ib_attr->num_of_specs < 1)
 		return false;
 		return false;
 
 
-	eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
-	if (eth_spec->type != IB_FLOW_SPEC_ETH ||
-	    eth_spec->size != sizeof(*eth_spec))
+	flow_spec = (union ib_flow_spec *)(ib_attr + 1);
+	if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
+		struct ib_flow_spec_ipv4 *ipv4_spec;
+
+		ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
+		if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
+			return true;
+
 		return false;
 		return false;
+	}
 
 
-	return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
-	       is_multicast_ether_addr(eth_spec->val.dst_mac);
+	if (flow_spec->type == IB_FLOW_SPEC_ETH) {
+		struct ib_flow_spec_eth *eth_spec;
+
+		eth_spec = (struct ib_flow_spec_eth *)flow_spec;
+		return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
+		       is_multicast_ether_addr(eth_spec->val.dst_mac);
+	}
+
+	return false;
 }
 }
 
 
 static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
 static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
@@ -2522,8 +2608,14 @@ unlock:
 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 {
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(ibqp);
 	int err;
 	int err;
 
 
+	if (mqp->flags & MLX5_IB_QP_UNDERLAY) {
+		mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
 	err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
 	if (err)
 	if (err)
 		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
 		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
@@ -2685,6 +2777,26 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
 	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
 	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
 }
 }
 
 
+/*
+ * Work item run after a delay-drop timeout event: count the event and
+ * re-program the delay-drop timeout on the device. If programming fails,
+ * a warning is logged and the feature is marked inactive.
+ */
+static void delay_drop_handler(struct work_struct *work)
+{
+	struct mlx5_ib_delay_drop *dd =
+		container_of(work, struct mlx5_ib_delay_drop, delay_drop_work);
+	int rc;
+
+	atomic_inc(&dd->events_cnt);
+
+	mutex_lock(&dd->lock);
+	rc = mlx5_core_set_delay_drop(dd->dev->mdev, dd->timeout);
+	if (rc) {
+		mlx5_ib_warn(dd->dev, "Failed to set delay drop, timeout=%u\n",
+			     dd->timeout);
+		dd->activate = false;
+	}
+	mutex_unlock(&dd->lock);
+}
+
 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 			  enum mlx5_dev_event event, unsigned long param)
 			  enum mlx5_dev_event event, unsigned long param)
 {
 {
@@ -2737,8 +2849,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 		ibev.event = IB_EVENT_CLIENT_REREGISTER;
 		ibev.event = IB_EVENT_CLIENT_REREGISTER;
 		port = (u8)param;
 		port = (u8)param;
 		break;
 		break;
+	case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT:
+		schedule_work(&ibdev->delay_drop.delay_drop_work);
+		goto out;
 	default:
 	default:
-		return;
+		goto out;
 	}
 	}
 
 
 	ibev.device	      = &ibdev->ib_dev;
 	ibev.device	      = &ibdev->ib_dev;
@@ -2746,7 +2861,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 
 
 	if (port < 1 || port > ibdev->num_ports) {
 	if (port < 1 || port > ibdev->num_ports) {
 		mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
 		mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
-		return;
+		goto out;
 	}
 	}
 
 
 	if (ibdev->ib_active)
 	if (ibdev->ib_active)
@@ -2754,6 +2869,9 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 
 
 	if (fatal)
 	if (fatal)
 		ibdev->ib_active = false;
 		ibdev->ib_active = false;
+
+out:
+	return;
 }
 }
 
 
 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
@@ -3167,13 +3285,13 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
 	return 0;
 	return 0;
 }
 }
 
 
-static void get_dev_fw_str(struct ib_device *ibdev, char *str,
-			   size_t str_len)
+static void get_dev_fw_str(struct ib_device *ibdev, char *str)
 {
 {
 	struct mlx5_ib_dev *dev =
 	struct mlx5_ib_dev *dev =
 		container_of(ibdev, struct mlx5_ib_dev, ib_dev);
 		container_of(ibdev, struct mlx5_ib_dev, ib_dev);
-	snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev),
-		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d",
+		 fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev),
+		 fw_rev_sub(dev->mdev));
 }
 }
 
 
 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
@@ -3313,6 +3431,17 @@ static const struct mlx5_ib_counter cong_cnts[] = {
 	INIT_CONG_COUNTER(np_cnp_sent),
 	INIT_CONG_COUNTER(np_cnp_sent),
 };
 };
 
 
+/*
+ * Extra error Q counters. They are counted and filled in only when the
+ * device reports the enhanced_error_q_counters capability.
+ */
+static const struct mlx5_ib_counter extended_err_cnts[] = {
+	INIT_Q_COUNTER(resp_local_length_error),
+	INIT_Q_COUNTER(resp_cqe_error),
+	INIT_Q_COUNTER(req_cqe_error),
+	INIT_Q_COUNTER(req_remote_invalid_request),
+	INIT_Q_COUNTER(req_remote_access_errors),
+	INIT_Q_COUNTER(resp_remote_access_errors),
+	INIT_Q_COUNTER(resp_cqe_flush_error),
+	INIT_Q_COUNTER(req_cqe_flush_error),
+};
+
 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
 {
 {
 	unsigned int i;
 	unsigned int i;
@@ -3337,6 +3466,10 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
 
 
 	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
 	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
 		num_counters += ARRAY_SIZE(retrans_q_cnts);
 		num_counters += ARRAY_SIZE(retrans_q_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
+		num_counters += ARRAY_SIZE(extended_err_cnts);
+
 	cnts->num_q_counters = num_counters;
 	cnts->num_q_counters = num_counters;
 
 
 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
@@ -3386,6 +3519,13 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
 		}
 		}
 	}
 	}
 
 
+	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
+		for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
+			names[j] = extended_err_cnts[i].name;
+			offsets[j] = extended_err_cnts[i].offset;
+		}
+	}
+
 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
 	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
 		for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
 		for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
 			names[j] = cong_cnts[i].name;
 			names[j] = cong_cnts[i].name;
@@ -3556,6 +3696,126 @@ mlx5_ib_alloc_rdma_netdev(struct ib_device *hca,
 	return netdev;
 	return netdev;
 }
 }
 
 
+static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
+{
+	if (!dev->delay_drop.dbg)
+		return;
+	debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs);
+	kfree(dev->delay_drop.dbg);
+	dev->delay_drop.dbg = NULL;
+}
+
+static void cancel_delay_drop(struct mlx5_ib_dev *dev)
+{
+	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
+		return;
+
+	cancel_work_sync(&dev->delay_drop.delay_drop_work);
+	delay_drop_debugfs_cleanup(dev);
+}
+
+static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
+				       size_t count, loff_t *pos)
+{
+	struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
+	char lbuf[20];
+	int len;
+
+	len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout);
+	return simple_read_from_buffer(buf, count, pos, lbuf, len);
+}
+
+static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf,
+					size_t count, loff_t *pos)
+{
+	struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
+	u32 timeout;
+	u32 var;
+
+	if (kstrtouint_from_user(buf, count, 0, &var))
+		return -EFAULT;
+
+	timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS *
+			1000);
+	if (timeout != var)
+		mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n",
+			    timeout);
+
+	delay_drop->timeout = timeout;
+
+	return count;
+}
+
+static const struct file_operations fops_delay_drop_timeout = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= delay_drop_timeout_write,
+	.read	= delay_drop_timeout_read,
+};
+
+static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_ib_dbg_delay_drop *dbg;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dbg = kzalloc(sizeof(*dbg), GFP_KERNEL);
+	if (!dbg)
+		return -ENOMEM;
+
+	dbg->dir_debugfs =
+		debugfs_create_dir("delay_drop",
+				   dev->mdev->priv.dbg_root);
+	if (!dbg->dir_debugfs)
+		return -ENOMEM;
+
+	dbg->events_cnt_debugfs =
+		debugfs_create_atomic_t("num_timeout_events", 0400,
+					dbg->dir_debugfs,
+					&dev->delay_drop.events_cnt);
+	if (!dbg->events_cnt_debugfs)
+		goto out_debugfs;
+
+	dbg->rqs_cnt_debugfs =
+		debugfs_create_atomic_t("num_rqs", 0400,
+					dbg->dir_debugfs,
+					&dev->delay_drop.rqs_cnt);
+	if (!dbg->rqs_cnt_debugfs)
+		goto out_debugfs;
+
+	dbg->timeout_debugfs =
+		debugfs_create_file("timeout", 0600,
+				    dbg->dir_debugfs,
+				    &dev->delay_drop,
+				    &fops_delay_drop_timeout);
+	if (!dbg->timeout_debugfs)
+		goto out_debugfs;
+
+	return 0;
+
+out_debugfs:
+	delay_drop_debugfs_cleanup(dev);
+	return -ENOMEM;
+}
+
+static void init_delay_drop(struct mlx5_ib_dev *dev)
+{
+	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
+		return;
+
+	mutex_init(&dev->delay_drop.lock);
+	dev->delay_drop.dev = dev;
+	dev->delay_drop.activate = false;
+	dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
+	INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
+	atomic_set(&dev->delay_drop.rqs_cnt, 0);
+	atomic_set(&dev->delay_drop.events_cnt, 0);
+
+	if (delay_drop_debugfs_init(dev))
+		mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n");
+}
+
 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 {
 {
 	struct mlx5_ib_dev *dev;
 	struct mlx5_ib_dev *dev;
@@ -3723,18 +3983,20 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
 	}
 	}
 
 
+	dev->ib_dev.create_flow	= mlx5_ib_create_flow;
+	dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
+	dev->ib_dev.uverbs_ex_cmd_mask |=
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+
 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
 	    IB_LINK_LAYER_ETHERNET) {
 	    IB_LINK_LAYER_ETHERNET) {
-		dev->ib_dev.create_flow	= mlx5_ib_create_flow;
-		dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
 		dev->ib_dev.create_wq	 = mlx5_ib_create_wq;
 		dev->ib_dev.create_wq	 = mlx5_ib_create_wq;
 		dev->ib_dev.modify_wq	 = mlx5_ib_modify_wq;
 		dev->ib_dev.modify_wq	 = mlx5_ib_modify_wq;
 		dev->ib_dev.destroy_wq	 = mlx5_ib_destroy_wq;
 		dev->ib_dev.destroy_wq	 = mlx5_ib_destroy_wq;
 		dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
 		dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
 		dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
 		dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
 		dev->ib_dev.uverbs_ex_cmd_mask |=
 		dev->ib_dev.uverbs_ex_cmd_mask |=
-			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
-			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
 			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
 			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
 			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
 			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
@@ -3754,6 +4016,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 		err = mlx5_enable_eth(dev);
 		err = mlx5_enable_eth(dev);
 		if (err)
 		if (err)
 			goto err_free_port;
 			goto err_free_port;
+		dev->roce.last_port_state = IB_PORT_DOWN;
 	}
 	}
 
 
 	err = create_dev_resources(&dev->devr);
 	err = create_dev_resources(&dev->devr);
@@ -3770,9 +4033,13 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 			goto err_odp;
 			goto err_odp;
 	}
 	}
 
 
+	err = mlx5_ib_init_cong_debugfs(dev);
+	if (err)
+		goto err_cnt;
+
 	dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
 	dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
 	if (!dev->mdev->priv.uar)
 	if (!dev->mdev->priv.uar)
-		goto err_cnt;
+		goto err_cong;
 
 
 	err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
 	err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
 	if (err)
 	if (err)
@@ -3790,18 +4057,25 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	if (err)
 	if (err)
 		goto err_dev;
 		goto err_dev;
 
 
+	init_delay_drop(dev);
+
 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
 		err = device_create_file(&dev->ib_dev.dev,
 		err = device_create_file(&dev->ib_dev.dev,
 					 mlx5_class_attributes[i]);
 					 mlx5_class_attributes[i]);
 		if (err)
 		if (err)
-			goto err_umrc;
+			goto err_delay_drop;
 	}
 	}
 
 
+	if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
+	    MLX5_CAP_GEN(mdev, disable_local_lb))
+		mutex_init(&dev->lb_mutex);
+
 	dev->ib_active = true;
 	dev->ib_active = true;
 
 
 	return dev;
 	return dev;
 
 
-err_umrc:
+err_delay_drop:
+	cancel_delay_drop(dev);
 	destroy_umrc_res(dev);
 	destroy_umrc_res(dev);
 
 
 err_dev:
 err_dev:
@@ -3817,6 +4091,8 @@ err_uar_page:
 	mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
 	mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
 
 
 err_cnt:
 err_cnt:
+	mlx5_ib_cleanup_cong_debugfs(dev);
+err_cong:
 	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
 	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
 		mlx5_ib_dealloc_counters(dev);
 		mlx5_ib_dealloc_counters(dev);
 
 
@@ -3846,11 +4122,13 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 	struct mlx5_ib_dev *dev = context;
 	struct mlx5_ib_dev *dev = context;
 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
 
 
+	cancel_delay_drop(dev);
 	mlx5_remove_netdev_notifier(dev);
 	mlx5_remove_netdev_notifier(dev);
 	ib_unregister_device(&dev->ib_dev);
 	ib_unregister_device(&dev->ib_dev);
 	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
 	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
 	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
 	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
 	mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
 	mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
+	mlx5_ib_cleanup_cong_debugfs(dev);
 	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
 	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
 		mlx5_ib_dealloc_counters(dev);
 		mlx5_ib_dealloc_counters(dev);
 	destroy_umrc_res(dev);
 	destroy_umrc_res(dev);

+ 76 - 3
drivers/infiniband/hw/mlx5/mlx5_ib.h

@@ -247,6 +247,10 @@ struct mlx5_ib_wq {
 	void		       *qend;
 	void		       *qend;
 };
 };
 
 
+enum mlx5_ib_wq_flags {
+	MLX5_IB_WQ_FLAGS_DELAY_DROP = 0x1,
+};
+
 struct mlx5_ib_rwq {
 struct mlx5_ib_rwq {
 	struct ib_wq		ibwq;
 	struct ib_wq		ibwq;
 	struct mlx5_core_qp	core_qp;
 	struct mlx5_core_qp	core_qp;
@@ -264,6 +268,7 @@ struct mlx5_ib_rwq {
 	u32			wqe_count;
 	u32			wqe_count;
 	u32			wqe_shift;
 	u32			wqe_shift;
 	int			wq_sig;
 	int			wq_sig;
+	u32			create_flags; /* Use enum mlx5_ib_wq_flags */
 };
 };
 
 
 enum {
 enum {
@@ -378,6 +383,7 @@ struct mlx5_ib_qp {
 	struct list_head	cq_recv_list;
 	struct list_head	cq_recv_list;
 	struct list_head	cq_send_list;
 	struct list_head	cq_send_list;
 	u32			rate_limit;
 	u32			rate_limit;
+	u32                     underlay_qpn;
 };
 };
 
 
 struct mlx5_ib_cq_buf {
 struct mlx5_ib_cq_buf {
@@ -399,6 +405,7 @@ enum mlx5_ib_qp_flags {
 	MLX5_IB_QP_CAP_SCATTER_FCS		= 1 << 7,
 	MLX5_IB_QP_CAP_SCATTER_FCS		= 1 << 7,
 	MLX5_IB_QP_RSS				= 1 << 8,
 	MLX5_IB_QP_RSS				= 1 << 8,
 	MLX5_IB_QP_CVLAN_STRIPPING		= 1 << 9,
 	MLX5_IB_QP_CVLAN_STRIPPING		= 1 << 9,
+	MLX5_IB_QP_UNDERLAY			= 1 << 10,
 };
 };
 
 
 struct mlx5_umr_wr {
 struct mlx5_umr_wr {
@@ -616,6 +623,63 @@ struct mlx5_roce {
 	struct net_device	*netdev;
 	struct net_device	*netdev;
 	struct notifier_block	nb;
 	struct notifier_block	nb;
 	atomic_t		next_port;
 	atomic_t		next_port;
+	enum ib_port_state last_port_state;
+};
+
+struct mlx5_ib_dbg_param {
+	int			offset;
+	struct mlx5_ib_dev	*dev;
+	struct dentry		*dentry;
+};
+
+enum mlx5_ib_dbg_cc_types {
+	MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE,
+	MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI,
+	MLX5_IB_DBG_CC_RP_TIME_RESET,
+	MLX5_IB_DBG_CC_RP_BYTE_RESET,
+	MLX5_IB_DBG_CC_RP_THRESHOLD,
+	MLX5_IB_DBG_CC_RP_AI_RATE,
+	MLX5_IB_DBG_CC_RP_HAI_RATE,
+	MLX5_IB_DBG_CC_RP_MIN_DEC_FAC,
+	MLX5_IB_DBG_CC_RP_MIN_RATE,
+	MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP,
+	MLX5_IB_DBG_CC_RP_DCE_TCP_G,
+	MLX5_IB_DBG_CC_RP_DCE_TCP_RTT,
+	MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD,
+	MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE,
+	MLX5_IB_DBG_CC_RP_GD,
+	MLX5_IB_DBG_CC_NP_CNP_DSCP,
+	MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE,
+	MLX5_IB_DBG_CC_NP_CNP_PRIO,
+	MLX5_IB_DBG_CC_MAX,
+};
+
+struct mlx5_ib_dbg_cc_params {
+	struct dentry			*root;
+	struct mlx5_ib_dbg_param	params[MLX5_IB_DBG_CC_MAX];
+};
+
+enum {
+	MLX5_MAX_DELAY_DROP_TIMEOUT_MS = 100,
+};
+
+struct mlx5_ib_dbg_delay_drop {
+	struct dentry		*dir_debugfs;
+	struct dentry		*rqs_cnt_debugfs;
+	struct dentry		*events_cnt_debugfs;
+	struct dentry		*timeout_debugfs;
+};
+
+struct mlx5_ib_delay_drop {
+	struct mlx5_ib_dev     *dev;
+	struct work_struct	delay_drop_work;
+	/* serialize setting of delay drop */
+	struct mutex		lock;
+	u32			timeout;
+	bool			activate;
+	atomic_t		events_cnt;
+	atomic_t		rqs_cnt;
+	struct mlx5_ib_dbg_delay_drop *dbg;
 };
 };
 
 
 struct mlx5_ib_dev {
 struct mlx5_ib_dev {
@@ -652,9 +716,15 @@ struct mlx5_ib_dev {
 	struct list_head	qp_list;
 	struct list_head	qp_list;
 	/* Array with num_ports elements */
 	/* Array with num_ports elements */
 	struct mlx5_ib_port	*port;
 	struct mlx5_ib_port	*port;
-	struct mlx5_sq_bfreg     bfreg;
-	struct mlx5_sq_bfreg     fp_bfreg;
-	u8				umr_fence;
+	struct mlx5_sq_bfreg	bfreg;
+	struct mlx5_sq_bfreg	fp_bfreg;
+	struct mlx5_ib_delay_drop	delay_drop;
+	struct mlx5_ib_dbg_cc_params	*dbg_cc_params;
+
+	/* protect the user_td */
+	struct mutex		lb_mutex;
+	u32			user_td;
+	u8			umr_fence;
 };
 };
 
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -904,6 +974,9 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
 int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
 int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
 			   int index, enum ib_gid_type *gid_type);
 			   int index, enum ib_gid_type *gid_type);
 
 
+void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev);
+int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev);
+
 /* GSI QP helper functions */
 /* GSI QP helper functions */
 struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd,
 struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd,
 				    struct ib_qp_init_attr *init_attr);
 				    struct ib_qp_init_attr *init_attr);

+ 13 - 5
drivers/infiniband/hw/mlx5/mr.c

@@ -48,6 +48,7 @@ enum {
 #define MLX5_UMR_ALIGN 2048
 #define MLX5_UMR_ALIGN 2048
 
 
 static int clean_mr(struct mlx5_ib_mr *mr);
 static int clean_mr(struct mlx5_ib_mr *mr);
+static int max_umr_order(struct mlx5_ib_dev *dev);
 static int use_umr(struct mlx5_ib_dev *dev, int order);
 static int use_umr(struct mlx5_ib_dev *dev, int order);
 static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 
 
@@ -491,16 +492,18 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_ib_mr *mr = NULL;
 	struct mlx5_ib_mr *mr = NULL;
 	struct mlx5_cache_ent *ent;
 	struct mlx5_cache_ent *ent;
+	int last_umr_cache_entry;
 	int c;
 	int c;
 	int i;
 	int i;
 
 
 	c = order2idx(dev, order);
 	c = order2idx(dev, order);
-	if (c < 0 || c > MAX_UMR_CACHE_ENTRY) {
+	last_umr_cache_entry = order2idx(dev, max_umr_order(dev));
+	if (c < 0 || c > last_umr_cache_entry) {
 		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
 		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
 		return NULL;
 		return NULL;
 	}
 	}
 
 
-	for (i = c; i < MAX_UMR_CACHE_ENTRY; i++) {
+	for (i = c; i <= last_umr_cache_entry; i++) {
 		ent = &cache->ent[i];
 		ent = &cache->ent[i];
 
 
 		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
 		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
@@ -816,11 +819,16 @@ static int get_octo_len(u64 addr, u64 len, int page_size)
 	return (npages + 1) / 2;
 	return (npages + 1) / 2;
 }
 }
 
 
-static int use_umr(struct mlx5_ib_dev *dev, int order)
+static int max_umr_order(struct mlx5_ib_dev *dev)
 {
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-		return order <= MAX_UMR_CACHE_ENTRY + 2;
-	return order <= MLX5_MAX_UMR_SHIFT;
+		return MAX_UMR_CACHE_ENTRY + 2;
+	return MLX5_MAX_UMR_SHIFT;
+}
+
+static int use_umr(struct mlx5_ib_dev *dev, int order)
+{
+	return order <= max_umr_order(dev);
 }
 }
 
 
 static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
 static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,

+ 1 - 1
drivers/infiniband/hw/mlx5/odp.c

@@ -939,7 +939,7 @@ static int mlx5_ib_mr_initiator_pfault_handler(
 
 
 	if (qp->ibqp.qp_type != IB_QPT_RC) {
 	if (qp->ibqp.qp_type != IB_QPT_RC) {
 		av = *wqe;
 		av = *wqe;
-		if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT))
+		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
 			*wqe += sizeof(struct mlx5_av);
 			*wqe += sizeof(struct mlx5_av);
 		else
 		else
 			*wqe += sizeof(struct mlx5_base_av);
 			*wqe += sizeof(struct mlx5_base_av);

+ 104 - 18
drivers/infiniband/hw/mlx5/qp.c

@@ -34,6 +34,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_user_verbs.h>
+#include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
 #include "mlx5_ib.h"
 
 
 /* not supported currently */
 /* not supported currently */
@@ -453,7 +454,8 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev,
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	if (attr->qp_type == IB_QPT_RAW_PACKET) {
+	if (attr->qp_type == IB_QPT_RAW_PACKET ||
+	    qp->flags & MLX5_IB_QP_UNDERLAY) {
 		base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 		base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 		qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6;
 		qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6;
 	} else {
 	} else {
@@ -675,10 +677,14 @@ err_umem:
 	return err;
 	return err;
 }
 }
 
 
-static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq)
+static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			    struct mlx5_ib_rwq *rwq)
 {
 {
 	struct mlx5_ib_ucontext *context;
 	struct mlx5_ib_ucontext *context;
 
 
+	if (rwq->create_flags & MLX5_IB_WQ_FLAGS_DELAY_DROP)
+		atomic_dec(&dev->delay_drop.rqs_cnt);
+
 	context = to_mucontext(pd->uobject->context);
 	context = to_mucontext(pd->uobject->context);
 	mlx5_ib_db_unmap_user(context, &rwq->db);
 	mlx5_ib_db_unmap_user(context, &rwq->db);
 	if (rwq->umem)
 	if (rwq->umem)
@@ -1021,12 +1027,16 @@ static int is_connected(enum ib_qp_type qp_type)
 }
 }
 
 
 static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
 static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
+				    struct mlx5_ib_qp *qp,
 				    struct mlx5_ib_sq *sq, u32 tdn)
 				    struct mlx5_ib_sq *sq, u32 tdn)
 {
 {
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0};
 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0};
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
 
 	MLX5_SET(tisc, tisc, transport_domain, tdn);
 	MLX5_SET(tisc, tisc, transport_domain, tdn);
+	if (qp->flags & MLX5_IB_QP_UNDERLAY)
+		MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn);
+
 	return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn);
 	return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn);
 }
 }
 
 
@@ -1229,7 +1239,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	u32 tdn = mucontext->tdn;
 	u32 tdn = mucontext->tdn;
 
 
 	if (qp->sq.wqe_cnt) {
 	if (qp->sq.wqe_cnt) {
-		err = create_raw_packet_qp_tis(dev, sq, tdn);
+		err = create_raw_packet_qp_tis(dev, qp, sq, tdn);
 		if (err)
 		if (err)
 			return err;
 			return err;
 
 
@@ -1502,10 +1512,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	u32 *in;
 	u32 *in;
 	int err;
 	int err;
 
 
-	base = init_attr->qp_type == IB_QPT_RAW_PACKET ?
-	       &qp->raw_packet_qp.rq.base :
-	       &qp->trans_qp.base;
-
 	mutex_init(&qp->mutex);
 	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
 	spin_lock_init(&qp->rq.lock);
@@ -1587,10 +1593,28 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 
 		qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
 		qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
 		qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
 		qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
+
+		if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) {
+			if (init_attr->qp_type != IB_QPT_UD ||
+			    (MLX5_CAP_GEN(dev->mdev, port_type) !=
+			     MLX5_CAP_PORT_TYPE_IB) ||
+			    !mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) {
+				mlx5_ib_dbg(dev, "Source QP option isn't supported\n");
+				return -EOPNOTSUPP;
+			}
+
+			qp->flags |= MLX5_IB_QP_UNDERLAY;
+			qp->underlay_qpn = init_attr->source_qpn;
+		}
 	} else {
 	} else {
 		qp->wq_sig = !!wq_signature;
 		qp->wq_sig = !!wq_signature;
 	}
 	}
 
 
+	base = (init_attr->qp_type == IB_QPT_RAW_PACKET ||
+		qp->flags & MLX5_IB_QP_UNDERLAY) ?
+	       &qp->raw_packet_qp.rq.base :
+	       &qp->trans_qp.base;
+
 	qp->has_rq = qp_has_rq(init_attr);
 	qp->has_rq = qp_has_rq(init_attr);
 	err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
 	err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
 			  qp, (pd && pd->uobject) ? &ucmd : NULL);
 			  qp, (pd && pd->uobject) ? &ucmd : NULL);
@@ -1741,7 +1765,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		qp->flags |= MLX5_IB_QP_LSO;
 		qp->flags |= MLX5_IB_QP_LSO;
 	}
 	}
 
 
-	if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
+	if (init_attr->qp_type == IB_QPT_RAW_PACKET ||
+	    qp->flags & MLX5_IB_QP_UNDERLAY) {
 		qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr;
 		qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr;
 		raw_packet_qp_copy_info(qp, &qp->raw_packet_qp);
 		raw_packet_qp_copy_info(qp, &qp->raw_packet_qp);
 		err = create_raw_packet_qp(dev, qp, in, pd);
 		err = create_raw_packet_qp(dev, qp, in, pd);
@@ -1893,7 +1918,7 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 {
 {
 	struct mlx5_ib_cq *send_cq, *recv_cq;
 	struct mlx5_ib_cq *send_cq, *recv_cq;
-	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+	struct mlx5_ib_qp_base *base;
 	unsigned long flags;
 	unsigned long flags;
 	int err;
 	int err;
 
 
@@ -1902,12 +1927,14 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 		return;
 		return;
 	}
 	}
 
 
-	base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ?
+	base = (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
+		qp->flags & MLX5_IB_QP_UNDERLAY) ?
 	       &qp->raw_packet_qp.rq.base :
 	       &qp->raw_packet_qp.rq.base :
 	       &qp->trans_qp.base;
 	       &qp->trans_qp.base;
 
 
 	if (qp->state != IB_QPS_RESET) {
 	if (qp->state != IB_QPS_RESET) {
-		if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
+		if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET &&
+		    !(qp->flags & MLX5_IB_QP_UNDERLAY)) {
 			err = mlx5_core_qp_modify(dev->mdev,
 			err = mlx5_core_qp_modify(dev->mdev,
 						  MLX5_CMD_OP_2RST_QP, 0,
 						  MLX5_CMD_OP_2RST_QP, 0,
 						  NULL, &base->mqp);
 						  NULL, &base->mqp);
@@ -1946,7 +1973,8 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 	mlx5_ib_unlock_cqs(send_cq, recv_cq);
 	mlx5_ib_unlock_cqs(send_cq, recv_cq);
 	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
 	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
 
 
-	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
+	    qp->flags & MLX5_IB_QP_UNDERLAY) {
 		destroy_raw_packet_qp(dev, qp);
 		destroy_raw_packet_qp(dev, qp);
 	} else {
 	} else {
 		err = mlx5_core_destroy_qp(dev->mdev, &base->mqp);
 		err = mlx5_core_destroy_qp(dev->mdev, &base->mqp);
@@ -2702,7 +2730,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 
 
 	if (is_sqp(ibqp->qp_type)) {
 	if (is_sqp(ibqp->qp_type)) {
 		context->mtu_msgmax = (IB_MTU_256 << 5) | 8;
 		context->mtu_msgmax = (IB_MTU_256 << 5) | 8;
-	} else if (ibqp->qp_type == IB_QPT_UD ||
+	} else if ((ibqp->qp_type == IB_QPT_UD &&
+		    !(qp->flags & MLX5_IB_QP_UNDERLAY)) ||
 		   ibqp->qp_type == MLX5_IB_QPT_REG_UMR) {
 		   ibqp->qp_type == MLX5_IB_QPT_REG_UMR) {
 		context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
 		context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
 	} else if (attr_mask & IB_QP_PATH_MTU) {
 	} else if (attr_mask & IB_QP_PATH_MTU) {
@@ -2799,6 +2828,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 		u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num :
 		u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num :
 			       qp->port) - 1;
 			       qp->port) - 1;
+
+		/* Underlay port should be used - index 0 function per port */
+		if (qp->flags & MLX5_IB_QP_UNDERLAY)
+			port_num = 0;
+
 		mibport = &dev->port[port_num];
 		mibport = &dev->port[port_num];
 		context->qp_counter_set_usr_page |=
 		context->qp_counter_set_usr_page |=
 			cpu_to_be32((u32)(mibport->cnts.set_id) << 24);
 			cpu_to_be32((u32)(mibport->cnts.set_id) << 24);
@@ -2824,7 +2858,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	optpar = ib_mask_to_mlx5_opt(attr_mask);
 	optpar = ib_mask_to_mlx5_opt(attr_mask);
 	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
 	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
 
 
-	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
+	    qp->flags & MLX5_IB_QP_UNDERLAY) {
 		struct mlx5_modify_raw_qp_param raw_qp_param = {};
 		struct mlx5_modify_raw_qp_param raw_qp_param = {};
 
 
 		raw_qp_param.operation = op;
 		raw_qp_param.operation = op;
@@ -2913,7 +2948,13 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port);
 		ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port);
 	}
 	}
 
 
-	if (qp_type != MLX5_IB_QPT_REG_UMR &&
+	if (qp->flags & MLX5_IB_QP_UNDERLAY) {
+		if (attr_mask & ~(IB_QP_STATE | IB_QP_CUR_STATE)) {
+			mlx5_ib_dbg(dev, "invalid attr_mask 0x%x when underlay QP is used\n",
+				    attr_mask);
+			goto out;
+		}
+	} else if (qp_type != MLX5_IB_QPT_REG_UMR &&
 	    !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) {
 	    !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) {
 		mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n",
 		mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n",
 			    cur_state, new_state, ibqp->qp_type, attr_mask);
 			    cur_state, new_state, ibqp->qp_type, attr_mask);
@@ -4477,9 +4518,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 		return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask,
 		return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask,
 					    qp_init_attr);
 					    qp_init_attr);
 
 
+	/* Not all of output fields are applicable, make sure to zero them */
+	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
+	memset(qp_attr, 0, sizeof(*qp_attr));
+
 	mutex_lock(&qp->mutex);
 	mutex_lock(&qp->mutex);
 
 
-	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
+	    qp->flags & MLX5_IB_QP_UNDERLAY) {
 		err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state);
 		err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state);
 		if (err)
 		if (err)
 			goto out;
 			goto out;
@@ -4597,6 +4643,27 @@ static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type)
 	}
 	}
 }
 }
 
 
+static int set_delay_drop(struct mlx5_ib_dev *dev)
+{
+	int err = 0;
+
+	mutex_lock(&dev->delay_drop.lock);
+	if (dev->delay_drop.activate)
+		goto out;
+
+	err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout);
+	if (err)
+		goto out;
+
+	dev->delay_drop.activate = true;
+out:
+	mutex_unlock(&dev->delay_drop.lock);
+
+	if (!err)
+		atomic_inc(&dev->delay_drop.rqs_cnt);
+	return err;
+}
+
 static int  create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
 static int  create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
 		      struct ib_wq_init_attr *init_attr)
 		      struct ib_wq_init_attr *init_attr)
 {
 {
@@ -4651,9 +4718,28 @@ static int  create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
 		}
 		}
 		MLX5_SET(rqc, rqc, scatter_fcs, 1);
 		MLX5_SET(rqc, rqc, scatter_fcs, 1);
 	}
 	}
+	if (init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) {
+		if (!(dev->ib_dev.attrs.raw_packet_caps &
+		      IB_RAW_PACKET_CAP_DELAY_DROP)) {
+			mlx5_ib_dbg(dev, "Delay drop is not supported\n");
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+		MLX5_SET(rqc, rqc, delay_drop_en, 1);
+	}
 	rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
 	rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
 	mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0);
 	mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0);
 	err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp);
 	err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp);
+	if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) {
+		err = set_delay_drop(dev);
+		if (err) {
+			mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n",
+				     err);
+			mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
+		} else {
+			rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP;
+		}
+	}
 out:
 out:
 	kvfree(in);
 	kvfree(in);
 	return err;
 	return err;
@@ -4787,7 +4873,7 @@ struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
 err_copy:
 err_copy:
 	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
 	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
 err_user_rq:
 err_user_rq:
-	destroy_user_rq(pd, rwq);
+	destroy_user_rq(dev, pd, rwq);
 err:
 err:
 	kfree(rwq);
 	kfree(rwq);
 	return ERR_PTR(err);
 	return ERR_PTR(err);
@@ -4799,7 +4885,7 @@ int mlx5_ib_destroy_wq(struct ib_wq *wq)
 	struct mlx5_ib_rwq *rwq = to_mrwq(wq);
 	struct mlx5_ib_rwq *rwq = to_mrwq(wq);
 
 
 	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
 	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
-	destroy_user_rq(wq->pd, rwq);
+	destroy_user_rq(dev, wq->pd, rwq);
 	kfree(rwq);
 	kfree(rwq);
 
 
 	return 0;
 	return 0;

+ 0 - 1
drivers/infiniband/hw/mthca/mthca_main.c

@@ -49,7 +49,6 @@
 MODULE_AUTHOR("Roland Dreier");
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
 MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
 
 
 #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
 #ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
 
 

+ 2 - 3
drivers/infiniband/hw/mthca/mthca_provider.c

@@ -1178,12 +1178,11 @@ static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num,
 	return 0;
 	return 0;
 }
 }
 
 
-static void get_dev_fw_str(struct ib_device *device, char *str,
-			   size_t str_len)
+static void get_dev_fw_str(struct ib_device *device, char *str)
 {
 {
 	struct mthca_dev *dev =
 	struct mthca_dev *dev =
 		container_of(device, struct mthca_dev, ib_dev);
 		container_of(device, struct mthca_dev, ib_dev);
-	snprintf(str, str_len, "%d.%d.%d",
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d",
 		 (int) (dev->fw_ver >> 32),
 		 (int) (dev->fw_ver >> 32),
 		 (int) (dev->fw_ver >> 16) & 0xffff,
 		 (int) (dev->fw_ver >> 16) & 0xffff,
 		 (int) dev->fw_ver & 0xffff);
 		 (int) dev->fw_ver & 0xffff);

Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov