
Merge branch 'hfi1' into merge-test

Doug Ledford, 8 years ago
parent commit 253f8b22e0
46 files changed with 1786 additions and 674 deletions
  1. drivers/infiniband/hw/hfi1/affinity.c (+2 -1)
  2. drivers/infiniband/hw/hfi1/affinity.h (+1 -8)
  3. drivers/infiniband/hw/hfi1/chip.c (+6 -3)
  4. drivers/infiniband/hw/hfi1/chip_registers.h (+3 -0)
  5. drivers/infiniband/hw/hfi1/debugfs.c (+110 -0)
  6. drivers/infiniband/hw/hfi1/driver.c (+1 -2)
  7. drivers/infiniband/hw/hfi1/eprom.c (+203 -8)
  8. drivers/infiniband/hw/hfi1/firmware.c (+133 -23)
  9. drivers/infiniband/hw/hfi1/hfi.h (+87 -57)
  10. drivers/infiniband/hw/hfi1/iowait.h (+8 -0)
  11. drivers/infiniband/hw/hfi1/mad.c (+10 -9)
  12. drivers/infiniband/hw/hfi1/mmu_rb.c (+1 -1)
  13. drivers/infiniband/hw/hfi1/pio.c (+13 -26)
  14. drivers/infiniband/hw/hfi1/pio.h (+20 -18)
  15. drivers/infiniband/hw/hfi1/pio_copy.c (+11 -11)
  16. drivers/infiniband/hw/hfi1/platform.c (+167 -26)
  17. drivers/infiniband/hw/hfi1/platform.h (+117 -10)
  18. drivers/infiniband/hw/hfi1/qp.c (+8 -3)
  19. drivers/infiniband/hw/hfi1/rc.c (+13 -47)
  20. drivers/infiniband/hw/hfi1/ruc.c (+7 -37)
  21. drivers/infiniband/hw/hfi1/sdma.c (+9 -9)
  22. drivers/infiniband/hw/hfi1/sdma.h (+9 -3)
  23. drivers/infiniband/hw/hfi1/uc.c (+2 -2)
  24. drivers/infiniband/hw/hfi1/ud.c (+2 -2)
  25. drivers/infiniband/hw/hfi1/user_sdma.c (+43 -17)
  26. drivers/infiniband/hw/hfi1/verbs.c (+174 -35)
  27. drivers/infiniband/hw/hfi1/verbs.h (+7 -9)
  28. drivers/infiniband/hw/hfi1/verbs_txreq.c (+7 -6)
  29. drivers/infiniband/hw/hfi1/verbs_txreq.h (+1 -0)
  30. drivers/infiniband/hw/qib/qib_driver.c (+1 -2)
  31. drivers/infiniband/hw/qib/qib_rc.c (+5 -39)
  32. drivers/infiniband/hw/qib/qib_ruc.c (+2 -22)
  33. drivers/infiniband/hw/qib/qib_verbs.c (+8 -25)
  34. drivers/infiniband/sw/rdmavt/cq.c (+27 -37)
  35. drivers/infiniband/sw/rdmavt/mcast.c (+2 -3)
  36. drivers/infiniband/sw/rdmavt/mr.c (+13 -9)
  37. drivers/infiniband/sw/rdmavt/qp.c (+19 -1)
  38. drivers/infiniband/sw/rdmavt/trace.h (+4 -137)
  39. drivers/infiniband/sw/rdmavt/trace_mr.h (+112 -0)
  40. drivers/infiniband/sw/rdmavt/trace_qp.h (+96 -0)
  41. drivers/infiniband/sw/rdmavt/trace_rvt.h (+81 -0)
  42. drivers/infiniband/sw/rdmavt/trace_tx.h (+132 -0)
  43. include/rdma/rdma_vt.h (+24 -22)
  44. include/rdma/rdmavt_mr.h (+7 -3)
  45. include/rdma/rdmavt_qp.h (+77 -0)
  46. include/uapi/rdma/hfi/hfi1_user.h (+1 -1)

+ 2 - 1
drivers/infiniband/hw/hfi1/affinity.c

@@ -125,6 +125,7 @@ int node_affinity_init(void)
 				cpumask_weight(topology_sibling_cpumask(
 					cpumask_first(&node_affinity.proc.mask)
 					));
+	node_affinity.num_possible_nodes = num_possible_nodes();
 	node_affinity.num_online_nodes = num_online_nodes();
 	node_affinity.num_online_cpus = num_online_cpus();
 
@@ -135,7 +136,7 @@ int node_affinity_init(void)
 	 */
 	init_real_cpu_mask();
 
-	hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
+	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
 				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
 	if (!hfi1_per_node_cntr)
 		return -ENOMEM;
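
The point of caching num_possible_nodes() is that per-node arrays indexed by node id must cover every node the platform can ever have, not just the nodes currently online. A minimal userspace sketch of that sizing rule (all node counts below are made up, not taken from the driver):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int num_possible_nodes = 4;		/* e.g. nodes 0..3 exist in the platform */
	int online_nodes[] = { 0, 2 };		/* non-compact: node 1 is offline */
	size_t n_online = sizeof(online_nodes) / sizeof(online_nodes[0]);
	unsigned long *per_node_cntr;
	size_t i;

	/* size by possible nodes so any valid node id stays in bounds */
	per_node_cntr = calloc(num_possible_nodes, sizeof(*per_node_cntr));
	if (!per_node_cntr)
		return 1;

	for (i = 0; i < n_online; i++)
		per_node_cntr[online_nodes[i]]++;	/* node 2 < 4, safe */

	for (i = 0; i < (size_t)num_possible_nodes; i++)
		printf("node %zu: %lu\n", i, per_node_cntr[i]);

	free(per_node_cntr);
	return 0;
}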

+ 1 - 8
drivers/infiniband/hw/hfi1/affinity.h

@@ -70,14 +70,6 @@ struct cpu_mask_set {
 	uint gen;
 };
 
-struct hfi1_affinity {
-	struct cpu_mask_set def_intr;
-	struct cpu_mask_set rcv_intr;
-	struct cpumask real_cpu_mask;
-	/* spin lock to protect affinity struct */
-	spinlock_t lock;
-};
-
 struct hfi1_msix_entry;
 
 /* Initialize non-HT cpu cores mask */
@@ -119,6 +111,7 @@ struct hfi1_affinity_node_list {
 	struct cpumask real_cpu_mask;
 	struct cpu_mask_set proc;
 	int num_core_siblings;
+	int num_possible_nodes;
 	int num_online_nodes;
 	int num_online_cpus;
 	struct mutex lock; /* protects affinity nodes */

+ 6 - 3
drivers/infiniband/hw/hfi1/chip.c

@@ -8488,7 +8488,10 @@ static int do_8051_command(
 	 */
 	if (type == HCMD_WRITE_LCB_CSR) {
 		in_data |= ((*out_data) & 0xffffffffffull) << 8;
-		reg = ((((*out_data) >> 40) & 0xff) <<
+		/* must preserve COMPLETED - it is tied to hardware */
+		reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_0);
+		reg &= DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK;
+		reg |= ((((*out_data) >> 40) & 0xff) <<
 				DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT)
 		      | ((((*out_data) >> 48) & 0xffff) <<
 				DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
@@ -9567,11 +9570,11 @@ int bringup_serdes(struct hfi1_pportdata *ppd)
 	if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
 		add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
 
-	guid = ppd->guid;
+	guid = ppd->guids[HFI1_PORT_GUID_INDEX];
 	if (!guid) {
 		if (dd->base_guid)
 			guid = dd->base_guid + ppd->port - 1;
-		ppd->guid = guid;
+		ppd->guids[HFI1_PORT_GUID_INDEX] = guid;
 	}
 
 	/* Set linkinit_reason on power up per OPA spec */
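
The LCB CSR update above switches from a plain store to a read-modify-write so the hardware-owned COMPLETED bit survives the write. A standalone sketch of that pattern, with invented bit positions rather than the real DC_DC8051_CFG_EXT_DEV_0 layout:

#include <stdio.h>
#include <stdint.h>

#define COMPLETED_SMASK   (1ULL << 0)	/* bit owned by hardware (example value) */
#define RETURN_CODE_SHIFT 8
#define RSP_DATA_SHIFT    16

static uint64_t csr;			/* stands in for the real CSR */

static uint64_t read_csr(void)    { return csr; }
static void write_csr(uint64_t v) { csr = v; }

int main(void)
{
	uint64_t reg, rc = 0x5a, data = 0x1234;

	csr = COMPLETED_SMASK;		/* pretend hardware set COMPLETED */

	reg = read_csr();
	reg &= COMPLETED_SMASK;		/* keep only the hardware-owned bit */
	reg |= (rc << RETURN_CODE_SHIFT) | (data << RSP_DATA_SHIFT);
	write_csr(reg);

	printf("CSR = 0x%llx, COMPLETED still set: %d\n",
	       (unsigned long long)csr, !!(csr & COMPLETED_SMASK));
	return 0;
}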

+ 3 - 0
drivers/infiniband/hw/hfi1/chip_registers.h

@@ -415,6 +415,9 @@
 #define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
 #define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
 #define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
+#define ASIC_CFG_SCRATCH_1 (ASIC_CFG_SCRATCH + 0x08)
+#define ASIC_CFG_SCRATCH_2 (ASIC_CFG_SCRATCH + 0x10)
+#define ASIC_CFG_SCRATCH_3 (ASIC_CFG_SCRATCH + 0x18)
 #define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
 #define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
 #define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull

+ 110 - 0
drivers/infiniband/hw/hfi1/debugfs.c

@@ -541,6 +541,114 @@ static ssize_t asic_flags_write(struct file *file, const char __user *buf,
 	return ret;
 }
 
+/* read the dc8051 memory */
+static ssize_t dc8051_memory_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct hfi1_pportdata *ppd = private2ppd(file);
+	ssize_t rval;
+	void *tmp;
+	loff_t start, end;
+
+	/* the checks below expect the position to be positive */
+	if (*ppos < 0)
+		return -EINVAL;
+
+	tmp = kzalloc(DC8051_DATA_MEM_SIZE, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	/*
+	 * Fill in the requested portion of the temporary buffer from the
+	 * 8051 memory.  The 8051 memory read is done in terms of 8 bytes.
+	 * Adjust start and end to fit.  Skip reading anything if out of
+	 * range.
+	 */
+	start = *ppos & ~0x7;	/* round down */
+	if (start < DC8051_DATA_MEM_SIZE) {
+		end = (*ppos + count + 7) & ~0x7; /* round up */
+		if (end > DC8051_DATA_MEM_SIZE)
+			end = DC8051_DATA_MEM_SIZE;
+		rval = read_8051_data(ppd->dd, start, end - start,
+				      (u64 *)(tmp + start));
+		if (rval)
+			goto done;
+	}
+
+	rval = simple_read_from_buffer(buf, count, ppos, tmp,
+				       DC8051_DATA_MEM_SIZE);
+done:
+	kfree(tmp);
+	return rval;
+}
+
+static ssize_t debugfs_lcb_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct hfi1_pportdata *ppd = private2ppd(file);
+	struct hfi1_devdata *dd = ppd->dd;
+	unsigned long total, csr_off;
+	u64 data;
+
+	if (*ppos < 0)
+		return -EINVAL;
+	/* only read 8 byte quantities */
+	if ((count % 8) != 0)
+		return -EINVAL;
+	/* offset must be 8-byte aligned */
+	if ((*ppos % 8) != 0)
+		return -EINVAL;
+	/* do nothing if out of range or zero count */
+	if (*ppos >= (LCB_END - LCB_START) || !count)
+		return 0;
+	/* reduce count if needed */
+	if (*ppos + count > LCB_END - LCB_START)
+		count = (LCB_END - LCB_START) - *ppos;
+
+	csr_off = LCB_START + *ppos;
+	for (total = 0; total < count; total += 8, csr_off += 8) {
+		if (read_lcb_csr(dd, csr_off, (u64 *)&data))
+			break; /* failed */
+		if (put_user(data, (unsigned long __user *)(buf + total)))
+			break;
+	}
+	*ppos += total;
+	return total;
+}
+
+static ssize_t debugfs_lcb_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct hfi1_pportdata *ppd = private2ppd(file);
+	struct hfi1_devdata *dd = ppd->dd;
+	unsigned long total, csr_off, data;
+
+	if (*ppos < 0)
+		return -EINVAL;
+	/* only write 8 byte quantities */
+	if ((count % 8) != 0)
+		return -EINVAL;
+	/* offset must be 8-byte aligned */
+	if ((*ppos % 8) != 0)
+		return -EINVAL;
+	/* do nothing if out of range or zero count */
+	if (*ppos >= (LCB_END - LCB_START) || !count)
+		return 0;
+	/* reduce count if needed */
+	if (*ppos + count > LCB_END - LCB_START)
+		count = (LCB_END - LCB_START) - *ppos;
+
+	csr_off = LCB_START + *ppos;
+	for (total = 0; total < count; total += 8, csr_off += 8) {
+		if (get_user(data, (unsigned long __user *)(buf + total)))
+			break;
+		if (write_lcb_csr(dd, csr_off, data))
+			break; /* failed */
+	}
+	*ppos += total;
+	return total;
+}
+
 /*
  * read the per-port QSFP data for ppd
  */
@@ -931,6 +1039,8 @@ static const struct counter_info port_cntr_ops[] = {
 	DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
 		     qsfp2_debugfs_open, qsfp2_debugfs_release),
 	DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
+	DEBUGFS_OPS("dc8051_memory", dc8051_memory_read, NULL),
+	DEBUGFS_OPS("lcb", debugfs_lcb_read, debugfs_lcb_write),
 };
 
 static void *_sdma_cpu_list_seq_start(struct seq_file *s, loff_t *pos)
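
dc8051_memory_read() widens the requested byte window to 8-byte boundaries because the underlying 8051 memory read works in 8-byte units, then clamps it to the memory size. A small standalone sketch of just that alignment math (memory size and offsets are made up):

#include <stdio.h>

int main(void)
{
	const long MEM_SIZE = 8192;		/* stand-in for DC8051_DATA_MEM_SIZE */
	long ppos = 13, count = 20;
	long start, end;

	start = ppos & ~0x7L;			/* round down to 8 bytes */
	end = (ppos + count + 7) & ~0x7L;	/* round up to 8 bytes */
	if (end > MEM_SIZE)
		end = MEM_SIZE;

	printf("request [%ld, %ld) -> aligned read [%ld, %ld), %ld bytes\n",
	       ppos, ppos + count, start, end, end - start);
	return 0;
}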

+ 1 - 2
drivers/infiniband/hw/hfi1/driver.c

@@ -793,8 +793,7 @@ static inline void process_rcv_qp_work(struct hfi1_packet *packet)
 				hfi1_schedule_send(qp);
 			spin_unlock_irqrestore(&qp->s_lock, flags);
 		}
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
+		rvt_put_qp(qp);
 	}
 }
 

+ 203 - 8
drivers/infiniband/hw/hfi1/eprom.c

@@ -207,6 +207,40 @@ done_asic:
 /* magic character sequence that trails an image */
 #define IMAGE_TRAIL_MAGIC "egamiAPO"
 
+/* EPROM file types */
+#define HFI1_EFT_PLATFORM_CONFIG 2
+
+/* segment size - 128 KiB */
+#define SEG_SIZE (128 * 1024)
+
+struct hfi1_eprom_footer {
+	u32 oprom_size;		/* size of the oprom, in bytes */
+	u16 num_table_entries;
+	u16 version;		/* version of this footer */
+	u32 magic;		/* must be last */
+};
+
+struct hfi1_eprom_table_entry {
+	u32 type;		/* file type */
+	u32 offset;		/* file offset from start of EPROM */
+	u32 size;		/* file size, in bytes */
+};
+
+/*
+ * Calculate the max number of table entries that will fit within a directory
+ * buffer of size 'dir_size'.
+ */
+#define MAX_TABLE_ENTRIES(dir_size) \
+	(((dir_size) - sizeof(struct hfi1_eprom_footer)) / \
+		sizeof(struct hfi1_eprom_table_entry))
+
+#define DIRECTORY_SIZE(n) (sizeof(struct hfi1_eprom_footer) + \
+	(sizeof(struct hfi1_eprom_table_entry) * (n)))
+
+#define MAGIC4(a, b, c, d) ((d) << 24 | (c) << 16 | (b) << 8 | (a))
+#define FOOTER_MAGIC MAGIC4('e', 'p', 'r', 'm')
+#define FOOTER_VERSION 1
+
 /*
  * Read all of partition 1.  The actual file is at the front.  Adjust
  * the returned size if a trailing image magic is found.
@@ -241,6 +275,167 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data,
 	return 0;
 }
 
+/*
+ * The segment magic has been checked.  There is a footer and table of
+ * contents present.
+ *
+ * directory is a u32 aligned buffer of size EP_PAGE_SIZE.
+ */
+static int read_segment_platform_config(struct hfi1_devdata *dd,
+					void *directory, void **data, u32 *size)
+{
+	struct hfi1_eprom_footer *footer;
+	struct hfi1_eprom_table_entry *table;
+	struct hfi1_eprom_table_entry *entry;
+	void *buffer = NULL;
+	void *table_buffer = NULL;
+	int ret, i;
+	u32 directory_size;
+	u32 seg_base, seg_offset;
+	u32 bytes_available, ncopied, to_copy;
+
+	/* the footer is at the end of the directory */
+	footer = (struct hfi1_eprom_footer *)
+			(directory + EP_PAGE_SIZE - sizeof(*footer));
+
+	/* make sure the structure version is supported */
+	if (footer->version != FOOTER_VERSION)
+		return -EINVAL;
+
+	/* oprom size cannot be larger than a segment */
+	if (footer->oprom_size >= SEG_SIZE)
+		return -EINVAL;
+
+	/* the file table must fit in a segment with the oprom */
+	if (footer->num_table_entries >
+			MAX_TABLE_ENTRIES(SEG_SIZE - footer->oprom_size))
+		return -EINVAL;
+
+	/* find the file table start, which precedes the footer */
+	directory_size = DIRECTORY_SIZE(footer->num_table_entries);
+	if (directory_size <= EP_PAGE_SIZE) {
+		/* the file table fits into the directory buffer handed in */
+		table = (struct hfi1_eprom_table_entry *)
+				(directory + EP_PAGE_SIZE - directory_size);
+	} else {
+		/* need to allocate and read more */
+		table_buffer = kmalloc(directory_size, GFP_KERNEL);
+		if (!table_buffer)
+			return -ENOMEM;
+		ret = read_length(dd, SEG_SIZE - directory_size,
+				  directory_size, table_buffer);
+		if (ret)
+			goto done;
+		table = table_buffer;
+	}
+
+	/* look for the platform configuration file in the table */
+	for (entry = NULL, i = 0; i < footer->num_table_entries; i++) {
+		if (table[i].type == HFI1_EFT_PLATFORM_CONFIG) {
+			entry = &table[i];
+			break;
+		}
+	}
+	if (!entry) {
+		ret = -ENOENT;
+		goto done;
+	}
+
+	/*
+	 * Sanity check on the configuration file size - it should never
+	 * be larger than 4 KiB.
+	 */
+	if (entry->size > (4 * 1024)) {
+		dd_dev_err(dd, "Bad configuration file size 0x%x\n",
+			   entry->size);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	/* check for bogus offset and size that wrap when added together */
+	if (entry->offset + entry->size < entry->offset) {
+		dd_dev_err(dd,
+			   "Bad configuration file start + size 0x%x+0x%x\n",
+			   entry->offset, entry->size);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	/* allocate the buffer to return */
+	buffer = kmalloc(entry->size, GFP_KERNEL);
+	if (!buffer) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * Extract the file by looping over segments until it is fully read.
+	 */
+	seg_offset = entry->offset % SEG_SIZE;
+	seg_base = entry->offset - seg_offset;
+	ncopied = 0;
+	while (ncopied < entry->size) {
+		/* calculate data bytes available in this segment */
+
+		/* start with the bytes from the current offset to the end */
+		bytes_available = SEG_SIZE - seg_offset;
+		/* subtract off footer and table from segment 0 */
+		if (seg_base == 0) {
+			/*
+			 * Sanity check: should not have a starting point
+			 * at or within the directory.
+			 */
+			if (bytes_available <= directory_size) {
+				dd_dev_err(dd,
+					   "Bad configuration file - offset 0x%x within footer+table\n",
+					   entry->offset);
+				ret = -EINVAL;
+				goto done;
+			}
+			bytes_available -= directory_size;
+		}
+
+		/* calculate bytes wanted */
+		to_copy = entry->size - ncopied;
+
+		/* max out at the available bytes in this segment */
+		if (to_copy > bytes_available)
+			to_copy = bytes_available;
+
+		/*
+		 * Read from the EPROM.
+		 *
+		 * The sanity check for entry->offset is done in read_length().
+		 * The EPROM offset is validated against what the hardware
+		 * addressing supports.  In addition, if the offset is larger
+		 * than the actual EPROM, it silently wraps.  It will work
+		 * fine, though the reader may not get what they expected
+		 * from the EPROM.
+		 */
+		ret = read_length(dd, seg_base + seg_offset, to_copy,
+				  buffer + ncopied);
+		if (ret)
+			goto done;
+
+		ncopied += to_copy;
+
+		/* set up for next segment */
+		seg_offset = footer->oprom_size;
+		seg_base += SEG_SIZE;
+	}
+
+	/* success */
+	ret = 0;
+	*data = buffer;
+	*size = entry->size;
+
+done:
+	kfree(table_buffer);
+	if (ret)
+		kfree(buffer);
+	return ret;
+}
+
 /*
  * Read the platform configuration file from the EPROM.
  *
@@ -253,6 +448,7 @@ static int read_partition_platform_config(struct hfi1_devdata *dd, void **data,
  *   -EBUSY   - not able to acquire access to the EPROM
  *   -ENOENT  - no recognizable file written
  *   -ENOMEM  - buffer could not be allocated
+ *   -EINVAL  - invalid EPROM contentents found
  */
 int eprom_read_platform_config(struct hfi1_devdata *dd, void **data, u32 *size)
 {
@@ -266,21 +462,20 @@ int eprom_read_platform_config(struct hfi1_devdata *dd, void **data, u32 *size)
 	if (ret)
 		return -EBUSY;
 
-	/* read the last page of P0 for the EPROM format magic */
-	ret = read_length(dd, P1_START - EP_PAGE_SIZE, EP_PAGE_SIZE, directory);
+	/* read the last page of the segment for the EPROM format magic */
+	ret = read_length(dd, SEG_SIZE - EP_PAGE_SIZE, EP_PAGE_SIZE, directory);
 	if (ret)
 		goto done;
 
-	/* last dword of P0 contains a magic indicator */
-	if (directory[EP_PAGE_DWORDS - 1] == 0) {
+	/* last dword of the segment contains a magic value */
+	if (directory[EP_PAGE_DWORDS - 1] == FOOTER_MAGIC) {
+		/* segment format */
+		ret = read_segment_platform_config(dd, directory, data, size);
+	} else {
 		/* partition format */
 		ret = read_partition_platform_config(dd, data, size);
-		goto done;
 	}
 
-	/* nothing recognized */
-	ret = -ENOENT;
-
 done:
 	release_chip_resource(dd, CR_EPROM);
 	return ret;
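
The new segment format is recognized by the FOOTER_MAGIC value in the last dword of the segment, and the directory (footer plus file table) is re-read from the EPROM only when it no longer fits in the one EP_PAGE_SIZE page already in hand. A standalone sketch of that bookkeeping, assuming EP_PAGE_SIZE is 256 bytes (an assumption for the example, not taken from the header):

#include <stdio.h>
#include <stdint.h>

struct eprom_footer {
	uint32_t oprom_size;
	uint16_t num_table_entries;
	uint16_t version;
	uint32_t magic;
};

struct eprom_table_entry {
	uint32_t type;
	uint32_t offset;
	uint32_t size;
};

#define MAGIC4(a, b, c, d) ((uint32_t)(d) << 24 | (c) << 16 | (b) << 8 | (a))
#define FOOTER_MAGIC MAGIC4('e', 'p', 'r', 'm')

#define DIRECTORY_SIZE(n) (sizeof(struct eprom_footer) + \
			   sizeof(struct eprom_table_entry) * (n))

int main(void)
{
	const size_t EP_PAGE_SIZE = 256;	/* assumed page size */
	unsigned int n = 10;			/* example table entry count */

	printf("FOOTER_MAGIC = 0x%08x\n", FOOTER_MAGIC);
	printf("directory for %u entries = %zu bytes (fits in one page: %s)\n",
	       n, DIRECTORY_SIZE(n),
	       DIRECTORY_SIZE(n) <= EP_PAGE_SIZE ? "yes" : "no");
	return 0;
}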

+ 133 - 23
drivers/infiniband/hw/hfi1/firmware.c

@@ -239,6 +239,16 @@ static const u8 all_fabric_serdes_broadcast = 0xe1;
 const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
 static const u8 all_pcie_serdes_broadcast = 0xe0;
 
+static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
+	0,
+	SYSTEM_TABLE_MAX,
+	PORT_TABLE_MAX,
+	RX_PRESET_TABLE_MAX,
+	TX_PRESET_TABLE_MAX,
+	QSFP_ATTEN_TABLE_MAX,
+	VARIABLE_SETTINGS_TABLE_MAX
+};
+
 /* forwards */
 static void dispose_one_firmware(struct firmware_details *fdet);
 static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
@@ -263,11 +273,13 @@ static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
 	u64 reg;
 	int count;
 
-	/* start the read at the given address */
-	reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
-			<< DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
-		| DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
+	/* step 1: set the address, clear enable */
+	reg = (addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+			<< DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT;
 	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+	/* step 2: enable */
+	write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL,
+		  reg | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK);
 
 	/* wait until ACCESS_COMPLETED is set */
 	count = 0;
@@ -707,6 +719,9 @@ static int obtain_firmware(struct hfi1_devdata *dd)
 				       &dd->pcidev->dev);
 		if (err) {
 			platform_config = NULL;
+			dd_dev_err(dd,
+				   "%s: No default platform config file found\n",
+				   __func__);
 			goto done;
 		}
 		dd->platform_config.data = platform_config->data;
@@ -1761,8 +1776,17 @@ int parse_platform_config(struct hfi1_devdata *dd)
 	u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
 	int ret = -EINVAL; /* assume failure */
 
+	/*
+	 * For integrated devices that did not fall back to the default file,
+	 * the SI tuning information for active channels is acquired from the
+	 * scratch register bitmap, thus there is no platform config to parse.
+	 * Skip parsing in these situations.
+	 */
+	if (is_integrated(dd) && !platform_config_load)
+		return 0;
+
 	if (!dd->platform_config.data) {
-		dd_dev_info(dd, "%s: Missing config file\n", __func__);
+		dd_dev_err(dd, "%s: Missing config file\n", __func__);
 		goto bail;
 	}
 	ptr = (u32 *)dd->platform_config.data;
@@ -1770,7 +1794,7 @@ int parse_platform_config(struct hfi1_devdata *dd)
 	magic_num = *ptr;
 	ptr++;
 	if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
-		dd_dev_info(dd, "%s: Bad config file\n", __func__);
+		dd_dev_err(dd, "%s: Bad config file\n", __func__);
 		goto bail;
 	}
 
@@ -1797,9 +1821,9 @@ int parse_platform_config(struct hfi1_devdata *dd)
 		header1 = *ptr;
 		header2 = *(ptr + 1);
 		if (header1 != ~header2) {
-			dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
-				    __func__, (ptr - (u32 *)
-					       dd->platform_config.data));
+			dd_dev_err(dd, "%s: Failed validation at offset %ld\n",
+				   __func__, (ptr - (u32 *)
+					      dd->platform_config.data));
 			goto bail;
 		}
 
@@ -1841,11 +1865,11 @@ int parse_platform_config(struct hfi1_devdata *dd)
 							table_length_dwords;
 				break;
 			default:
-				dd_dev_info(dd,
-					    "%s: Unknown data table %d, offset %ld\n",
-					    __func__, table_type,
-					    (ptr - (u32 *)
-					     dd->platform_config.data));
+				dd_dev_err(dd,
+					   "%s: Unknown data table %d, offset %ld\n",
+					   __func__, table_type,
+					   (ptr - (u32 *)
+					    dd->platform_config.data));
 				goto bail; /* We don't trust this file now */
 			}
 			pcfgcache->config_tables[table_type].table = ptr;
@@ -1865,11 +1889,11 @@ int parse_platform_config(struct hfi1_devdata *dd)
 			case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 				break;
 			default:
-				dd_dev_info(dd,
-					    "%s: Unknown meta table %d, offset %ld\n",
-					    __func__, table_type,
-					    (ptr -
-					     (u32 *)dd->platform_config.data));
+				dd_dev_err(dd,
+					   "%s: Unknown meta table %d, offset %ld\n",
+					   __func__, table_type,
+					   (ptr -
+					    (u32 *)dd->platform_config.data));
 				goto bail; /* We don't trust this file now */
 			}
 			pcfgcache->config_tables[table_type].table_metadata =
@@ -1884,10 +1908,9 @@ int parse_platform_config(struct hfi1_devdata *dd)
 		/* Jump the table */
 		ptr += table_length_dwords;
 		if (crc != *ptr) {
-			dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
-				    __func__, (ptr -
-					       (u32 *)
-					       dd->platform_config.data));
+			dd_dev_err(dd, "%s: Failed CRC check at offset %ld\n",
+				   __func__, (ptr -
+				   (u32 *)dd->platform_config.data));
 			goto bail;
 		}
 		/* Jump the CRC DWORD */
@@ -1901,6 +1924,84 @@ bail:
 	return ret;
 }
 
+static void get_integrated_platform_config_field(
+		struct hfi1_devdata *dd,
+		enum platform_config_table_type_encoding table_type,
+		int field_index, u32 *data)
+{
+	struct hfi1_pportdata *ppd = dd->pport;
+	u8 *cache = ppd->qsfp_info.cache;
+	u32 tx_preset = 0;
+
+	switch (table_type) {
+	case PLATFORM_CONFIG_SYSTEM_TABLE:
+		if (field_index == SYSTEM_TABLE_QSFP_POWER_CLASS_MAX)
+			*data = ppd->max_power_class;
+		else if (field_index == SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G)
+			*data = ppd->default_atten;
+		break;
+	case PLATFORM_CONFIG_PORT_TABLE:
+		if (field_index == PORT_TABLE_PORT_TYPE)
+			*data = ppd->port_type;
+		else if (field_index == PORT_TABLE_LOCAL_ATTEN_25G)
+			*data = ppd->local_atten;
+		else if (field_index == PORT_TABLE_REMOTE_ATTEN_25G)
+			*data = ppd->remote_atten;
+		break;
+	case PLATFORM_CONFIG_RX_PRESET_TABLE:
+		if (field_index == RX_PRESET_TABLE_QSFP_RX_CDR_APPLY)
+			*data = (ppd->rx_preset & QSFP_RX_CDR_APPLY_SMASK) >>
+				QSFP_RX_CDR_APPLY_SHIFT;
+		else if (field_index == RX_PRESET_TABLE_QSFP_RX_EMP_APPLY)
+			*data = (ppd->rx_preset & QSFP_RX_EMP_APPLY_SMASK) >>
+				QSFP_RX_EMP_APPLY_SHIFT;
+		else if (field_index == RX_PRESET_TABLE_QSFP_RX_AMP_APPLY)
+			*data = (ppd->rx_preset & QSFP_RX_AMP_APPLY_SMASK) >>
+				QSFP_RX_AMP_APPLY_SHIFT;
+		else if (field_index == RX_PRESET_TABLE_QSFP_RX_CDR)
+			*data = (ppd->rx_preset & QSFP_RX_CDR_SMASK) >>
+				QSFP_RX_CDR_SHIFT;
+		else if (field_index == RX_PRESET_TABLE_QSFP_RX_EMP)
+			*data = (ppd->rx_preset & QSFP_RX_EMP_SMASK) >>
+				QSFP_RX_EMP_SHIFT;
+		else if (field_index == RX_PRESET_TABLE_QSFP_RX_AMP)
+			*data = (ppd->rx_preset & QSFP_RX_AMP_SMASK) >>
+				QSFP_RX_AMP_SHIFT;
+		break;
+	case PLATFORM_CONFIG_TX_PRESET_TABLE:
+		if (cache[QSFP_EQ_INFO_OFFS] & 0x4)
+			tx_preset = ppd->tx_preset_eq;
+		else
+			tx_preset = ppd->tx_preset_noeq;
+		if (field_index == TX_PRESET_TABLE_PRECUR)
+			*data = (tx_preset & TX_PRECUR_SMASK) >>
+				TX_PRECUR_SHIFT;
+		else if (field_index == TX_PRESET_TABLE_ATTN)
+			*data = (tx_preset & TX_ATTN_SMASK) >>
+				TX_ATTN_SHIFT;
+		else if (field_index == TX_PRESET_TABLE_POSTCUR)
+			*data = (tx_preset & TX_POSTCUR_SMASK) >>
+				TX_POSTCUR_SHIFT;
+		else if (field_index == TX_PRESET_TABLE_QSFP_TX_CDR_APPLY)
+			*data = (tx_preset & QSFP_TX_CDR_APPLY_SMASK) >>
+				QSFP_TX_CDR_APPLY_SHIFT;
+		else if (field_index == TX_PRESET_TABLE_QSFP_TX_EQ_APPLY)
+			*data = (tx_preset & QSFP_TX_EQ_APPLY_SMASK) >>
+				QSFP_TX_EQ_APPLY_SHIFT;
+		else if (field_index == TX_PRESET_TABLE_QSFP_TX_CDR)
+			*data = (tx_preset & QSFP_TX_CDR_SMASK) >>
+				QSFP_TX_CDR_SHIFT;
+		else if (field_index == TX_PRESET_TABLE_QSFP_TX_EQ)
+			*data = (tx_preset & QSFP_TX_EQ_SMASK) >>
+				QSFP_TX_EQ_SHIFT;
+		break;
+	case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+	case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+	default:
+		break;
+	}
+}
+
 static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
 					  int field, u32 *field_len_bits,
 					  u32 *field_start_bits)
@@ -1976,6 +2077,15 @@ int get_platform_config_field(struct hfi1_devdata *dd,
 	else
 		return -EINVAL;
 
+	if (is_integrated(dd) && !platform_config_load) {
+		/*
+		 * Use saved configuration from ppd for integrated platforms
+		 */
+		get_integrated_platform_config_field(dd, table_type,
+						     field_index, data);
+		return 0;
+	}
+
 	ret = get_platform_fw_field_metadata(dd, table_type, field_index,
 					     &field_len_bits,
 					     &field_start_bits);
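
On integrated devices the per-field values come straight out of the port data cached from the scratch registers, so get_integrated_platform_config_field() reduces to mask-and-shift extraction. A standalone sketch of that extraction style (the field layout below is invented for the example, not the driver's QSFP_RX_* layout):

#include <stdio.h>
#include <stdint.h>

#define FIELD_EMP_SHIFT 4
#define FIELD_EMP_MASK  0xFUL
#define FIELD_EMP_SMASK (FIELD_EMP_MASK << FIELD_EMP_SHIFT)

/* pull one field out of a packed word */
static uint32_t get_field(uint32_t word, uint32_t smask, unsigned int shift)
{
	return (word & smask) >> shift;
}

int main(void)
{
	uint32_t rx_preset = 0x0000007a;	/* made-up cached preset word */

	printf("RX emphasis field = 0x%x\n",
	       get_field(rx_preset, FIELD_EMP_SMASK, FIELD_EMP_SHIFT));
	return 0;
}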

+ 87 - 57
drivers/infiniband/hw/hfi1/hfi.h

@@ -512,6 +512,9 @@ struct rvt_sge_state;
 #define HFI1_MIN_VLS_SUPPORTED 1
 #define HFI1_MAX_VLS_SUPPORTED 8
 
+#define HFI1_GUIDS_PER_PORT  5
+#define HFI1_PORT_GUID_INDEX 0
+
 static inline void incr_cntr64(u64 *cntr)
 {
 	if (*cntr < (u64)-1LL)
@@ -579,11 +582,20 @@ struct hfi1_pportdata {
 	struct kobject vl2mtu_kobj;
 
 	/* PHY support */
-	u32 port_type;
 	struct qsfp_data qsfp_info;
+	/* Values for SI tuning of SerDes */
+	u32 port_type;
+	u32 tx_preset_eq;
+	u32 tx_preset_noeq;
+	u32 rx_preset;
+	u8  local_atten;
+	u8  remote_atten;
+	u8  default_atten;
+	u8  max_power_class;
+
+	/* GUIDs for this interface, in host order, guids[0] is a port guid */
+	u64 guids[HFI1_GUIDS_PER_PORT];
 
-	/* GUID for this interface, in host order */
-	u64 guid;
 	/* GUID for peer interface, in host order */
 	u64 neighbor_guid;
 
@@ -848,32 +860,29 @@ struct hfi1_devdata {
 	u8 __iomem *kregend;
 	/* physical address of chip for io_remap, etc. */
 	resource_size_t physaddr;
-	/* receive context data */
-	struct hfi1_ctxtdata **rcd;
+	/* Per VL data. Enough for all VLs but not all elements are set/used. */
+	struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
 	/* send context data */
 	struct send_context_info *send_contexts;
 	/* map hardware send contexts to software index */
 	u8 *hw_to_sw;
 	/* spinlock for allocating and releasing send context resources */
 	spinlock_t sc_lock;
-	/* Per VL data. Enough for all VLs but not all elements are set/used. */
-	struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
 	/* lock for pio_map */
 	spinlock_t pio_map_lock;
+	/* Send Context initialization lock. */
+	spinlock_t sc_init_lock;
+	/* lock for sdma_map */
+	spinlock_t                          sde_map_lock;
 	/* array of kernel send contexts */
 	struct send_context **kernel_send_context;
 	/* array of vl maps */
 	struct pio_vl_map __rcu *pio_map;
-	/* seqlock for sc2vl */
-	seqlock_t sc2vl_lock;
-	u64 sc2vl[4];
-	/* Send Context initialization lock. */
-	spinlock_t sc_init_lock;
+	/* default flags to last descriptor */
+	u64 default_desc1;
 
 	/* fields common to all SDMA engines */
 
-	/* default flags to last descriptor */
-	u64 default_desc1;
 	volatile __le64                    *sdma_heads_dma; /* DMA'ed by chip */
 	dma_addr_t                          sdma_heads_phys;
 	void                               *sdma_pad_dma; /* DMA'ed by chip */
@@ -884,8 +893,6 @@ struct hfi1_devdata {
 	u32                                 chip_sdma_engines;
 	/* num used */
 	u32                                 num_sdma;
-	/* lock for sdma_map */
-	spinlock_t                          sde_map_lock;
 	/* array of engines sized by num_sdma */
 	struct sdma_engine                 *per_sdma;
 	/* array of vl maps */
@@ -894,14 +901,11 @@ struct hfi1_devdata {
 	wait_queue_head_t		  sdma_unfreeze_wq;
 	atomic_t			  sdma_unfreeze_count;
 
+	u32 lcb_access_count;		/* count of LCB users */
+
 	/* common data between shared ASIC HFIs in this OS */
 	struct hfi1_asic_data *asic_data;
 
-	/* hfi1_pportdata, points to array of (physical) port-specific
-	 * data structs, indexed by pidx (0..n-1)
-	 */
-	struct hfi1_pportdata *pport;
-
 	/* mem-mapped pointer to base of PIO buffers */
 	void __iomem *piobase;
 	/*
@@ -918,20 +922,13 @@ struct hfi1_devdata {
 	/* send context numbers and sizes for each type */
 	struct sc_config_sizes sc_sizes[SC_MAX];
 
-	u32 lcb_access_count;		/* count of LCB users */
-
 	char *boardname; /* human readable board info */
 
-	/* device (not port) flags, basically device capabilities */
-	u32 flags;
-
 	/* reset value */
 	u64 z_int_counter;
 	u64 z_rcv_limit;
 	u64 z_send_schedule;
-	/* percpu int_counter */
-	u64 __percpu *int_counter;
-	u64 __percpu *rcv_limit;
+
 	u64 __percpu *send_schedule;
 	/* number of receive contexts in use by the driver */
 	u32 num_rcv_contexts;
@@ -946,6 +943,7 @@ struct hfi1_devdata {
 	/* base receive interrupt timeout, in CSR units */
 	u32 rcv_intr_timeout_csr;
 
+	u32 freezelen; /* max length of freezemsg */
 	u64 __iomem *egrtidbase;
 	spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
 	spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
@@ -967,7 +965,6 @@ struct hfi1_devdata {
 	 * IB link status cheaply
 	 */
 	struct hfi1_status *status;
-	u32 freezelen; /* max length of freezemsg */
 
 	/* revision register shadow */
 	u64 revision;
@@ -995,6 +992,8 @@ struct hfi1_devdata {
 	u16 rcvegrbufsize_shift;
 	/* both sides of the PCIe link are gen3 capable */
 	u8 link_gen3_capable;
+	/* default link down value (poll/sleep) */
+	u8 link_default;
 	/* localbus width (1, 2,4,8,16,32) from config space  */
 	u32 lbus_width;
 	/* localbus speed in MHz */
@@ -1030,8 +1029,6 @@ struct hfi1_devdata {
 	u8 hfi1_id;
 	/* implementation code */
 	u8 icode;
-	/* default link down value (poll/sleep) */
-	u8 link_default;
 	/* vAU of this device */
 	u8 vau;
 	/* vCU of this device */
@@ -1042,27 +1039,17 @@ struct hfi1_devdata {
 	u16 vl15_init;
 
 	/* Misc small ints */
-	/* Number of physical ports available */
-	u8 num_pports;
-	/* Lowest context number which can be used by user processes */
-	u8 first_user_ctxt;
 	u8 n_krcv_queues;
 	u8 qos_shift;
-	u8 qpn_mask;
 
-	u16 rhf_offset; /* offset of RHF within receive header entry */
 	u16 irev;	/* implementation revision */
 	u16 dc8051_ver; /* 8051 firmware version */
 
+	spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
 	struct platform_config platform_config;
 	struct platform_config_cache pcfg_cache;
 
 	struct diag_client *diag_client;
-	spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
-
-	u8 psxmitwait_supported;
-	/* cycle length of PS* counters in HW (in picoseconds) */
-	u16 psxmitwait_check_rate;
 
 	/* MSI-X information */
 	struct hfi1_msix_entry *msix_entries;
@@ -1077,6 +1064,9 @@ struct hfi1_devdata {
 
 	struct rcv_array_data rcv_entries;
 
+	/* cycle length of PS* counters in HW (in picoseconds) */
+	u16 psxmitwait_check_rate;
+
 	/*
 	 * 64 bit synthetic counters
 	 */
@@ -1109,11 +1099,11 @@ struct hfi1_devdata {
 	struct err_info_rcvport err_info_rcvport;
 	struct err_info_constraint err_info_rcv_constraint;
 	struct err_info_constraint err_info_xmit_constraint;
-	u8 err_info_uncorrectable;
-	u8 err_info_fmconfig;
 
 	atomic_t drop_packet;
 	u8 do_drop;
+	u8 err_info_uncorrectable;
+	u8 err_info_fmconfig;
 
 	/*
 	 * Software counters for the status bits defined by the
@@ -1136,47 +1126,70 @@ struct hfi1_devdata {
 	u64 sw_cce_err_status_aggregate;
 	/* Software counter that aggregates all bypass packet rcv errors */
 	u64 sw_rcv_bypass_packet_errors;
-	/* receive interrupt functions */
-	rhf_rcv_function_ptr *rhf_rcv_function_map;
+	/* receive interrupt function */
 	rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
 
+	/* Save the enabled LCB error bits */
+	u64 lcb_err_en;
+
 	/*
 	 * Handlers for outgoing data so that snoop/capture does not
 	 * have to have its hooks in the send path
 	 */
-	send_routine process_pio_send;
+	send_routine process_pio_send ____cacheline_aligned_in_smp;
 	send_routine process_dma_send;
 	void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
 				u64 pbc, const void *from, size_t count);
+	/* hfi1_pportdata, points to array of (physical) port-specific
+	 * data structs, indexed by pidx (0..n-1)
+	 */
+	struct hfi1_pportdata *pport;
+	/* receive context data */
+	struct hfi1_ctxtdata **rcd;
+	u64 __percpu *int_counter;
+	/* device (not port) flags, basically device capabilities */
+	u16 flags;
+	/* Number of physical ports available */
+	u8 num_pports;
+	/* Lowest context number which can be used by user processes */
+	u8 first_user_ctxt;
+	/* adding a new field here would make it part of this cacheline */
+
+	/* seqlock for sc2vl */
+	seqlock_t sc2vl_lock ____cacheline_aligned_in_smp;
+	u64 sc2vl[4];
+	/* receive interrupt functions */
+	rhf_rcv_function_ptr *rhf_rcv_function_map;
+	u64 __percpu *rcv_limit;
+	u16 rhf_offset; /* offset of RHF within receive header entry */
+	/* adding a new field here would make it part of this cacheline */
 
 	/* OUI comes from the HW. Used everywhere as 3 separate bytes. */
 	u8 oui1;
 	u8 oui2;
 	u8 oui3;
+	u8 dc_shutdown;
+
 	/* Timer and counter used to detect RcvBufOvflCnt changes */
 	struct timer_list rcverr_timer;
-	u32 rcv_ovfl_cnt;
 
 	wait_queue_head_t event_queue;
 
-	/* Save the enabled LCB error bits */
-	u64 lcb_err_en;
-	u8 dc_shutdown;
-
 	/* receive context tail dummy address */
 	__le64 *rcvhdrtail_dummy_kvaddr;
 	dma_addr_t rcvhdrtail_dummy_dma;
 
-	bool eprom_available;	/* true if EPROM is available for this device */
-	bool aspm_supported;	/* Does HW support ASPM */
-	bool aspm_enabled;	/* ASPM state: enabled/disabled */
+	u32 rcv_ovfl_cnt;
 	/* Serialize ASPM enable/disable between multiple verbs contexts */
 	spinlock_t aspm_lock;
 	/* Number of verbs contexts which have disabled ASPM */
 	atomic_t aspm_disabled_cnt;
 
-	struct hfi1_affinity *affinity;
+	bool eprom_available;	/* true if EPROM is available for this device */
+	bool aspm_supported;	/* Does HW support ASPM */
+	bool aspm_enabled;	/* ASPM state: enabled/disabled */
 	struct rhashtable sdma_rht;
+
 	struct kobject kobj;
 };
 
@@ -1632,6 +1645,17 @@ static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
 	return ret;
 }
 
+/*
+ * Return the indexed GUID from the port GUIDs table.
+ */
+static inline __be64 get_sguid(struct hfi1_ibport *ibp, unsigned int index)
+{
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+	WARN_ON(index >= HFI1_GUIDS_PER_PORT);
+	return cpu_to_be64(ppd->guids[index]);
+}
+
 /*
  * Called by readers of cc_state only, must call under rcu_read_lock().
  */
@@ -2003,6 +2027,12 @@ static inline u32 qsfp_resource(struct hfi1_devdata *dd)
 	return i2c_target(dd->hfi1_id);
 }
 
+/* Is this device integrated or discrete? */
+static inline bool is_integrated(struct hfi1_devdata *dd)
+{
+	return dd->pcidev->device == PCI_DEVICE_ID_INTEL1;
+}
+
 int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
 
 #define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
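
The single ppd->guid is replaced by a small table with guids[0] reserved for the port GUID, and get_sguid() converts the host-order value to wire (big-endian) order on the way out. A userspace sketch of the same accessor shape, using htobe64() in place of the kernel's cpu_to_be64() (array size and the sample GUID value are assumptions):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <endian.h>

#define GUIDS_PER_PORT  5
#define PORT_GUID_INDEX 0

struct port {
	uint64_t guids[GUIDS_PER_PORT];	/* host order, index 0 is the port GUID */
};

static uint64_t get_sguid(const struct port *p, unsigned int index)
{
	assert(index < GUIDS_PER_PORT);	/* mirrors the WARN_ON() bound check */
	return htobe64(p->guids[index]);
}

int main(void)
{
	struct port p = { .guids = { [PORT_GUID_INDEX] = 0x0011223344556677ULL } };

	printf("port GUID (wire order) = 0x%016llx\n",
	       (unsigned long long)get_sguid(&p, PORT_GUID_INDEX));
	return 0;
}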

+ 8 - 0
drivers/infiniband/hw/hfi1/iowait.h

@@ -64,6 +64,7 @@ struct sdma_engine;
 /**
  * struct iowait - linkage for delayed progress/waiting
  * @list: used to add/insert into QP/PQ wait lists
+ * @lock: uses to record the list head lock
  * @tx_head: overflow list of sdma_txreq's
  * @sleep: no space callback
  * @wakeup: space callback wakeup
@@ -91,6 +92,11 @@ struct sdma_engine;
  * so sleeping is not allowed.
  *
  * The wait_dma member along with the iow
+ *
+ * The lock field is used by waiters to record
+ * the seqlock_t that guards the list head.
+ * Waiters explicity know that, but the destroy
+ * code that unwaits QPs does not.
  */
 
 struct iowait {
@@ -103,6 +109,7 @@ struct iowait {
 		unsigned seq);
 	void (*wakeup)(struct iowait *wait, int reason);
 	void (*sdma_drained)(struct iowait *wait);
+	seqlock_t *lock;
 	struct work_struct iowork;
 	wait_queue_head_t wait_dma;
 	wait_queue_head_t wait_pio;
@@ -141,6 +148,7 @@ static inline void iowait_init(
 	void (*sdma_drained)(struct iowait *wait))
 {
 	wait->count = 0;
+	wait->lock = NULL;
 	INIT_LIST_HEAD(&wait->list);
 	INIT_LIST_HEAD(&wait->tx_head);
 	INIT_WORK(&wait->iowork, func);
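
The new lock member lets a waiter record which seqlock guards the list it is queued on, so teardown code that only holds the waiter (see flush_iowait() in qp.c below) can take the right lock before unlinking. A userspace sketch of that pattern, with a pthread mutex standing in for the seqlock:

#include <stdio.h>
#include <pthread.h>

/* minimal circular list, in the style of the kernel's list_head */
struct list_node { struct list_node *prev, *next; };

static void list_init(struct list_node *n) { n->prev = n; n->next = n; }
static int list_empty(const struct list_node *n) { return n->next == n; }
static void list_add_tail(struct list_node *n, struct list_node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}
static void list_del_init(struct list_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	list_init(n);
}

struct waiter {
	struct list_node list;
	pthread_mutex_t *lock;	/* guards whichever wait list we are queued on */
};

/* queue the waiter and remember the lock that protects the list */
static void enqueue(struct waiter *w, struct list_node *head, pthread_mutex_t *lock)
{
	pthread_mutex_lock(lock);
	list_add_tail(&w->list, head);
	w->lock = lock;
	pthread_mutex_unlock(lock);
}

/* teardown: only the waiter is in hand; its recorded lock says what to take */
static void flush_waiter(struct waiter *w)
{
	pthread_mutex_t *lock = w->lock;

	if (!lock)
		return;		/* never queued */
	pthread_mutex_lock(lock);
	if (!list_empty(&w->list)) {
		list_del_init(&w->list);
		w->lock = NULL;
	}
	pthread_mutex_unlock(lock);
}

int main(void)
{
	pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
	struct list_node wait_list;
	struct waiter w = { .lock = NULL };

	list_init(&wait_list);
	list_init(&w.list);

	enqueue(&w, &wait_list, &list_lock);
	flush_waiter(&w);
	printf("unlinked: %d, lock cleared: %d\n",
	       list_empty(&wait_list), w.lock == NULL);
	return 0;
}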

+ 10 - 9
drivers/infiniband/hw/hfi1/mad.c

@@ -336,20 +336,20 @@ static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
 	ni = (struct opa_node_info *)data;
 
 	/* GUID 0 is illegal */
-	if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
+	if (am || pidx >= dd->num_pports || ibdev->node_guid == 0 ||
+	    get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) {
 		smp->status |= IB_SMP_INVALID_FIELD;
 		return reply((struct ib_mad_hdr *)smp);
 	}
 
-	ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+	ni->port_guid = get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX);
 	ni->base_version = OPA_MGMT_BASE_VERSION;
 	ni->class_version = OPA_SMI_CLASS_VERSION;
 	ni->node_type = 1;     /* channel adapter */
 	ni->num_ports = ibdev->phys_port_cnt;
 	/* This is already in network order */
 	ni->system_image_guid = ib_hfi1_sys_image_guid;
-	/* Use first-port GUID as node */
-	ni->node_guid = cpu_to_be64(dd->pport->guid);
+	ni->node_guid = ibdev->node_guid;
 	ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
 	ni->device_id = cpu_to_be16(dd->pcidev->device);
 	ni->revision = cpu_to_be32(dd->minrev);
@@ -373,19 +373,20 @@ static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
 
 	/* GUID 0 is illegal */
 	if (smp->attr_mod || pidx >= dd->num_pports ||
-	    dd->pport[pidx].guid == 0)
+	    ibdev->node_guid == 0 ||
+	    get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX) == 0) {
 		smp->status |= IB_SMP_INVALID_FIELD;
-	else
-		nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+		return reply((struct ib_mad_hdr *)smp);
+	}
 
+	nip->port_guid = get_sguid(to_iport(ibdev, port), HFI1_PORT_GUID_INDEX);
 	nip->base_version = OPA_MGMT_BASE_VERSION;
 	nip->class_version = OPA_SMI_CLASS_VERSION;
 	nip->node_type = 1;     /* channel adapter */
 	nip->num_ports = ibdev->phys_port_cnt;
 	/* This is already in network order */
 	nip->sys_guid = ib_hfi1_sys_image_guid;
-	 /* Use first-port GUID as node */
-	nip->node_guid = cpu_to_be64(dd->pport->guid);
+	nip->node_guid = ibdev->node_guid;
 	nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
 	nip->device_id = cpu_to_be16(dd->pcidev->device);
 	nip->revision = cpu_to_be32(dd->minrev);

+ 1 - 1
drivers/infiniband/hw/hfi1/mmu_rb.c

@@ -81,7 +81,7 @@ static void do_remove(struct mmu_rb_handler *handler,
 		      struct list_head *del_list);
 static void handle_remove(struct work_struct *work);
 
-static struct mmu_notifier_ops mn_opts = {
+static const struct mmu_notifier_ops mn_opts = {
 	.invalidate_page = mmu_notifier_page,
 	.invalidate_range_start = mmu_notifier_range_start,
 };

+ 13 - 26
drivers/infiniband/hw/hfi1/pio.c

@@ -765,6 +765,7 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
 	sc->hw_context = hw_context;
 	cr_group_addresses(sc, &dma);
 	sc->credits = sci->credits;
+	sc->size = sc->credits * PIO_BLOCK_SIZE;
 
 /* PIO Send Memory Address details */
 #define PIO_ADDR_CONTEXT_MASK 0xfful
@@ -1249,6 +1250,7 @@ int sc_enable(struct send_context *sc)
 	sc->free = 0;
 	sc->alloc_free = 0;
 	sc->fill = 0;
+	sc->fill_wrap = 0;
 	sc->sr_head = 0;
 	sc->sr_tail = 0;
 	sc->flags = 0;
@@ -1392,7 +1394,7 @@ struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
 	unsigned long flags;
 	unsigned long avail;
 	unsigned long blocks = dwords_to_blocks(dw_len);
-	unsigned long start_fill;
+	u32 fill_wrap;
 	int trycount = 0;
 	u32 head, next;
 
@@ -1417,9 +1419,7 @@ retry:
 			(sc->fill - sc->alloc_free);
 		if (blocks > avail) {
 			/* still no room, actively update */
-			spin_unlock_irqrestore(&sc->alloc_lock, flags);
 			sc_release_update(sc);
-			spin_lock_irqsave(&sc->alloc_lock, flags);
 			sc->alloc_free = ACCESS_ONCE(sc->free);
 			trycount++;
 			goto retry;
@@ -1435,8 +1435,11 @@ retry:
 	head = sc->sr_head;
 
 	/* "allocate" the buffer */
-	start_fill = sc->fill;
 	sc->fill += blocks;
+	fill_wrap = sc->fill_wrap;
+	sc->fill_wrap += blocks;
+	if (sc->fill_wrap >= sc->credits)
+		sc->fill_wrap = sc->fill_wrap - sc->credits;
 
 	/*
 	 * Fill the parts that the releaser looks at before moving the head.
@@ -1465,11 +1468,8 @@ retry:
 	spin_unlock_irqrestore(&sc->alloc_lock, flags);
 
 	/* finish filling in the buffer outside the lock */
-	pbuf->start = sc->base_addr + ((start_fill % sc->credits)
-							* PIO_BLOCK_SIZE);
-	pbuf->size = sc->credits * PIO_BLOCK_SIZE;
-	pbuf->end = sc->base_addr + pbuf->size;
-	pbuf->block_count = blocks;
+	pbuf->start = sc->base_addr + fill_wrap * PIO_BLOCK_SIZE;
+	pbuf->end = sc->base_addr + sc->size;
 	pbuf->qw_written = 0;
 	pbuf->carry_bytes = 0;
 	pbuf->carry.val64 = 0;
@@ -1580,6 +1580,7 @@ static void sc_piobufavail(struct send_context *sc)
 		qp = iowait_to_qp(wait);
 		priv = qp->priv;
 		list_del_init(&priv->s_iowait.list);
+		priv->s_iowait.lock = NULL;
 		/* refcount held until actual wake up */
 		qps[n++] = qp;
 	}
@@ -2035,28 +2036,17 @@ freesc15:
 int init_credit_return(struct hfi1_devdata *dd)
 {
 	int ret;
-	int num_numa;
 	int i;
 
-	num_numa = num_online_nodes();
-	/* enforce the expectation that the numas are compact */
-	for (i = 0; i < num_numa; i++) {
-		if (!node_online(i)) {
-			dd_dev_err(dd, "NUMA nodes are not compact\n");
-			ret = -EINVAL;
-			goto done;
-		}
-	}
-
 	dd->cr_base = kcalloc(
-		num_numa,
+		node_affinity.num_possible_nodes,
 		sizeof(struct credit_return_base),
 		GFP_KERNEL);
 	if (!dd->cr_base) {
 		ret = -ENOMEM;
 		goto done;
 	}
-	for (i = 0; i < num_numa; i++) {
+	for_each_node_with_cpus(i) {
 		int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
 
 		set_dev_node(&dd->pcidev->dev, i);
@@ -2083,14 +2073,11 @@ done:
 
 void free_credit_return(struct hfi1_devdata *dd)
 {
-	int num_numa;
 	int i;
 
 	if (!dd->cr_base)
 		return;
-
-	num_numa = num_online_nodes();
-	for (i = 0; i < num_numa; i++) {
+	for (i = 0; i < node_affinity.num_possible_nodes; i++) {
 		if (dd->cr_base[i].va) {
 			dma_free_coherent(&dd->pcidev->dev,
 					  TXE_NUM_CONTEXTS *

+ 20 - 18
drivers/infiniband/hw/hfi1/pio.h

@@ -83,53 +83,55 @@ struct pio_buf {
 	void *arg;		/* argument for cb */
 	void __iomem *start;	/* buffer start address */
 	void __iomem *end;	/* context end address */
-	unsigned long size;	/* context size, in bytes */
 	unsigned long sent_at;	/* buffer is sent when <= free */
-	u32 block_count;	/* size of buffer, in blocks */
-	u32 qw_written;		/* QW written so far */
-	u32 carry_bytes;	/* number of valid bytes in carry */
 	union mix carry;	/* pending unwritten bytes */
+	u16 qw_written;		/* QW written so far */
+	u8 carry_bytes;	/* number of valid bytes in carry */
 };
 
 /* cache line aligned pio buffer array */
 union pio_shadow_ring {
 	struct pio_buf pbuf;
-	u64 unused[16];		/* cache line spacer */
 } ____cacheline_aligned;
 
 /* per-NUMA send context */
 struct send_context {
 	/* read-only after init */
 	struct hfi1_devdata *dd;		/* device */
-	void __iomem *base_addr;	/* start of PIO memory */
 	union pio_shadow_ring *sr;	/* shadow ring */
+	void __iomem *base_addr;	/* start of PIO memory */
+	u32 __percpu *buffers_allocated;/* count of buffers allocated */
+	u32 size;			/* context size, in bytes */
 
-	volatile __le64 *hw_free;	/* HW free counter */
-	struct work_struct halt_work;	/* halted context work queue entry */
-	unsigned long flags;		/* flags */
 	int node;			/* context home node */
-	int type;			/* context type */
-	u32 sw_index;			/* software index number */
-	u32 hw_context;			/* hardware context number */
-	u32 credits;			/* number of blocks in context */
 	u32 sr_size;			/* size of the shadow ring */
-	u32 group;			/* credit return group */
+	u16 flags;			/* flags */
+	u8  type;			/* context type */
+	u8  sw_index;			/* software index number */
+	u8  hw_context;			/* hardware context number */
+	u8  group;			/* credit return group */
+
 	/* allocator fields */
 	spinlock_t alloc_lock ____cacheline_aligned_in_smp;
+	u32 sr_head;			/* shadow ring head */
 	unsigned long fill;		/* official alloc count */
 	unsigned long alloc_free;	/* copy of free (less cache thrash) */
-	u32 sr_head;			/* shadow ring head */
+	u32 fill_wrap;			/* tracks fill within ring */
+	u32 credits;			/* number of blocks in context */
+	/* adding a new field here would make it part of this cacheline */
+
 	/* releaser fields */
 	spinlock_t release_lock ____cacheline_aligned_in_smp;
-	unsigned long free;		/* official free count */
 	u32 sr_tail;			/* shadow ring tail */
+	unsigned long free;		/* official free count */
+	volatile __le64 *hw_free;	/* HW free counter */
 	/* list for PIO waiters */
 	struct list_head piowait  ____cacheline_aligned_in_smp;
 	spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
-	u64 credit_ctrl;		/* cache for credit control */
 	u32 credit_intr_count;		/* count of credit intr users */
-	u32 __percpu *buffers_allocated;/* count of buffers allocated */
+	u64 credit_ctrl;		/* cache for credit control */
 	wait_queue_head_t halt_wait;    /* wait until kernel sees interrupt */
+	struct work_struct halt_work;	/* halted context work queue entry */
 };
 
 /* send context flags */
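
The new fill_wrap member keeps a copy of the allocation offset already folded into [0, credits), so sc_buffer_alloc() can place a buffer without the per-allocation '% sc->credits' division it used before; this relies on no allocation being larger than the ring. A standalone sketch of that wrap arithmetic (ring size and allocation sizes are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t credits = 10;			/* blocks in the ring */
	uint32_t fill_wrap = 0;
	uint32_t sizes[] = { 4, 3, 5, 6 };	/* per-allocation block counts */
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		uint32_t start = fill_wrap;	/* block offset of this buffer */

		fill_wrap += sizes[i];
		if (fill_wrap >= credits)
			fill_wrap -= credits;	/* cheap wrap, no division */

		printf("alloc %zu: %u blocks at block offset %u\n",
		       i, sizes[i], start);
	}
	return 0;
}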

+ 11 - 11
drivers/infiniband/hw/hfi1/pio_copy.c

@@ -129,8 +129,8 @@ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
 				dest += sizeof(u64);
 			}
 
-			dest -= pbuf->size;
-			dend -= pbuf->size;
+			dest -= pbuf->sc->size;
+			dend -= pbuf->sc->size;
 		}
 
 		/* write 8-byte non-SOP, non-wrap chunk data */
@@ -361,8 +361,8 @@ void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
 				dest += sizeof(u64);
 			}
 
-			dest -= pbuf->size;
-			dend -= pbuf->size;
+			dest -= pbuf->sc->size;
+			dend -= pbuf->sc->size;
 		}
 
 		/* write 8-byte non-SOP, non-wrap chunk data */
@@ -458,8 +458,8 @@ static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
 			dest += sizeof(u64);
 		}
 
-		dest -= pbuf->size;
-		dend -= pbuf->size;
+		dest -= pbuf->sc->size;
+		dend -= pbuf->sc->size;
 	}
 
 	/* write 8-byte non-SOP, non-wrap chunk data */
@@ -492,7 +492,7 @@ static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
 		 */
 		/* adjust if we have wrapped */
 		if (dest >= pbuf->end)
-			dest -= pbuf->size;
+			dest -= pbuf->sc->size;
 		/* jump to the SOP range if within the first block */
 		else if (pbuf->qw_written < PIO_BLOCK_QWS)
 			dest += SOP_DISTANCE;
@@ -584,8 +584,8 @@ static void mid_copy_straight(struct pio_buf *pbuf,
 			dest += sizeof(u64);
 		}
 
-		dest -= pbuf->size;
-		dend -= pbuf->size;
+		dest -= pbuf->sc->size;
+		dend -= pbuf->sc->size;
 	}
 
 	/* write 8-byte non-SOP, non-wrap chunk data */
@@ -666,7 +666,7 @@ void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
 			 */
 			/* adjust if we've wrapped */
 			if (dest >= pbuf->end)
-				dest -= pbuf->size;
+				dest -= pbuf->sc->size;
 			/* jump to SOP range if within the first block */
 			else if (pbuf->qw_written < PIO_BLOCK_QWS)
 				dest += SOP_DISTANCE;
@@ -719,7 +719,7 @@ void seg_pio_copy_end(struct pio_buf *pbuf)
 	 */
 	/* adjust if we have wrapped */
 	if (dest >= pbuf->end)
-		dest -= pbuf->size;
+		dest -= pbuf->sc->size;
 	/* jump to the SOP range if within the first block */
 	else if (pbuf->qw_written < PIO_BLOCK_QWS)
 		dest += SOP_DISTANCE;

+ 167 - 26
drivers/infiniband/hw/hfi1/platform.c

@@ -49,6 +49,90 @@
 #include "efivar.h"
 #include "eprom.h"
 
+static int validate_scratch_checksum(struct hfi1_devdata *dd)
+{
+	u64 checksum = 0, temp_scratch = 0;
+	int i, j, version;
+
+	temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH);
+	version = (temp_scratch & BITMAP_VERSION_SMASK) >> BITMAP_VERSION_SHIFT;
+
+	/* Prevent power on default of all zeroes from passing checksum */
+	if (!version)
+		return 0;
+
+	/*
+	 * ASIC scratch 0 only contains the checksum and bitmap version as
+	 * fields of interest, both of which are handled separately from the
+	 * loop below, so skip it
+	 */
+	checksum += version;
+	for (i = 1; i < ASIC_NUM_SCRATCH; i++) {
+		temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH + (8 * i));
+		for (j = sizeof(u64); j != 0; j -= 2) {
+			checksum += (temp_scratch & 0xFFFF);
+			temp_scratch >>= 16;
+		}
+	}
+
+	while (checksum >> 16)
+		checksum = (checksum & CHECKSUM_MASK) + (checksum >> 16);
+
+	temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH);
+	temp_scratch &= CHECKSUM_SMASK;
+	temp_scratch >>= CHECKSUM_SHIFT;
+
+	if (checksum + temp_scratch == 0xFFFF)
+		return 1;
+	return 0;
+}
+
+static void save_platform_config_fields(struct hfi1_devdata *dd)
+{
+	struct hfi1_pportdata *ppd = dd->pport;
+	u64 temp_scratch = 0, temp_dest = 0;
+
+	temp_scratch = read_csr(dd, ASIC_CFG_SCRATCH_1);
+
+	temp_dest = temp_scratch &
+		    (dd->hfi1_id ? PORT1_PORT_TYPE_SMASK :
+		     PORT0_PORT_TYPE_SMASK);
+	ppd->port_type = temp_dest >>
+			 (dd->hfi1_id ? PORT1_PORT_TYPE_SHIFT :
+			  PORT0_PORT_TYPE_SHIFT);
+
+	temp_dest = temp_scratch &
+		    (dd->hfi1_id ? PORT1_LOCAL_ATTEN_SMASK :
+		     PORT0_LOCAL_ATTEN_SMASK);
+	ppd->local_atten = temp_dest >>
+			   (dd->hfi1_id ? PORT1_LOCAL_ATTEN_SHIFT :
+			    PORT0_LOCAL_ATTEN_SHIFT);
+
+	temp_dest = temp_scratch &
+		    (dd->hfi1_id ? PORT1_REMOTE_ATTEN_SMASK :
+		     PORT0_REMOTE_ATTEN_SMASK);
+	ppd->remote_atten = temp_dest >>
+			    (dd->hfi1_id ? PORT1_REMOTE_ATTEN_SHIFT :
+			     PORT0_REMOTE_ATTEN_SHIFT);
+
+	temp_dest = temp_scratch &
+		    (dd->hfi1_id ? PORT1_DEFAULT_ATTEN_SMASK :
+		     PORT0_DEFAULT_ATTEN_SMASK);
+	ppd->default_atten = temp_dest >>
+			     (dd->hfi1_id ? PORT1_DEFAULT_ATTEN_SHIFT :
+			      PORT0_DEFAULT_ATTEN_SHIFT);
+
+	temp_scratch = read_csr(dd, dd->hfi1_id ? ASIC_CFG_SCRATCH_3 :
+				ASIC_CFG_SCRATCH_2);
+
+	ppd->tx_preset_eq = (temp_scratch & TX_EQ_SMASK) >> TX_EQ_SHIFT;
+	ppd->tx_preset_noeq = (temp_scratch & TX_NO_EQ_SMASK) >> TX_NO_EQ_SHIFT;
+	ppd->rx_preset = (temp_scratch & RX_SMASK) >> RX_SHIFT;
+
+	ppd->max_power_class = (temp_scratch & QSFP_MAX_POWER_SMASK) >>
+				QSFP_MAX_POWER_SHIFT;
+}
+
 void get_platform_config(struct hfi1_devdata *dd)
 {
 	int ret = 0;
@@ -56,38 +140,49 @@ void get_platform_config(struct hfi1_devdata *dd)
 	u8 *temp_platform_config = NULL;
 	u32 esize;
 
-	ret = eprom_read_platform_config(dd, (void **)&temp_platform_config,
-					 &esize);
-	if (!ret) {
-		/* success */
-		size = esize;
-		goto success;
+	if (is_integrated(dd)) {
+		if (validate_scratch_checksum(dd)) {
+			save_platform_config_fields(dd);
+			return;
+		}
+		dd_dev_err(dd, "%s: Config bitmap corrupted/uninitialized\n",
+			   __func__);
+		dd_dev_err(dd,
+			   "%s: Please update your BIOS to support active channels\n",
+			   __func__);
+	} else {
+		ret = eprom_read_platform_config(dd,
+						 (void **)&temp_platform_config,
+						 &esize);
+		if (!ret) {
+			/* success */
+			dd->platform_config.data = temp_platform_config;
+			dd->platform_config.size = esize;
+			return;
+		}
+		/* fail, try EFI variable */
+
+		ret = read_hfi1_efi_var(dd, "configuration", &size,
+					(void **)&temp_platform_config);
+		if (!ret) {
+			dd->platform_config.data = temp_platform_config;
+			dd->platform_config.size = size;
+			return;
+		}
 	}
-	/* fail, try EFI variable */
-
-	ret = read_hfi1_efi_var(dd, "configuration", &size,
-				(void **)&temp_platform_config);
-	if (!ret)
-		goto success;
-
-	dd_dev_info(dd,
-		    "%s: Failed to get platform config from UEFI, falling back to request firmware\n",
-		    __func__);
+	dd_dev_err(dd,
+		   "%s: Failed to get platform config, falling back to sub-optimal default file\n",
+		   __func__);
 	/* fall back to request firmware */
 	platform_config_load = 1;
-	return;
-
-success:
-	dd->platform_config.data = temp_platform_config;
-	dd->platform_config.size = size;
 }
 
 void free_platform_config(struct hfi1_devdata *dd)
 {
 	if (!platform_config_load) {
 		/*
-		 * was loaded from EFI, release memory
-		 * allocated by read_efi_var
+		 * was loaded from EFI or the EPROM, release memory
+		 * allocated by read_efi_var/eprom_read_platform_config
 		 */
 		kfree(dd->platform_config.data);
 	}
@@ -100,12 +195,16 @@ void free_platform_config(struct hfi1_devdata *dd)
 void get_port_type(struct hfi1_pportdata *ppd)
 {
 	int ret;
+	u32 temp;
 
 	ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
-					PORT_TABLE_PORT_TYPE, &ppd->port_type,
+					PORT_TABLE_PORT_TYPE, &temp,
 					4);
-	if (ret)
+	if (ret) {
 		ppd->port_type = PORT_TYPE_UNKNOWN;
+		return;
+	}
+	ppd->port_type = temp;
 }
 
 int set_qsfp_tx(struct hfi1_pportdata *ppd, int on)
@@ -538,6 +637,38 @@ static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id,
 	}
 }
 
+/*
+ * Return a special SerDes setting for low power AOC cables.  The power class
+ * threshold and setting being used were all found by empirical testing.
+ *
+ * Summary of the logic:
+ *
+ * if (QSFP and QSFP_TYPE == AOC and QSFP_POWER_CLASS < 4)
+ *     return 0xe
+ * return 0; // leave at default
+ */
+static u8 aoc_low_power_setting(struct hfi1_pportdata *ppd)
+{
+	u8 *cache = ppd->qsfp_info.cache;
+	int power_class;
+
+	/* QSFP only */
+	if (ppd->port_type != PORT_TYPE_QSFP)
+		return 0; /* leave at default */
+
+	/* active optical cables only */
+	switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) {
+	case 0x0 ... 0x9: /* fallthrough */
+	case 0xC: /* fallthrough */
+	case 0xE:
+		/* active AOC */
+		power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+		if (power_class < QSFP_POWER_CLASS_4)
+			return 0xe;
+	}
+	return 0; /* leave at default */
+}
+
 static void apply_tunings(
 		struct hfi1_pportdata *ppd, u32 tx_preset_index,
 		u8 tuning_method, u32 total_atten, u8 limiting_active)
@@ -606,7 +737,17 @@ static void apply_tunings(
 		tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4);
 	postcur = tx_preset;
 
-	config_data = precur | (attn << 8) | (postcur << 16);
+	/*
+	 * NOTES:
+	 * o The aoc_low_power_setting is applied to all lanes even
+	 *   though only lane 0's value is examined by the firmware.
+	 * o A lingering low power setting after a cable swap does
+	 *   not occur.  On cable unplug the 8051 is reset and
+	 *   restarted on cable insert.  This resets all settings to
+	 *   their default, erasing any previous low power setting.
+	 */
+	config_data = precur | (attn << 8) | (postcur << 16) |
+			(aoc_low_power_setting(ppd) << 24);
 
 	apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data,
 		       "Applying TX settings");

+ 117 - 10
drivers/infiniband/hw/hfi1/platform.h

@@ -168,16 +168,6 @@ struct platform_config_cache {
 	struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
 };
 
-static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
-	0,
-	SYSTEM_TABLE_MAX,
-	PORT_TABLE_MAX,
-	RX_PRESET_TABLE_MAX,
-	TX_PRESET_TABLE_MAX,
-	QSFP_ATTEN_TABLE_MAX,
-	VARIABLE_SETTINGS_TABLE_MAX
-};
-
 /* This section defines default values and encodings for the
  * fields defined for each table above
  */
@@ -295,6 +285,123 @@ enum link_tuning_encoding {
 	OPA_UNKNOWN_TUNING
 };
 
+/*
+ * Shifts and masks for the link SI tuning values stuffed into the ASIC scratch
+ * registers for integrated platforms
+ */
+#define PORT0_PORT_TYPE_SHIFT		0
+#define PORT0_LOCAL_ATTEN_SHIFT		4
+#define PORT0_REMOTE_ATTEN_SHIFT	10
+#define PORT0_DEFAULT_ATTEN_SHIFT	32
+
+#define PORT1_PORT_TYPE_SHIFT		16
+#define PORT1_LOCAL_ATTEN_SHIFT		20
+#define PORT1_REMOTE_ATTEN_SHIFT	26
+#define PORT1_DEFAULT_ATTEN_SHIFT	40
+
+#define PORT0_PORT_TYPE_MASK		0xFUL
+#define PORT0_LOCAL_ATTEN_MASK		0x3FUL
+#define PORT0_REMOTE_ATTEN_MASK		0x3FUL
+#define PORT0_DEFAULT_ATTEN_MASK	0xFFUL
+
+#define PORT1_PORT_TYPE_MASK		0xFUL
+#define PORT1_LOCAL_ATTEN_MASK		0x3FUL
+#define PORT1_REMOTE_ATTEN_MASK		0x3FUL
+#define PORT1_DEFAULT_ATTEN_MASK	0xFFUL
+
+#define PORT0_PORT_TYPE_SMASK		(PORT0_PORT_TYPE_MASK << \
+					 PORT0_PORT_TYPE_SHIFT)
+#define PORT0_LOCAL_ATTEN_SMASK		(PORT0_LOCAL_ATTEN_MASK << \
+					 PORT0_LOCAL_ATTEN_SHIFT)
+#define PORT0_REMOTE_ATTEN_SMASK	(PORT0_REMOTE_ATTEN_MASK << \
+					 PORT0_REMOTE_ATTEN_SHIFT)
+#define PORT0_DEFAULT_ATTEN_SMASK	(PORT0_DEFAULT_ATTEN_MASK << \
+					 PORT0_DEFAULT_ATTEN_SHIFT)
+
+#define PORT1_PORT_TYPE_SMASK		(PORT1_PORT_TYPE_MASK << \
+					 PORT1_PORT_TYPE_SHIFT)
+#define PORT1_LOCAL_ATTEN_SMASK		(PORT1_LOCAL_ATTEN_MASK << \
+					 PORT1_LOCAL_ATTEN_SHIFT)
+#define PORT1_REMOTE_ATTEN_SMASK	(PORT1_REMOTE_ATTEN_MASK << \
+					 PORT1_REMOTE_ATTEN_SHIFT)
+#define PORT1_DEFAULT_ATTEN_SMASK	(PORT1_DEFAULT_ATTEN_MASK << \
+					 PORT1_DEFAULT_ATTEN_SHIFT)
+
+#define QSFP_MAX_POWER_SHIFT		0
+#define TX_NO_EQ_SHIFT			4
+#define TX_EQ_SHIFT			25
+#define RX_SHIFT			46
+
+#define QSFP_MAX_POWER_MASK		0xFUL
+#define TX_NO_EQ_MASK			0x1FFFFFUL
+#define TX_EQ_MASK			0x1FFFFFUL
+#define RX_MASK				0xFFFFUL
+
+#define QSFP_MAX_POWER_SMASK		(QSFP_MAX_POWER_MASK << \
+					 QSFP_MAX_POWER_SHIFT)
+#define TX_NO_EQ_SMASK			(TX_NO_EQ_MASK << TX_NO_EQ_SHIFT)
+#define TX_EQ_SMASK			(TX_EQ_MASK << TX_EQ_SHIFT)
+#define RX_SMASK			(RX_MASK << RX_SHIFT)
+
+#define TX_PRECUR_SHIFT			0
+#define TX_ATTN_SHIFT			4
+#define QSFP_TX_CDR_APPLY_SHIFT		9
+#define QSFP_TX_EQ_APPLY_SHIFT		10
+#define QSFP_TX_CDR_SHIFT		11
+#define QSFP_TX_EQ_SHIFT		12
+#define TX_POSTCUR_SHIFT		16
+
+#define TX_PRECUR_MASK			0xFUL
+#define TX_ATTN_MASK			0x1FUL
+#define QSFP_TX_CDR_APPLY_MASK		0x1UL
+#define QSFP_TX_EQ_APPLY_MASK		0x1UL
+#define QSFP_TX_CDR_MASK		0x1UL
+#define QSFP_TX_EQ_MASK			0xFUL
+#define TX_POSTCUR_MASK			0x1FUL
+
+#define TX_PRECUR_SMASK			(TX_PRECUR_MASK << TX_PRECUR_SHIFT)
+#define TX_ATTN_SMASK			(TX_ATTN_MASK << TX_ATTN_SHIFT)
+#define QSFP_TX_CDR_APPLY_SMASK		(QSFP_TX_CDR_APPLY_MASK << \
+					 QSFP_TX_CDR_APPLY_SHIFT)
+#define QSFP_TX_EQ_APPLY_SMASK		(QSFP_TX_EQ_APPLY_MASK << \
+					 QSFP_TX_EQ_APPLY_SHIFT)
+#define QSFP_TX_CDR_SMASK		(QSFP_TX_CDR_MASK << QSFP_TX_CDR_SHIFT)
+#define QSFP_TX_EQ_SMASK		(QSFP_TX_EQ_MASK << QSFP_TX_EQ_SHIFT)
+#define TX_POSTCUR_SMASK		(TX_POSTCUR_MASK << TX_POSTCUR_SHIFT)
+
+#define QSFP_RX_CDR_APPLY_SHIFT		0
+#define QSFP_RX_EMP_APPLY_SHIFT		1
+#define QSFP_RX_AMP_APPLY_SHIFT		2
+#define QSFP_RX_CDR_SHIFT		3
+#define QSFP_RX_EMP_SHIFT		4
+#define QSFP_RX_AMP_SHIFT		8
+
+#define QSFP_RX_CDR_APPLY_MASK		0x1UL
+#define QSFP_RX_EMP_APPLY_MASK		0x1UL
+#define QSFP_RX_AMP_APPLY_MASK		0x1UL
+#define QSFP_RX_CDR_MASK		0x1UL
+#define QSFP_RX_EMP_MASK		0xFUL
+#define QSFP_RX_AMP_MASK		0x3UL
+
+#define QSFP_RX_CDR_APPLY_SMASK		(QSFP_RX_CDR_APPLY_MASK << \
+					 QSFP_RX_CDR_APPLY_SHIFT)
+#define QSFP_RX_EMP_APPLY_SMASK		(QSFP_RX_EMP_APPLY_MASK << \
+					 QSFP_RX_EMP_APPLY_SHIFT)
+#define QSFP_RX_AMP_APPLY_SMASK		(QSFP_RX_AMP_APPLY_MASK << \
+					 QSFP_RX_AMP_APPLY_SHIFT)
+#define QSFP_RX_CDR_SMASK		(QSFP_RX_CDR_MASK << QSFP_RX_CDR_SHIFT)
+#define QSFP_RX_EMP_SMASK		(QSFP_RX_EMP_MASK << QSFP_RX_EMP_SHIFT)
+#define QSFP_RX_AMP_SMASK		(QSFP_RX_AMP_MASK << QSFP_RX_AMP_SHIFT)
+
+#define BITMAP_VERSION			1
+#define BITMAP_VERSION_SHIFT		44
+#define BITMAP_VERSION_MASK		0xFUL
+#define BITMAP_VERSION_SMASK		(BITMAP_VERSION_MASK << \
+					 BITMAP_VERSION_SHIFT)
+#define CHECKSUM_SHIFT			48
+#define CHECKSUM_MASK			0xFFFFUL
+#define CHECKSUM_SMASK			(CHECKSUM_MASK << CHECKSUM_SHIFT)
+
 /* platform.c */
 void get_platform_config(struct hfi1_devdata *dd);
 void free_platform_config(struct hfi1_devdata *dd);
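
A minimal sketch of how the shift/mask pairs above are meant to be consumed when decoding a 64-bit scratch-register value on integrated platforms; the accessor name is an assumption, and no grouping of fields into particular scratch registers is implied:

        u64 scratch = read_scratch(dd);        /* hypothetical accessor */
        u8 p0_type  = (scratch & PORT0_PORT_TYPE_SMASK) >>
                                PORT0_PORT_TYPE_SHIFT;
        u8 p0_local = (scratch & PORT0_LOCAL_ATTEN_SMASK) >>
                                PORT0_LOCAL_ATTEN_SHIFT;
        u8 p1_type  = (scratch & PORT1_PORT_TYPE_SMASK) >>
                                PORT1_PORT_TYPE_SHIFT;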

+ 8 - 3
drivers/infiniband/hw/hfi1/qp.c

@@ -196,15 +196,18 @@ static void flush_tx_list(struct rvt_qp *qp)
 static void flush_iowait(struct rvt_qp *qp)
 {
 	struct hfi1_qp_priv *priv = qp->priv;
-	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
 	unsigned long flags;
+	seqlock_t *lock = priv->s_iowait.lock;
 
-	write_seqlock_irqsave(&dev->iowait_lock, flags);
+	if (!lock)
+		return;
+	write_seqlock_irqsave(lock, flags);
 	if (!list_empty(&priv->s_iowait.list)) {
 		list_del_init(&priv->s_iowait.list);
+		priv->s_iowait.lock = NULL;
 		rvt_put_qp(qp);
 	}
-	write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+	write_sequnlock_irqrestore(lock, flags);
 }
 
 static inline int opa_mtu_enum_to_int(int mtu)
@@ -543,6 +546,7 @@ static int iowait_sleep(
 			ibp->rvp.n_dmawait++;
 			qp->s_flags |= RVT_S_WAIT_DMA_DESC;
 			list_add_tail(&priv->s_iowait.list, &sde->dmawait);
+			priv->s_iowait.lock = &dev->iowait_lock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
 			rvt_get_qp(qp);
 		}
@@ -964,6 +968,7 @@ void notify_error_qp(struct rvt_qp *qp)
 	if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY)) {
 		qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
 		list_del_init(&priv->s_iowait.list);
+		priv->s_iowait.lock = NULL;
 		rvt_put_qp(qp);
 	}
 	write_sequnlock(&dev->iowait_lock);
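
The invariant these hunks establish: whenever a QP's iowait joins a wait list, the enqueue site records the seqlock guarding that list in s_iowait.lock, and every dequeue site clears it under that same lock. A condensed sketch (wait_list and lock are stand-ins, not driver symbols):

        /* enqueue side (iowait_sleep(), wait_kmem(), pio_wait(),
         * __get_txreq()): caller already holds 'lock'
         */
        list_add_tail(&priv->s_iowait.list, wait_list);
        priv->s_iowait.lock = lock;     /* remember which seqlock guards us */
        rvt_get_qp(qp);

        /* dequeue side: always done under the recorded lock */
        list_del_init(&priv->s_iowait.list);
        priv->s_iowait.lock = NULL;
        rvt_put_qp(qp);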

+ 13 - 47
drivers/infiniband/hw/hfi1/rc.c

@@ -276,7 +276,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 				rvt_get_mr(ps->s_txreq->mr);
 			qp->s_ack_rdma_sge.sge = e->rdma_sge;
 			qp->s_ack_rdma_sge.num_sge = 1;
-			qp->s_cur_sge = &qp->s_ack_rdma_sge;
+			ps->s_txreq->ss = &qp->s_ack_rdma_sge;
 			if (len > pmtu) {
 				len = pmtu;
 				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
@@ -290,7 +290,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 			bth2 = mask_psn(qp->s_ack_rdma_psn++);
 		} else {
 			/* COMPARE_SWAP or FETCH_ADD */
-			qp->s_cur_sge = NULL;
+			ps->s_txreq->ss = NULL;
 			len = 0;
 			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
 			ohdr->u.at.aeth = hfi1_compute_aeth(qp);
@@ -306,7 +306,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
 		/* FALLTHROUGH */
 	case OP(RDMA_READ_RESPONSE_MIDDLE):
-		qp->s_cur_sge = &qp->s_ack_rdma_sge;
+		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
 		ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
 		if (ps->s_txreq->mr)
 			rvt_get_mr(ps->s_txreq->mr);
@@ -335,7 +335,7 @@ normal:
 		 */
 		qp->s_ack_state = OP(SEND_ONLY);
 		qp->s_flags &= ~RVT_S_ACK_PENDING;
-		qp->s_cur_sge = NULL;
+		ps->s_txreq->ss = NULL;
 		if (qp->s_nak_state)
 			ohdr->u.aeth =
 				cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
@@ -351,7 +351,7 @@ normal:
 	qp->s_rdma_ack_cnt++;
 	qp->s_hdrwords = hwords;
 	ps->s_txreq->sde = priv->s_sde;
-	qp->s_cur_size = len;
+	ps->s_txreq->s_cur_size = len;
 	hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
 	/* pbc */
 	ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
@@ -801,8 +801,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	qp->s_len -= len;
 	qp->s_hdrwords = hwords;
 	ps->s_txreq->sde = priv->s_sde;
-	qp->s_cur_sge = ss;
-	qp->s_cur_size = len;
+	ps->s_txreq->ss = ss;
+	ps->s_txreq->s_cur_size = len;
 	hfi1_make_ruc_header(
 		qp,
 		ohdr,
@@ -1146,8 +1146,6 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
 {
 	struct ib_other_headers *ohdr;
 	struct rvt_swqe *wqe;
-	struct ib_wc wc;
-	unsigned i;
 	u32 opcode;
 	u32 psn;
 
@@ -1195,22 +1193,8 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
 		qp->s_last = s_last;
 		/* see post_send() */
 		barrier();
-		for (i = 0; i < wqe->wr.num_sge; i++) {
-			struct rvt_sge *sge = &wqe->sg_list[i];
-
-			rvt_put_mr(sge->mr);
-		}
-		/* Post a send completion queue entry if requested. */
-		if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-			memset(&wc, 0, sizeof(wc));
-			wc.wr_id = wqe->wr.wr_id;
-			wc.status = IB_WC_SUCCESS;
-			wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-			wc.byte_len = wqe->length;
-			wc.qp = &qp->ibqp;
-			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
-		}
+		rvt_put_swqe(wqe);
+		rvt_qp_swqe_complete(qp, wqe, IB_WC_SUCCESS);
 	}
 	/*
 	 * If we were waiting for sends to complete before re-sending,
@@ -1240,9 +1224,6 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 					 struct rvt_swqe *wqe,
 					 struct hfi1_ibport *ibp)
 {
-	struct ib_wc wc;
-	unsigned i;
-
 	lockdep_assert_held(&qp->s_lock);
 	/*
 	 * Don't decrement refcount and don't generate a
@@ -1253,28 +1234,14 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
 		u32 s_last;
 
-		for (i = 0; i < wqe->wr.num_sge; i++) {
-			struct rvt_sge *sge = &wqe->sg_list[i];
-
-			rvt_put_mr(sge->mr);
-		}
+		rvt_put_swqe(wqe);
 		s_last = qp->s_last;
 		if (++s_last >= qp->s_size)
 			s_last = 0;
 		qp->s_last = s_last;
 		/* see post_send() */
 		barrier();
-		/* Post a send completion queue entry if requested. */
-		if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-			memset(&wc, 0, sizeof(wc));
-			wc.wr_id = wqe->wr.wr_id;
-			wc.status = IB_WC_SUCCESS;
-			wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-			wc.byte_len = wqe->length;
-			wc.qp = &qp->ibqp;
-			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
-		}
+		rvt_qp_swqe_complete(qp, wqe, IB_WC_SUCCESS);
 	} else {
 		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 
@@ -2295,7 +2262,7 @@ send_last:
 		hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last);
 		rvt_put_ss(&qp->r_sge);
 		qp->r_msn++;
-		if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+		if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
 			break;
 		wc.wr_id = qp->r_wr_id;
 		wc.status = IB_WC_SUCCESS;
@@ -2410,8 +2377,7 @@ send_last:
 			 * Update the next expected PSN.  We add 1 later
 			 * below, so only add the remainder here.
 			 */
-			if (len > pmtu)
-				qp->r_psn += (len - 1) / pmtu;
+			qp->r_psn += rvt_div_mtu(qp, len - 1);
 		} else {
 			e->rdma_sge.mr = NULL;
 			e->rdma_sge.vaddr = NULL;
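
rvt_put_swqe() and rvt_qp_swqe_complete() are rdmavt helpers added elsewhere in this series (include/rdma/rdmavt_qp.h). Judging from the open-coded sequences they replace here, in ruc.c and in the qib counterparts, they presumably reduce to something like:

static inline void rvt_put_swqe(struct rvt_swqe *wqe)
{
        int i;

        /* drop the MR references held by the send WQE's SGEs */
        for (i = 0; i < wqe->wr.num_sge; i++)
                rvt_put_mr(wqe->sg_list[i].mr);
}

static inline void rvt_qp_swqe_complete(struct rvt_qp *qp,
                                        struct rvt_swqe *wqe,
                                        enum ib_wc_status status)
{
        /* see IBTA ch. 11.2.4.1 and 10.7.3.1 */
        if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
            (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
            status != IB_WC_SUCCESS) {
                struct ib_wc wc;

                memset(&wc, 0, sizeof(wc));
                wc.wr_id = wqe->wr.wr_id;
                wc.status = status;
                wc.opcode = ib_rvt_wc_opcode[wqe->wr.opcode];
                wc.qp = &qp->ibqp;
                if (status == IB_WC_SUCCESS)
                        wc.byte_len = wqe->length;
                rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
                             status != IB_WC_SUCCESS);
        }
}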

+ 7 - 37
drivers/infiniband/hw/hfi1/ruc.c

@@ -239,16 +239,6 @@ bail:
 	return ret;
 }
 
-static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
-{
-	if (!index) {
-		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-		return cpu_to_be64(ppd->guid);
-	}
-	return ibp->guids[index - 1];
-}
-
 static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
 {
 	return (gid->global.interface_id == id &&
@@ -699,9 +689,9 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
 	/* The SGID is 32-bit aligned. */
 	hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix;
 	hdr->sgid.global.interface_id =
-		grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
-		ibp->guids[grh->sgid_index - 1] :
-			cpu_to_be64(ppd_from_ibp(ibp)->guid);
+		grh->sgid_index < HFI1_GUIDS_PER_PORT ?
+		get_sguid(ibp, grh->sgid_index) :
+		get_sguid(ibp, HFI1_PORT_GUID_INDEX);
 	hdr->dgid = grh->dgid;
 
 	/* GRH header size in 32-bit words. */
@@ -777,8 +767,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
 	u32 bth1;
 
 	/* Construct the header. */
-	extra_bytes = -qp->s_cur_size & 3;
-	nwords = (qp->s_cur_size + extra_bytes) >> 2;
+	extra_bytes = -ps->s_txreq->s_cur_size & 3;
+	nwords = (ps->s_txreq->s_cur_size + extra_bytes) >> 2;
 	lrh0 = HFI1_LRH_BTH;
 	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
 		qp->s_hdrwords += hfi1_make_grh(ibp,
@@ -952,7 +942,6 @@ void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
 			enum ib_wc_status status)
 {
 	u32 old_last, last;
-	unsigned i;
 
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
 		return;
@@ -964,32 +953,13 @@ void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
 	qp->s_last = last;
 	/* See post_send() */
 	barrier();
-	for (i = 0; i < wqe->wr.num_sge; i++) {
-		struct rvt_sge *sge = &wqe->sg_list[i];
-
-		rvt_put_mr(sge->mr);
-	}
+	rvt_put_swqe(wqe);
 	if (qp->ibqp.qp_type == IB_QPT_UD ||
 	    qp->ibqp.qp_type == IB_QPT_SMI ||
 	    qp->ibqp.qp_type == IB_QPT_GSI)
 		atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
 
-	/* See ch. 11.2.4.1 and 10.7.3.1 */
-	if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-	    status != IB_WC_SUCCESS) {
-		struct ib_wc wc;
-
-		memset(&wc, 0, sizeof(wc));
-		wc.wr_id = wqe->wr.wr_id;
-		wc.status = status;
-		wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
-		wc.qp = &qp->ibqp;
-		if (status == IB_WC_SUCCESS)
-			wc.byte_len = wqe->length;
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
-			     status != IB_WC_SUCCESS);
-	}
+	rvt_qp_swqe_complete(qp, wqe, status);
 
 	if (qp->s_acked == old_last)
 		qp->s_acked = last;

+ 9 - 9
drivers/infiniband/hw/hfi1/sdma.c

@@ -375,7 +375,7 @@ static inline void complete_tx(struct sdma_engine *sde,
 			   sde->head_sn, tx->sn);
 	sde->head_sn++;
 #endif
-	sdma_txclean(sde->dd, tx);
+	__sdma_txclean(sde->dd, tx);
 	if (complete)
 		(*complete)(tx, res);
 	if (wait && iowait_sdma_dec(wait))
@@ -1643,7 +1643,7 @@ static inline u8 ahg_mode(struct sdma_txreq *tx)
 }
 
 /**
- * sdma_txclean() - clean tx of mappings, descp *kmalloc's
+ * __sdma_txclean() - clean tx of mappings, descp *kmalloc's
  * @dd: hfi1_devdata for unmapping
  * @tx: tx request to clean
  *
@@ -1653,7 +1653,7 @@ static inline u8 ahg_mode(struct sdma_txreq *tx)
  * The code can be called multiple times without issue.
  *
  */
-void sdma_txclean(
+void __sdma_txclean(
 	struct hfi1_devdata *dd,
 	struct sdma_txreq *tx)
 {
@@ -3080,7 +3080,7 @@ static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
 		tx->descp[i] = tx->descs[i];
 	return 0;
 enomem:
-	sdma_txclean(dd, tx);
+	__sdma_txclean(dd, tx);
 	return -ENOMEM;
 }
 
@@ -3109,14 +3109,14 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
 
 	rval = _extend_sdma_tx_descs(dd, tx);
 	if (rval) {
-		sdma_txclean(dd, tx);
+		__sdma_txclean(dd, tx);
 		return rval;
 	}
 
 	/* If coalesce buffer is allocated, copy data into it */
 	if (tx->coalesce_buf) {
 		if (type == SDMA_MAP_NONE) {
-			sdma_txclean(dd, tx);
+			__sdma_txclean(dd, tx);
 			return -EINVAL;
 		}
 
@@ -3124,7 +3124,7 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
 			kvaddr = kmap(page);
 			kvaddr += offset;
 		} else if (WARN_ON(!kvaddr)) {
-			sdma_txclean(dd, tx);
+			__sdma_txclean(dd, tx);
 			return -EINVAL;
 		}
 
@@ -3154,7 +3154,7 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
 				      DMA_TO_DEVICE);
 
 		if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-			sdma_txclean(dd, tx);
+			__sdma_txclean(dd, tx);
 			return -ENOSPC;
 		}
 
@@ -3196,7 +3196,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
 	if ((unlikely(tx->num_desc == tx->desc_limit))) {
 		rval = _extend_sdma_tx_descs(dd, tx);
 		if (rval) {
-			sdma_txclean(dd, tx);
+			__sdma_txclean(dd, tx);
 			return rval;
 		}
 	}

+ 9 - 3
drivers/infiniband/hw/hfi1/sdma.h

@@ -667,7 +667,13 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
 			   int type, void *kvaddr, struct page *page,
 			   unsigned long offset, u16 len);
 int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
-void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
+void __sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
+
+static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+	if (tx->num_desc)
+		__sdma_txclean(dd, tx);
+}
 
 /* helpers used by public routines */
 static inline void _sdma_close_tx(struct hfi1_devdata *dd,
@@ -753,7 +759,7 @@ static inline int sdma_txadd_page(
 		       DMA_TO_DEVICE);
 
 	if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-		sdma_txclean(dd, tx);
+		__sdma_txclean(dd, tx);
 		return -ENOSPC;
 	}
 
@@ -834,7 +840,7 @@ static inline int sdma_txadd_kvaddr(
 		       DMA_TO_DEVICE);
 
 	if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
-		sdma_txclean(dd, tx);
+		__sdma_txclean(dd, tx);
 		return -ENOSPC;
 	}
 

+ 2 - 2
drivers/infiniband/hw/hfi1/uc.c

@@ -258,8 +258,8 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 	qp->s_len -= len;
 	qp->s_hdrwords = hwords;
 	ps->s_txreq->sde = priv->s_sde;
-	qp->s_cur_sge = &qp->s_sge;
-	qp->s_cur_size = len;
+	ps->s_txreq->ss = &qp->s_sge;
+	ps->s_txreq->s_cur_size = len;
 	hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
 			     mask_psn(qp->s_psn++), middle, ps);
 	/* pbc */

+ 2 - 2
drivers/infiniband/hw/hfi1/ud.c

@@ -354,8 +354,8 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 
 	/* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
 	qp->s_hdrwords = 7;
-	qp->s_cur_size = wqe->length;
-	qp->s_cur_sge = &qp->s_sge;
+	ps->s_txreq->s_cur_size = wqe->length;
+	ps->s_txreq->ss = &qp->s_sge;
 	qp->s_srate = ah_attr->static_rate;
 	qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
 	qp->s_wqe = wqe;

+ 43 - 17
drivers/infiniband/hw/hfi1/user_sdma.c

@@ -115,6 +115,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
 #define KDETH_HCRC_LOWER_MASK     0xff
 
 #define AHG_KDETH_INTR_SHIFT 12
+#define AHG_KDETH_SH_SHIFT   13
 
 #define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
 #define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
@@ -144,8 +145,9 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
 #define KDETH_OM_LARGE     64
 #define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
 
-/* Last packet in the request */
-#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
+/* Tx request flag bits */
+#define TXREQ_FLAGS_REQ_ACK   BIT(0)      /* Set the ACK bit in the header */
+#define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */
 
 /* SDMA request flag bits */
 #define SDMA_REQ_FOR_THREAD 1
@@ -943,8 +945,13 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 		tx->busycount = 0;
 		INIT_LIST_HEAD(&tx->list);
 
+		/*
+		 * For the last packet set the ACK request
+		 * and disable header suppression.
+		 */
 		if (req->seqnum == req->info.npkts - 1)
-			tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
+			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
+				      TXREQ_FLAGS_REQ_DISABLE_SH);
 
 		/*
 		 * Calculate the payload size - this is min of the fragment
@@ -963,11 +970,22 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 			}
 
 			datalen = compute_data_length(req, tx);
+
+			/*
+			 * Disable header suppression when the payload is 8 DWs
+			 * (32 bytes) or less.  If an uncorrectable error occurs
+			 * in the receive data FIFO while the received payload
+			 * is less than or equal to 8 DWs, RxDmaDataFifoRdUncErr
+			 * is not reported; instead, RHF.EccErr is set when the
+			 * header is not suppressed.
+			 */
 			if (!datalen) {
 				SDMA_DBG(req,
 					 "Request has data but pkt len is 0");
 				ret = -EFAULT;
 				goto free_tx;
+			} else if (datalen <= 32) {
+				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 			}
 		}
 
@@ -990,6 +1008,10 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 						LRH2PBC(lrhlen);
 					tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 				}
+				ret = check_header_template(req, &tx->hdr,
+							    lrhlen, datalen);
+				if (ret)
+					goto free_tx;
 				ret = sdma_txinit_ahg(&tx->txreq,
 						      SDMA_TXREQ_F_AHG_COPY,
 						      sizeof(tx->hdr) + datalen,
@@ -1351,7 +1373,7 @@ static int set_txreq_header(struct user_sdma_request *req,
 				req->seqnum));
 
 	/* Set ACK request on last packet */
-	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 		hdr->bth[2] |= cpu_to_be32(1UL << 31);
 
 	/* Set the new offset */
@@ -1384,8 +1406,8 @@ static int set_txreq_header(struct user_sdma_request *req,
 		/* Set KDETH.TID based on value for this TID */
 		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
 			  EXP_TID_GET(tidval, IDX));
-		/* Clear KDETH.SH only on the last packet */
-		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+		/* Clear KDETH.SH when DISABLE_SH flag is set */
+		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
 			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
 		/*
 		 * Set the KDETH.OFFSET and KDETH.OM based on size of
@@ -1429,7 +1451,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 	/* BTH.PSN and BTH.A */
 	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
 		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
-	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 		val32 |= 1UL << 31;
 	AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
 	AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
@@ -1468,19 +1490,23 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 		AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
 			       ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
 				((req->tidoffset / req->omfactor) & 0x7fff)));
-		/* KDETH.TIDCtrl, KDETH.TID */
+		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
 		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
-					(EXP_TID_GET(tidval, IDX) & 0x3ff));
-		/* Clear KDETH.SH on last packet */
-		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
-			val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
-						     INTR) <<
-					   AHG_KDETH_INTR_SHIFT);
-			val &= cpu_to_le16(~(1U << 13));
-			AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
+				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
+
+		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
+			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
+						      INTR) <<
+					    AHG_KDETH_INTR_SHIFT));
 		} else {
-			AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
+			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
+			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
+			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
+						      INTR) <<
+					     AHG_KDETH_INTR_SHIFT));
 		}
+
+		AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
 	}
 
 	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
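
For reference, the 14-bit value written with AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val) lays out as bits 0-9 KDETH.TID, bits 10-11 KDETH.TIDCtrl, bit 12 KDETH.Intr (AHG_KDETH_INTR_SHIFT) and bit 13 KDETH.SH (AHG_KDETH_SH_SHIFT). A host-endian sketch of the selection above, where intr and sh stand for the current KDETH.Intr and KDETH.SH header bits (the real code builds a __le16):

        u16 val = (EXP_TID_GET(tidval, IDX) & 0x3ff) |
                  ((EXP_TID_GET(tidval, CTRL) & 0x3) << 10);

        if (tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)
                val |= intr << AHG_KDETH_INTR_SHIFT;   /* SH cleared, keep Intr */
        else if (sh)
                val |= 1 << AHG_KDETH_SH_SHIFT;        /* keep suppression on */
        else
                val |= intr << AHG_KDETH_INTR_SHIFT;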

+ 174 - 35
drivers/infiniband/hw/hfi1/verbs.c

@@ -296,22 +296,6 @@ static inline int wss_exceeds_threshold(void)
 	return atomic_read(&wss.total_count) >= wss.threshold;
 }
 
-/*
- * Translate ib_wr_opcode into ib_wc_opcode.
- */
-const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
-	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
-	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
-	[IB_WR_SEND] = IB_WC_SEND,
-	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
-	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
-	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
-	[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
-	[IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
-	[IB_WR_REG_MR] = IB_WC_REG_MR
-};
-
 /*
  * Length of header by opcode, 0 --> not supported
  */
@@ -694,6 +678,7 @@ static void mem_timer(unsigned long data)
 		qp = iowait_to_qp(wait);
 		priv = qp->priv;
 		list_del_init(&priv->s_iowait.list);
+		priv->s_iowait.lock = NULL;
 		/* refcount held until actual wake up */
 		if (!list_empty(list))
 			mod_timer(&dev->mem_timer, jiffies + 1);
@@ -769,6 +754,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
 				mod_timer(&dev->mem_timer, jiffies + 1);
 			qp->s_flags |= RVT_S_WAIT_KMEM;
 			list_add_tail(&priv->s_iowait.list, &dev->memwait);
+			priv->s_iowait.lock = &dev->iowait_lock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
 			rvt_get_qp(qp);
 		}
@@ -788,10 +774,10 @@ static int wait_kmem(struct hfi1_ibdev *dev,
  */
 static noinline int build_verbs_ulp_payload(
 	struct sdma_engine *sde,
-	struct rvt_sge_state *ss,
 	u32 length,
 	struct verbs_txreq *tx)
 {
+	struct rvt_sge_state *ss = tx->ss;
 	struct rvt_sge *sg_list = ss->sg_list;
 	struct rvt_sge sge = ss->sge;
 	u8 num_sge = ss->num_sge;
@@ -835,7 +821,6 @@ bail_txadd:
 /* New API */
 static int build_verbs_tx_desc(
 	struct sdma_engine *sde,
-	struct rvt_sge_state *ss,
 	u32 length,
 	struct verbs_txreq *tx,
 	struct hfi1_ahg_info *ahg_info,
@@ -879,9 +864,9 @@ static int build_verbs_tx_desc(
 			goto bail_txadd;
 	}
 
-	/* add the ulp payload - if any.  ss can be NULL for acks */
-	if (ss)
-		ret = build_verbs_ulp_payload(sde, ss, length, tx);
+	/* add the ulp payload - if any. tx->ss can be NULL for acks */
+	if (tx->ss)
+		ret = build_verbs_ulp_payload(sde, length, tx);
 bail_txadd:
 	return ret;
 }
@@ -892,8 +877,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct hfi1_ahg_info *ahg_info = priv->s_ahg;
 	u32 hdrwords = qp->s_hdrwords;
-	struct rvt_sge_state *ss = qp->s_cur_sge;
-	u32 len = qp->s_cur_size;
+	u32 len = ps->s_txreq->s_cur_size;
 	u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */
 	struct hfi1_ibdev *dev = ps->dev;
 	struct hfi1_pportdata *ppd = ps->ppd;
@@ -918,7 +902,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 					 plen);
 		}
 		tx->wqe = qp->s_wqe;
-		ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahg_info, pbc);
+		ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
 		if (unlikely(ret))
 			goto bail_build;
 	}
@@ -980,6 +964,7 @@ static int pio_wait(struct rvt_qp *qp,
 			qp->s_flags |= flag;
 			was_empty = list_empty(&sc->piowait);
 			list_add_tail(&priv->s_iowait.list, &sc->piowait);
+			priv->s_iowait.lock = &dev->iowait_lock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
 			rvt_get_qp(qp);
 			/* counting: only call wantpiobuf_intr if first user */
@@ -1008,8 +993,8 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 	u32 hdrwords = qp->s_hdrwords;
-	struct rvt_sge_state *ss = qp->s_cur_sge;
-	u32 len = qp->s_cur_size;
+	struct rvt_sge_state *ss = ps->s_txreq->ss;
+	u32 len = ps->s_txreq->s_cur_size;
 	u32 dwords = (len + 3) >> 2;
 	u32 plen = hdrwords + dwords + 2; /* includes pbc */
 	struct hfi1_pportdata *ppd = ps->ppd;
@@ -1237,7 +1222,7 @@ static inline send_routine get_send_routine(struct rvt_qp *qp,
 		u8 op = get_opcode(h);
 
 		if (piothreshold &&
-		    qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+		    tx->s_cur_size <= min(piothreshold, qp->pmtu) &&
 		    (BIT(op & OPMASK) & pio_opmask[op >> 5]) &&
 		    iowait_sdma_pending(&priv->s_iowait) == 0 &&
 		    !sdma_txreq_built(&tx->txreq))
@@ -1483,15 +1468,11 @@ static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
 			    int guid_index, __be64 *guid)
 {
 	struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
-	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 
-	if (guid_index == 0)
-		*guid = cpu_to_be64(ppd->guid);
-	else if (guid_index < HFI1_GUIDS_PER_PORT)
-		*guid = ibp->guids[guid_index - 1];
-	else
+	if (guid_index >= HFI1_GUIDS_PER_PORT)
 		return -EINVAL;
 
+	*guid = get_sguid(ibp, guid_index);
 	return 0;
 }
 
@@ -1610,6 +1591,154 @@ static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str,
 		 dc8051_ver_min(ver));
 }
 
+static const char * const driver_cntr_names[] = {
+	/* must be element 0 */
+	"DRIVER_KernIntr",
+	"DRIVER_ErrorIntr",
+	"DRIVER_Tx_Errs",
+	"DRIVER_Rcv_Errs",
+	"DRIVER_HW_Errs",
+	"DRIVER_NoPIOBufs",
+	"DRIVER_CtxtsOpen",
+	"DRIVER_RcvLen_Errs",
+	"DRIVER_EgrBufFull",
+	"DRIVER_EgrHdrFull"
+};
+
+static const char **dev_cntr_names;
+static const char **port_cntr_names;
+static int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names);
+static int num_dev_cntrs;
+static int num_port_cntrs;
+static int cntr_names_initialized;
+
+/*
+ * Convert a list of names separated by '\n' into an array of NULL terminated
+ * strings. Optionally some entries can be reserved in the array to hold extra
+ * external strings.
+ */
+static int init_cntr_names(const char *names_in,
+			   const int names_len,
+			   int num_extra_names,
+			   int *num_cntrs,
+			   const char ***cntr_names)
+{
+	char *names_out, *p, **q;
+	int i, n;
+
+	n = 0;
+	for (i = 0; i < names_len; i++)
+		if (names_in[i] == '\n')
+			n++;
+
+	names_out = kmalloc((n + num_extra_names) * sizeof(char *) + names_len,
+			    GFP_KERNEL);
+	if (!names_out) {
+		*num_cntrs = 0;
+		*cntr_names = NULL;
+		return -ENOMEM;
+	}
+
+	p = names_out + (n + num_extra_names) * sizeof(char *);
+	memcpy(p, names_in, names_len);
+
+	q = (char **)names_out;
+	for (i = 0; i < n; i++) {
+		q[i] = p;
+		p = strchr(p, '\n');
+		*p++ = '\0';
+	}
+
+	*num_cntrs = n;
+	*cntr_names = (const char **)names_out;
+	return 0;
+}
+
+static struct rdma_hw_stats *alloc_hw_stats(struct ib_device *ibdev,
+					    u8 port_num)
+{
+	int i, err;
+
+	if (!cntr_names_initialized) {
+		struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+
+		err = init_cntr_names(dd->cntrnames,
+				      dd->cntrnameslen,
+				      num_driver_cntrs,
+				      &num_dev_cntrs,
+				      &dev_cntr_names);
+		if (err)
+			return NULL;
+
+		for (i = 0; i < num_driver_cntrs; i++)
+			dev_cntr_names[num_dev_cntrs + i] =
+				driver_cntr_names[i];
+
+		err = init_cntr_names(dd->portcntrnames,
+				      dd->portcntrnameslen,
+				      0,
+				      &num_port_cntrs,
+				      &port_cntr_names);
+		if (err) {
+			kfree(dev_cntr_names);
+			dev_cntr_names = NULL;
+			return NULL;
+		}
+		cntr_names_initialized = 1;
+	}
+
+	if (!port_num)
+		return rdma_alloc_hw_stats_struct(
+				dev_cntr_names,
+				num_dev_cntrs + num_driver_cntrs,
+				RDMA_HW_STATS_DEFAULT_LIFESPAN);
+	else
+		return rdma_alloc_hw_stats_struct(
+				port_cntr_names,
+				num_port_cntrs,
+				RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static u64 hfi1_sps_ints(void)
+{
+	unsigned long flags;
+	struct hfi1_devdata *dd;
+	u64 sps_ints = 0;
+
+	spin_lock_irqsave(&hfi1_devs_lock, flags);
+	list_for_each_entry(dd, &hfi1_dev_list, list) {
+		sps_ints += get_all_cpu_total(dd->int_counter);
+	}
+	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	return sps_ints;
+}
+
+static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+			u8 port, int index)
+{
+	u64 *values;
+	int count;
+
+	if (!port) {
+		u64 *stats = (u64 *)&hfi1_stats;
+		int i;
+
+		hfi1_read_cntrs(dd_from_ibdev(ibdev), NULL, &values);
+		values[num_dev_cntrs] = hfi1_sps_ints();
+		for (i = 1; i < num_driver_cntrs; i++)
+			values[num_dev_cntrs + i] = stats[i];
+		count = num_dev_cntrs + num_driver_cntrs;
+	} else {
+		struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+		hfi1_read_portcntrs(ppd_from_ibp(ibp), NULL, &values);
+		count = num_port_cntrs;
+	}
+
+	memcpy(stats->value, values, count * sizeof(u64));
+	return count;
+}
+
 /**
  * hfi1_register_ib_device - register our device with the infiniband core
  * @dd: the device data structure
@@ -1620,6 +1749,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	struct hfi1_ibdev *dev = &dd->verbs_dev;
 	struct ib_device *ibdev = &dev->rdi.ibdev;
 	struct hfi1_pportdata *ppd = dd->pport;
+	struct hfi1_ibport *ibp = &ppd->ibport_data;
 	unsigned i;
 	int ret;
 	size_t lcpysz = IB_DEVICE_NAME_MAX;
@@ -1632,6 +1762,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev);
 
 	seqlock_init(&dev->iowait_lock);
+	seqlock_init(&dev->txwait_lock);
 	INIT_LIST_HEAD(&dev->txwait);
 	INIT_LIST_HEAD(&dev->memwait);
 
@@ -1639,20 +1770,24 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	if (ret)
 		goto err_verbs_txreq;
 
+	/* Use first-port GUID as node guid */
+	ibdev->node_guid = get_sguid(ibp, HFI1_PORT_GUID_INDEX);
+
 	/*
 	 * The system image GUID is supposed to be the same for all
 	 * HFIs in a single system but since there can be other
 	 * device types in the system, we can't be sure this is unique.
 	 */
 	if (!ib_hfi1_sys_image_guid)
-		ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
+		ib_hfi1_sys_image_guid = ibdev->node_guid;
 	lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
 	strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
 	ibdev->owner = THIS_MODULE;
-	ibdev->node_guid = cpu_to_be64(ppd->guid);
 	ibdev->phys_port_cnt = dd->num_pports;
 	ibdev->dma_device = &dd->pcidev->dev;
 	ibdev->modify_device = modify_device;
+	ibdev->alloc_hw_stats = alloc_hw_stats;
+	ibdev->get_hw_stats = get_hw_stats;
 
 	/* keep process mad in the driver */
 	ibdev->process_mad = hfi1_process_mad;
@@ -1767,6 +1902,10 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
 
 	del_timer_sync(&dev->mem_timer);
 	verbs_txreq_exit(dev);
+
+	kfree(dev_cntr_names);
+	kfree(port_cntr_names);
+	cntr_names_initialized = 0;
 }
 
 void hfi1_cnp_rcv(struct hfi1_packet *packet)
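
init_cntr_names() makes a single allocation that holds the pointer array followed by a private copy of the name string, so one kfree() (as done in hfi1_unregister_ib_device()) releases both. A minimal usage sketch with a made-up name list:

        static const char example[] = "ctr_a\nctr_b\nctr_c\n";  /* illustrative */
        const char **names;
        int n;

        if (!init_cntr_names(example, strlen(example), 0, &n, &names)) {
                /* n == 3; names[0] == "ctr_a", names[1] == "ctr_b", ... */
                kfree(names);
        }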

+ 7 - 9
drivers/infiniband/hw/hfi1/verbs.h

@@ -73,7 +73,6 @@ struct hfi1_packet;
 #include "iowait.h"
 
 #define HFI1_MAX_RDMA_ATOMIC     16
-#define HFI1_GUIDS_PER_PORT	5
 
 /*
  * Increment this value if any changes that break userspace ABI
@@ -169,8 +168,6 @@ struct hfi1_ibport {
 	struct rvt_qp __rcu *qp[2];
 	struct rvt_ibport rvp;
 
-	__be64 guids[HFI1_GUIDS_PER_PORT	- 1];	/* writable GUIDs */
-
 	/* the first 16 entries are sl_to_vl for !OPA */
 	u8 sl_to_sc[32];
 	u8 sc_to_sl[32];
@@ -180,18 +177,19 @@ struct hfi1_ibdev {
 	struct rvt_dev_info rdi; /* Must be first */
 
 	/* QP numbers are shared by all IB ports */
-	/* protect wait lists */
-	seqlock_t iowait_lock;
+	/* protect txwait list */
+	seqlock_t txwait_lock ____cacheline_aligned_in_smp;
 	struct list_head txwait;        /* list for wait verbs_txreq */
 	struct list_head memwait;       /* list for wait kernel memory */
-	struct list_head txreq_free;
 	struct kmem_cache *verbs_txreq_cache;
-	struct timer_list mem_timer;
+	u64 n_txwait;
+	u64 n_kmem_wait;
 
+	/* protect iowait lists */
+	seqlock_t iowait_lock ____cacheline_aligned_in_smp;
 	u64 n_piowait;
 	u64 n_piodrain;
-	u64 n_txwait;
-	u64 n_kmem_wait;
+	struct timer_list mem_timer;
 
 #ifdef CONFIG_DEBUG_FS
 	/* per HFI debugfs */

+ 7 - 6
drivers/infiniband/hw/hfi1/verbs_txreq.c

@@ -72,22 +72,22 @@ void hfi1_put_txreq(struct verbs_txreq *tx)
 	kmem_cache_free(dev->verbs_txreq_cache, tx);
 
 	do {
-		seq = read_seqbegin(&dev->iowait_lock);
+		seq = read_seqbegin(&dev->txwait_lock);
 		if (!list_empty(&dev->txwait)) {
 			struct iowait *wait;
 
-			write_seqlock_irqsave(&dev->iowait_lock, flags);
+			write_seqlock_irqsave(&dev->txwait_lock, flags);
 			wait = list_first_entry(&dev->txwait, struct iowait,
 						list);
 			qp = iowait_to_qp(wait);
 			priv = qp->priv;
 			list_del_init(&priv->s_iowait.list);
 			/* refcount held until actual wake up */
-			write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+			write_sequnlock_irqrestore(&dev->txwait_lock, flags);
 			hfi1_qp_wakeup(qp, RVT_S_WAIT_TX);
 			break;
 		}
-	} while (read_seqretry(&dev->iowait_lock, seq));
+	} while (read_seqretry(&dev->txwait_lock, seq));
 }
 
 struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
@@ -96,7 +96,7 @@ struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
 {
 	struct verbs_txreq *tx = ERR_PTR(-EBUSY);
 
-	write_seqlock(&dev->iowait_lock);
+	write_seqlock(&dev->txwait_lock);
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 		struct hfi1_qp_priv *priv;
 
@@ -108,13 +108,14 @@ struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
 			dev->n_txwait++;
 			qp->s_flags |= RVT_S_WAIT_TX;
 			list_add_tail(&priv->s_iowait.list, &dev->txwait);
+			priv->s_iowait.lock = &dev->txwait_lock;
 			trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX);
 			rvt_get_qp(qp);
 		}
 		qp->s_flags &= ~RVT_S_BUSY;
 	}
 out:
-	write_sequnlock(&dev->iowait_lock);
+	write_sequnlock(&dev->txwait_lock);
 	return tx;
 }
 

+ 1 - 0
drivers/infiniband/hw/hfi1/verbs_txreq.h

@@ -65,6 +65,7 @@ struct verbs_txreq {
 	struct sdma_engine     *sde;
 	struct send_context     *psc;
 	u16                     hdr_dwords;
+	u16			s_cur_size;
 };
 
 struct hfi1_ibdev;

+ 1 - 2
drivers/infiniband/hw/qib/qib_driver.c

@@ -420,8 +420,7 @@ static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd,
 						if (list_empty(&qp->rspwait)) {
 							qp->r_flags |=
 								RVT_R_RSP_NAK;
-							atomic_inc(
-								&qp->refcount);
+							rvt_get_qp(qp);
 							list_add_tail(
 							 &qp->rspwait,
 							 &rcd->qp_wait_list);

+ 5 - 39
drivers/infiniband/hw/qib/qib_rc.c

@@ -941,8 +941,6 @@ void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
 {
 	struct ib_other_headers *ohdr;
 	struct rvt_swqe *wqe;
-	struct ib_wc wc;
-	unsigned i;
 	u32 opcode;
 	u32 psn;
 
@@ -988,22 +986,8 @@ void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
 		qp->s_last = s_last;
 		/* see post_send() */
 		barrier();
-		for (i = 0; i < wqe->wr.num_sge; i++) {
-			struct rvt_sge *sge = &wqe->sg_list[i];
-
-			rvt_put_mr(sge->mr);
-		}
-		/* Post a send completion queue entry if requested. */
-		if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-			memset(&wc, 0, sizeof(wc));
-			wc.wr_id = wqe->wr.wr_id;
-			wc.status = IB_WC_SUCCESS;
-			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
-			wc.byte_len = wqe->length;
-			wc.qp = &qp->ibqp;
-			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
-		}
+		rvt_put_swqe(wqe);
+		rvt_qp_swqe_complete(qp, wqe, IB_WC_SUCCESS);
 	}
 	/*
 	 * If we were waiting for sends to complete before resending,
@@ -1032,9 +1016,6 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 					 struct rvt_swqe *wqe,
 					 struct qib_ibport *ibp)
 {
-	struct ib_wc wc;
-	unsigned i;
-
 	/*
 	 * Don't decrement refcount and don't generate a
 	 * completion if the SWQE is being resent until the send
@@ -1044,28 +1025,14 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
 	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
 		u32 s_last;
 
-		for (i = 0; i < wqe->wr.num_sge; i++) {
-			struct rvt_sge *sge = &wqe->sg_list[i];
-
-			rvt_put_mr(sge->mr);
-		}
+		rvt_put_swqe(wqe);
 		s_last = qp->s_last;
 		if (++s_last >= qp->s_size)
 			s_last = 0;
 		qp->s_last = s_last;
 		/* see post_send() */
 		barrier();
-		/* Post a send completion queue entry if requested. */
-		if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-			memset(&wc, 0, sizeof(wc));
-			wc.wr_id = wqe->wr.wr_id;
-			wc.status = IB_WC_SUCCESS;
-			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
-			wc.byte_len = wqe->length;
-			wc.qp = &qp->ibqp;
-			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
-		}
+		rvt_qp_swqe_complete(qp, wqe, IB_WC_SUCCESS);
 	} else
 		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
 
@@ -2112,8 +2079,7 @@ send_last:
 			 * Update the next expected PSN.  We add 1 later
 			 * below, so only add the remainder here.
 			 */
-			if (len > pmtu)
-				qp->r_psn += (len - 1) / pmtu;
+			qp->r_psn += rvt_div_mtu(qp, len - 1);
 		} else {
 			e->rdma_sge.mr = NULL;
 			e->rdma_sge.vaddr = NULL;

+ 2 - 22
drivers/infiniband/hw/qib/qib_ruc.c

@@ -793,7 +793,6 @@ void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
 		       enum ib_wc_status status)
 {
 	u32 old_last, last;
-	unsigned i;
 
 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
 		return;
@@ -805,32 +804,13 @@ void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
 	qp->s_last = last;
 	/* See post_send() */
 	barrier();
-	for (i = 0; i < wqe->wr.num_sge; i++) {
-		struct rvt_sge *sge = &wqe->sg_list[i];
-
-		rvt_put_mr(sge->mr);
-	}
+	rvt_put_swqe(wqe);
 	if (qp->ibqp.qp_type == IB_QPT_UD ||
 	    qp->ibqp.qp_type == IB_QPT_SMI ||
 	    qp->ibqp.qp_type == IB_QPT_GSI)
 		atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
 
-	/* See ch. 11.2.4.1 and 10.7.3.1 */
-	if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
-	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-	    status != IB_WC_SUCCESS) {
-		struct ib_wc wc;
-
-		memset(&wc, 0, sizeof(wc));
-		wc.wr_id = wqe->wr.wr_id;
-		wc.status = status;
-		wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
-		wc.qp = &qp->ibqp;
-		if (status == IB_WC_SUCCESS)
-			wc.byte_len = wqe->length;
-		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
-			     status != IB_WC_SUCCESS);
-	}
+	rvt_qp_swqe_complete(qp, wqe, status);
 
 	if (qp->s_acked == old_last)
 		qp->s_acked = last;

+ 8 - 25
drivers/infiniband/hw/qib/qib_verbs.c

@@ -113,19 +113,6 @@ static unsigned int ib_qib_disable_sma;
 module_param_named(disable_sma, ib_qib_disable_sma, uint, S_IWUSR | S_IRUGO);
 MODULE_PARM_DESC(disable_sma, "Disable the SMA");
 
-/*
- * Translate ib_wr_opcode into ib_wc_opcode.
- */
-const enum ib_wc_opcode ib_qib_wc_opcode[] = {
-	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
-	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
-	[IB_WR_SEND] = IB_WC_SEND,
-	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
-	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
-	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
-};
-
 /*
  * System image GUID.
  */
@@ -464,7 +451,7 @@ static void mem_timer(unsigned long data)
 		priv = list_entry(list->next, struct qib_qp_priv, iowait);
 		qp = priv->owner;
 		list_del_init(&priv->iowait);
-		atomic_inc(&qp->refcount);
+		rvt_get_qp(qp);
 		if (!list_empty(list))
 			mod_timer(&dev->mem_timer, jiffies + 1);
 	}
@@ -477,8 +464,7 @@ static void mem_timer(unsigned long data)
 			qib_schedule_send(qp);
 		}
 		spin_unlock_irqrestore(&qp->s_lock, flags);
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
+		rvt_put_qp(qp);
 	}
 }
 
@@ -762,7 +748,7 @@ void qib_put_txreq(struct qib_verbs_txreq *tx)
 				  iowait);
 		qp = priv->owner;
 		list_del_init(&priv->iowait);
-		atomic_inc(&qp->refcount);
+		rvt_get_qp(qp);
 		spin_unlock_irqrestore(&dev->rdi.pending_lock, flags);
 
 		spin_lock_irqsave(&qp->s_lock, flags);
@@ -772,8 +758,7 @@ void qib_put_txreq(struct qib_verbs_txreq *tx)
 		}
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
+		rvt_put_qp(qp);
 	} else
 		spin_unlock_irqrestore(&dev->rdi.pending_lock, flags);
 }
@@ -808,7 +793,7 @@ void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail)
 			break;
 		avail -= qpp->s_tx->txreq.sg_count;
 		list_del_init(&qpp->iowait);
-		atomic_inc(&qp->refcount);
+		rvt_get_qp(qp);
 		qps[n++] = qp;
 	}
 
@@ -822,8 +807,7 @@ void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail)
 			qib_schedule_send(qp);
 		}
 		spin_unlock(&qp->s_lock);
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
+		rvt_put_qp(qp);
 	}
 }
 
@@ -1288,7 +1272,7 @@ void qib_ib_piobufavail(struct qib_devdata *dd)
 		priv = list_entry(list->next, struct qib_qp_priv, iowait);
 		qp = priv->owner;
 		list_del_init(&priv->iowait);
-		atomic_inc(&qp->refcount);
+		rvt_get_qp(qp);
 		qps[n++] = qp;
 	}
 	dd->f_wantpiobuf_intr(dd, 0);
@@ -1306,8 +1290,7 @@ full:
 		spin_unlock_irqrestore(&qp->s_lock, flags);
 
 		/* Notify qib_destroy_qp() if it is waiting. */
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
+		rvt_put_qp(qp);
 	}
 }
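
rvt_get_qp()/rvt_put_qp() come from rdmavt (include/rdma/rdmavt_qp.h in this series); judging by the open-coded sequences they replace throughout this file, they presumably amount to:

static inline void rvt_get_qp(struct rvt_qp *qp)
{
        atomic_inc(&qp->refcount);
}

static inline void rvt_put_qp(struct rvt_qp *qp)
{
        /* wake the destroy path waiting for the last reference to drop */
        if (atomic_dec_and_test(&qp->refcount))
                wake_up(&qp->wait);
}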
 

+ 27 - 37
drivers/infiniband/sw/rdmavt/cq.c

@@ -119,18 +119,17 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
 	if (cq->notify == IB_CQ_NEXT_COMP ||
 	    (cq->notify == IB_CQ_SOLICITED &&
 	     (solicited || entry->status != IB_WC_SUCCESS))) {
-		struct kthread_worker *worker;
 		/*
 		 * This will cause send_complete() to be called in
 		 * another thread.
 		 */
-		smp_read_barrier_depends(); /* see rvt_cq_exit */
-		worker = cq->rdi->worker;
-		if (likely(worker)) {
+		spin_lock(&cq->rdi->n_cqs_lock);
+		if (likely(cq->rdi->worker)) {
 			cq->notify = RVT_CQ_NONE;
 			cq->triggered++;
-			kthread_queue_work(worker, &cq->comptask);
+			kthread_queue_work(cq->rdi->worker, &cq->comptask);
 		}
+		spin_unlock(&cq->rdi->n_cqs_lock);
 	}
 
 	spin_unlock_irqrestore(&cq->lock, flags);
@@ -240,15 +239,15 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
 		}
 	}
 
-	spin_lock(&rdi->n_cqs_lock);
+	spin_lock_irq(&rdi->n_cqs_lock);
 	if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) {
-		spin_unlock(&rdi->n_cqs_lock);
+		spin_unlock_irq(&rdi->n_cqs_lock);
 		ret = ERR_PTR(-ENOMEM);
 		goto bail_ip;
 	}
 
 	rdi->n_cqs_allocated++;
-	spin_unlock(&rdi->n_cqs_lock);
+	spin_unlock_irq(&rdi->n_cqs_lock);
 
 	if (cq->ip) {
 		spin_lock_irq(&rdi->pending_lock);
@@ -296,9 +295,9 @@ int rvt_destroy_cq(struct ib_cq *ibcq)
 	struct rvt_dev_info *rdi = cq->rdi;
 
 	kthread_flush_work(&cq->comptask);
-	spin_lock(&rdi->n_cqs_lock);
+	spin_lock_irq(&rdi->n_cqs_lock);
 	rdi->n_cqs_allocated--;
-	spin_unlock(&rdi->n_cqs_lock);
+	spin_unlock_irq(&rdi->n_cqs_lock);
 	if (cq->ip)
 		kref_put(&cq->ip->ref, rvt_release_mmap_info);
 	else
@@ -504,33 +503,23 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
  */
 int rvt_driver_cq_init(struct rvt_dev_info *rdi)
 {
-	int ret = 0;
 	int cpu;
-	struct task_struct *task;
+	struct kthread_worker *worker;
 
 	if (rdi->worker)
 		return 0;
+
 	spin_lock_init(&rdi->n_cqs_lock);
-	rdi->worker = kzalloc(sizeof(*rdi->worker), GFP_KERNEL);
-	if (!rdi->worker)
-		return -ENOMEM;
-	kthread_init_worker(rdi->worker);
-	task = kthread_create_on_node(
-		kthread_worker_fn,
-		rdi->worker,
-		rdi->dparms.node,
-		"%s", rdi->dparms.cq_name);
-	if (IS_ERR(task)) {
-		kfree(rdi->worker);
-		rdi->worker = NULL;
-		return PTR_ERR(task);
-	}
 
-	set_user_nice(task, MIN_NICE);
 	cpu = cpumask_first(cpumask_of_node(rdi->dparms.node));
-	kthread_bind(task, cpu);
-	wake_up_process(task);
-	return ret;
+	worker = kthread_create_worker_on_cpu(cpu, 0,
+					      "%s", rdi->dparms.cq_name);
+	if (IS_ERR(worker))
+		return PTR_ERR(worker);
+
+	set_user_nice(worker->task, MIN_NICE);
+	rdi->worker = worker;
+	return 0;
 }
 
 /**
@@ -541,13 +530,14 @@ void rvt_cq_exit(struct rvt_dev_info *rdi)
 {
 	struct kthread_worker *worker;
 
-	worker = rdi->worker;
-	if (!worker)
+	/* block future queuing from send_complete() */
+	spin_lock_irq(&rdi->n_cqs_lock);
+	if (!rdi->worker) {
+		spin_unlock_irq(&rdi->n_cqs_lock);
 		return;
-	/* blocks future queuing from send_complete() */
+	}
 	rdi->worker = NULL;
-	smp_wmb(); /* See rdi_cq_enter */
-	kthread_flush_worker(worker);
-	kthread_stop(worker->task);
-	kfree(worker);
+	spin_unlock_irq(&rdi->n_cqs_lock);
+
+	kthread_destroy_worker(worker);
 }

+ 2 - 3
drivers/infiniband/sw/rdmavt/mcast.c

@@ -81,7 +81,7 @@ static struct rvt_mcast_qp *rvt_mcast_qp_alloc(struct rvt_qp *qp)
 		goto bail;
 
 	mqp->qp = qp;
-	atomic_inc(&qp->refcount);
+	rvt_get_qp(qp);
 
 bail:
 	return mqp;
@@ -92,8 +92,7 @@ static void rvt_mcast_qp_free(struct rvt_mcast_qp *mqp)
 	struct rvt_qp *qp = mqp->qp;
 
 	/* Notify hfi1_destroy_qp() if it is waiting. */
-	if (atomic_dec_and_test(&qp->refcount))
-		wake_up(&qp->wait);
+	rvt_put_qp(qp);
 
 	kfree(mqp);
 }

+ 13 - 9
drivers/infiniband/sw/rdmavt/mr.c

@@ -51,6 +51,7 @@
 #include <rdma/rdma_vt.h>
 #include "vt.h"
 #include "mr.h"
+#include "trace.h"
 
 /**
  * rvt_driver_mr_init - Init MR resources per driver
@@ -84,6 +85,7 @@ int rvt_driver_mr_init(struct rvt_dev_info *rdi)
 		lkey_table_size = rdi->dparms.lkey_table_size;
 	}
 	rdi->lkey_table.max = 1 << lkey_table_size;
+	rdi->lkey_table.shift = 32 - lkey_table_size;
 	lk_tab_size = rdi->lkey_table.max * sizeof(*rdi->lkey_table.table);
 	rdi->lkey_table.table = (struct rvt_mregion __rcu **)
 			       vmalloc_node(lk_tab_size, rdi->dparms.node);
@@ -402,6 +404,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		}
 		mr->mr.map[m]->segs[n].vaddr = vaddr;
 		mr->mr.map[m]->segs[n].length = umem->page_size;
+		trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr, umem->page_size);
 		n++;
 		if (n == RVT_SEGSZ) {
 			m++;
@@ -506,6 +509,7 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
 	n = mapped_segs % RVT_SEGSZ;
 	mr->mr.map[m]->segs[n].vaddr = (void *)addr;
 	mr->mr.map[m]->segs[n].length = ps;
+	trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
 	mr->mr.length += ps;
 
 	return 0;
@@ -692,6 +696,7 @@ int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
 	for (i = 0; i < list_len; i++) {
 		fmr->mr.map[m]->segs[n].vaddr = (void *)page_list[i];
 		fmr->mr.map[m]->segs[n].length = ps;
+		trace_rvt_mr_fmr_seg(&fmr->mr, m, n, (void *)page_list[i], ps);
 		if (++n == RVT_SEGSZ) {
 			m++;
 			n = 0;
@@ -774,7 +779,6 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
 	struct rvt_mregion *mr;
 	unsigned n, m;
 	size_t off;
-	struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device);
 
 	/*
 	 * We use LKEY == zero for kernel virtual addresses
@@ -782,12 +786,14 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
 	 */
 	rcu_read_lock();
 	if (sge->lkey == 0) {
+		struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device);
+
 		if (pd->user)
 			goto bail;
 		mr = rcu_dereference(dev->dma_mr);
 		if (!mr)
 			goto bail;
-		atomic_inc(&mr->refcount);
+		rvt_get_mr(mr);
 		rcu_read_unlock();
 
 		isge->mr = mr;
@@ -798,8 +804,7 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
 		isge->n = 0;
 		goto ok;
 	}
-	mr = rcu_dereference(
-		rkt->table[(sge->lkey >> (32 - dev->dparms.lkey_table_size))]);
+	mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]);
 	if (unlikely(!mr || atomic_read(&mr->lkey_invalid) ||
 		     mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
 		goto bail;
@@ -809,7 +814,7 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
 		     off + sge->length > mr->length ||
 		     (mr->access_flags & acc) != acc))
 		goto bail;
-	atomic_inc(&mr->refcount);
+	rvt_get_mr(mr);
 	rcu_read_unlock();
 
 	off += mr->offset;
@@ -887,7 +892,7 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
 		mr = rcu_dereference(rdi->dma_mr);
 		if (!mr)
 			goto bail;
-		atomic_inc(&mr->refcount);
+		rvt_get_mr(mr);
 		rcu_read_unlock();
 
 		sge->mr = mr;
@@ -899,8 +904,7 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
 		goto ok;
 	}
 
-	mr = rcu_dereference(
-		rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
+	mr = rcu_dereference(rkt->table[rkey >> rkt->shift]);
 	if (unlikely(!mr || atomic_read(&mr->lkey_invalid) ||
 		     mr->lkey != rkey || qp->ibqp.pd != mr->pd))
 		goto bail;
@@ -909,7 +913,7 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
 	if (unlikely(vaddr < mr->iova || off + len > mr->length ||
 		     (mr->access_flags & acc) == 0))
 		goto bail;
-	atomic_inc(&mr->refcount);
+	rvt_get_mr(mr);
 	rcu_read_unlock();
 
 	off += mr->offset;
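
The new lkey_table.shift member caches 32 - lkey_table_size so the hot lookup path no longer recomputes it on every access. Worked example with illustrative values:

        /* lkey_table_size == 16  =>  rkt->max == 1 << 16, rkt->shift == 16 */
        u32 lkey = 0x12345678;                 /* example key */
        u32 idx  = lkey >> rkt->shift;         /* 0x1234, same bucket as
                                                * lkey >> (32 - lkey_table_size)
                                                */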

+ 19 - 1
drivers/infiniband/sw/rdmavt/qp.c

@@ -76,6 +76,23 @@ const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
 };
 EXPORT_SYMBOL(ib_rvt_state_ops);
 
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_rvt_wc_opcode[] = {
+	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+	[IB_WR_SEND] = IB_WC_SEND,
+	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
+	[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
+	[IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
+	[IB_WR_REG_MR] = IB_WC_REG_MR
+};
+EXPORT_SYMBOL(ib_rvt_wc_opcode);
+
 static void get_map_page(struct rvt_qpn_table *qpt,
 			 struct rvt_qpn_map *map,
 			 gfp_t gfp)
@@ -884,7 +901,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 	return ret;
 
 bail_ip:
-	kref_put(&qp->ip->ref, rvt_release_mmap_info);
+	if (qp->ip)
+		kref_put(&qp->ip->ref, rvt_release_mmap_info);
 
 bail_qpn:
 	free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);

+ 4 - 137
drivers/infiniband/sw/rdmavt/trace.h

@@ -45,143 +45,10 @@
  *
  */
 
-#undef TRACE_SYSTEM_VAR
-#define TRACE_SYSTEM_VAR rdmavt
-
-#if !defined(__RDMAVT_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define __RDMAVT_TRACE_H
-
-#include <linux/tracepoint.h>
-#include <linux/trace_seq.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/rdma_vt.h>
-
 #define RDI_DEV_ENTRY(rdi)   __string(dev, rdi->driver_f.get_card_name(rdi))
 #define RDI_DEV_ASSIGN(rdi)  __assign_str(dev, rdi->driver_f.get_card_name(rdi))
 
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM rdmavt
-
-TRACE_EVENT(rvt_dbg,
-	TP_PROTO(struct rvt_dev_info *rdi,
-		 const char *msg),
-	TP_ARGS(rdi, msg),
-	TP_STRUCT__entry(
-		RDI_DEV_ENTRY(rdi)
-		__string(msg, msg)
-	),
-	TP_fast_assign(
-		RDI_DEV_ASSIGN(rdi);
-		__assign_str(msg, msg);
-	),
-	TP_printk("[%s]: %s", __get_str(dev), __get_str(msg))
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM rvt_qphash
-DECLARE_EVENT_CLASS(rvt_qphash_template,
-	TP_PROTO(struct rvt_qp *qp, u32 bucket),
-	TP_ARGS(qp, bucket),
-	TP_STRUCT__entry(
-		RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
-		__field(u32, qpn)
-		__field(u32, bucket)
-	),
-	TP_fast_assign(
-		RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device))
-		__entry->qpn = qp->ibqp.qp_num;
-		__entry->bucket = bucket;
-	),
-	TP_printk(
-		"[%s] qpn 0x%x bucket %u",
-		__get_str(dev),
-		__entry->qpn,
-		__entry->bucket
-	)
-);
-
-DEFINE_EVENT(rvt_qphash_template, rvt_qpinsert,
-	TP_PROTO(struct rvt_qp *qp, u32 bucket),
-	TP_ARGS(qp, bucket));
-
-DEFINE_EVENT(rvt_qphash_template, rvt_qpremove,
-	TP_PROTO(struct rvt_qp *qp, u32 bucket),
-	TP_ARGS(qp, bucket));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM rvt_tx
-
-#define wr_opcode_name(opcode) { IB_WR_##opcode, #opcode  }
-#define show_wr_opcode(opcode)                             \
-__print_symbolic(opcode,                                   \
-	wr_opcode_name(RDMA_WRITE),                        \
-	wr_opcode_name(RDMA_WRITE_WITH_IMM),               \
-	wr_opcode_name(SEND),                              \
-	wr_opcode_name(SEND_WITH_IMM),                     \
-	wr_opcode_name(RDMA_READ),                         \
-	wr_opcode_name(ATOMIC_CMP_AND_SWP),                \
-	wr_opcode_name(ATOMIC_FETCH_AND_ADD),              \
-	wr_opcode_name(LSO),                               \
-	wr_opcode_name(SEND_WITH_INV),                     \
-	wr_opcode_name(RDMA_READ_WITH_INV),                \
-	wr_opcode_name(LOCAL_INV),                         \
-	wr_opcode_name(MASKED_ATOMIC_CMP_AND_SWP),         \
-	wr_opcode_name(MASKED_ATOMIC_FETCH_AND_ADD))
-
-#define POS_PRN \
-"[%s] wr_id %llx qpn %x psn 0x%x lpsn 0x%x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u"
-
-TRACE_EVENT(
-	rvt_post_one_wr,
-	TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe),
-	TP_ARGS(qp, wqe),
-	TP_STRUCT__entry(
-		RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
-		__field(u64, wr_id)
-		__field(u32, qpn)
-		__field(u32, psn)
-		__field(u32, lpsn)
-		__field(u32, length)
-		__field(u32, opcode)
-		__field(u32, size)
-		__field(u32, avail)
-		__field(u32, head)
-		__field(u32, last)
-	),
-	TP_fast_assign(
-		RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device))
-		__entry->wr_id = wqe->wr.wr_id;
-		__entry->qpn = qp->ibqp.qp_num;
-		__entry->psn = wqe->psn;
-		__entry->lpsn = wqe->lpsn;
-		__entry->length = wqe->length;
-		__entry->opcode = wqe->wr.opcode;
-		__entry->size = qp->s_size;
-		__entry->avail = qp->s_avail;
-		__entry->head = qp->s_head;
-		__entry->last = qp->s_last;
-	),
-	TP_printk(
-		POS_PRN,
-		__get_str(dev),
-		__entry->wr_id,
-		__entry->qpn,
-		__entry->psn,
-		__entry->lpsn,
-		__entry->length,
-		__entry->opcode, show_wr_opcode(__entry->opcode),
-		__entry->size,
-		__entry->avail,
-		__entry->head,
-		__entry->last
-	)
-);
-
-#endif /* __RDMAVT_TRACE_H */
-
-#undef TRACE_INCLUDE_PATH
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE trace
-#include <trace/define_trace.h>
+#include "trace_rvt.h"
+#include "trace_qp.h"
+#include "trace_tx.h"
+#include "trace_mr.h"

+ 112 - 0
drivers/infiniband/sw/rdmavt/trace_mr.h

@@ -0,0 +1,112 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__RVT_TRACE_MR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_MR_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_mr.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt_mr
+DECLARE_EVENT_CLASS(
+	rvt_mr_template,
+	TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len),
+	TP_ARGS(mr, m, n, v, len),
+	TP_STRUCT__entry(
+		RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device))
+		__field(void *, vaddr)
+		__field(struct page *, page)
+		__field(size_t, len)
+		__field(u32, lkey)
+		__field(u16, m)
+		__field(u16, n)
+	),
+	TP_fast_assign(
+		RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device));
+		__entry->vaddr = v;
+		__entry->page = virt_to_page(v);
+		__entry->m = m;
+		__entry->n = n;
+		__entry->len = len;
+	),
+	TP_printk(
+		"[%s] vaddr %p page %p m %u n %u len %ld",
+		__get_str(dev),
+		__entry->vaddr,
+		__entry->page,
+		__entry->m,
+		__entry->n,
+		__entry->len
+	)
+);
+
+DEFINE_EVENT(
+	rvt_mr_template, rvt_mr_page_seg,
+	TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len),
+	TP_ARGS(mr, m, n, v, len));
+
+DEFINE_EVENT(
+	rvt_mr_template, rvt_mr_fmr_seg,
+	TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len),
+	TP_ARGS(mr, m, n, v, len));
+
+DEFINE_EVENT(
+	rvt_mr_template, rvt_mr_user_seg,
+	TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len),
+	TP_ARGS(mr, m, n, v, len));
+
+#endif /* __RVT_TRACE_MR_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_mr
+#include <trace/define_trace.h>
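
Each DEFINE_EVENT() above expands into a callable hook: trace_rvt_mr_page_seg(), trace_rvt_mr_fmr_seg() and trace_rvt_mr_user_seg(). A minimal sketch of a caller walking an MR's segment map follows; the walk itself is hypothetical illustration, not code from this patch.

/* Hypothetical illustration only: how a generated MR-segment hook might be called. */
#include <rdma/rdmavt_mr.h>
#include "trace.h"

static void demo_trace_mr_segments(struct rvt_mregion *mr)
{
	size_t off = 0;
	u16 m = 0, n = 0;

	while (off < mr->length) {
		void *vaddr = mr->map[m]->segs[n].vaddr;
		size_t len = mr->map[m]->segs[n].length;

		trace_rvt_mr_page_seg(mr, m, n, vaddr, len);
		off += len;
		if (++n == RVT_SEGSZ) {	/* advance to the next map chunk */
			n = 0;
			m++;
		}
	}
}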

+ 96 - 0
drivers/infiniband/sw/rdmavt/trace_qp.h

@@ -0,0 +1,96 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__RVT_TRACE_QP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_QP_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_vt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt_qp
+
+DECLARE_EVENT_CLASS(rvt_qphash_template,
+	TP_PROTO(struct rvt_qp *qp, u32 bucket),
+	TP_ARGS(qp, bucket),
+	TP_STRUCT__entry(
+		RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
+		__field(u32, qpn)
+		__field(u32, bucket)
+	),
+	TP_fast_assign(
+		RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device))
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->bucket = bucket;
+	),
+	TP_printk(
+		"[%s] qpn 0x%x bucket %u",
+		__get_str(dev),
+		__entry->qpn,
+		__entry->bucket
+	)
+);
+
+DEFINE_EVENT(rvt_qphash_template, rvt_qpinsert,
+	TP_PROTO(struct rvt_qp *qp, u32 bucket),
+	TP_ARGS(qp, bucket));
+
+DEFINE_EVENT(rvt_qphash_template, rvt_qpremove,
+	TP_PROTO(struct rvt_qp *qp, u32 bucket),
+	TP_ARGS(qp, bucket));
+
+
+#endif /* __RVT_TRACE_QP_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_qp
+#include <trace/define_trace.h>
+
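
The two events expand into trace_rvt_qpinsert() and trace_rvt_qpremove(). A rough sketch of the insert side is below; the bucket computation shown is an assumption about the hashing, not the exact rdmavt code.

/* Sketch only; the hash_32()-based bucket computation is an assumption. */
#include <linux/hash.h>
#include "trace.h"

static void demo_qp_table_insert(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
	u32 bucket = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);

	trace_rvt_qpinsert(qp, bucket);
	/* RCU hash-table insertion into rdi->qp_dev->qp_table[bucket] follows */
}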

+ 81 - 0
drivers/infiniband/sw/rdmavt/trace_rvt.h

@@ -0,0 +1,81 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__RVT_TRACE_RVT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_RVT_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_vt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt
+
+TRACE_EVENT(rvt_dbg,
+	TP_PROTO(struct rvt_dev_info *rdi,
+		 const char *msg),
+	TP_ARGS(rdi, msg),
+	TP_STRUCT__entry(
+		RDI_DEV_ENTRY(rdi)
+		__string(msg, msg)
+	),
+	TP_fast_assign(
+		RDI_DEV_ASSIGN(rdi);
+		__assign_str(msg, msg);
+	),
+	TP_printk("[%s]: %s", __get_str(dev), __get_str(msg))
+);
+
+#endif /* __RVT_TRACE_RVT_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rvt
+#include <trace/define_trace.h>
+
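
trace_rvt_dbg() gives rdmavt a low-overhead way to attach a free-form debug string to a device's trace stream. A hedged example of a call site (illustrative only):

/* Illustrative only: emit a debug message against a device. */
static void demo_flag_bad_post(struct rvt_dev_info *rdi)
{
	trace_rvt_dbg(rdi, "post_send called on a QP in the RESET state");
}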

+ 132 - 0
drivers/infiniband/sw/rdmavt/trace_tx.h

@@ -0,0 +1,132 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__RVT_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_TX_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_vt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt_tx
+
+#define wr_opcode_name(opcode) { IB_WR_##opcode, #opcode  }
+#define show_wr_opcode(opcode)                             \
+__print_symbolic(opcode,                                   \
+	wr_opcode_name(RDMA_WRITE),                        \
+	wr_opcode_name(RDMA_WRITE_WITH_IMM),               \
+	wr_opcode_name(SEND),                              \
+	wr_opcode_name(SEND_WITH_IMM),                     \
+	wr_opcode_name(RDMA_READ),                         \
+	wr_opcode_name(ATOMIC_CMP_AND_SWP),                \
+	wr_opcode_name(ATOMIC_FETCH_AND_ADD),              \
+	wr_opcode_name(LSO),                               \
+	wr_opcode_name(SEND_WITH_INV),                     \
+	wr_opcode_name(RDMA_READ_WITH_INV),                \
+	wr_opcode_name(LOCAL_INV),                         \
+	wr_opcode_name(MASKED_ATOMIC_CMP_AND_SWP),         \
+	wr_opcode_name(MASKED_ATOMIC_FETCH_AND_ADD))
+
+#define POS_PRN \
+"[%s] wr_id %llx qpn %x psn 0x%x lpsn 0x%x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u"
+
+TRACE_EVENT(
+	rvt_post_one_wr,
+	TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe),
+	TP_ARGS(qp, wqe),
+	TP_STRUCT__entry(
+		RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
+		__field(u64, wr_id)
+		__field(u32, qpn)
+		__field(u32, psn)
+		__field(u32, lpsn)
+		__field(u32, length)
+		__field(u32, opcode)
+		__field(u32, size)
+		__field(u32, avail)
+		__field(u32, head)
+		__field(u32, last)
+	),
+	TP_fast_assign(
+		RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device))
+		__entry->wr_id = wqe->wr.wr_id;
+		__entry->qpn = qp->ibqp.qp_num;
+		__entry->psn = wqe->psn;
+		__entry->lpsn = wqe->lpsn;
+		__entry->length = wqe->length;
+		__entry->opcode = wqe->wr.opcode;
+		__entry->size = qp->s_size;
+		__entry->avail = qp->s_avail;
+		__entry->head = qp->s_head;
+		__entry->last = qp->s_last;
+	),
+	TP_printk(
+		POS_PRN,
+		__get_str(dev),
+		__entry->wr_id,
+		__entry->qpn,
+		__entry->psn,
+		__entry->lpsn,
+		__entry->length,
+		__entry->opcode, show_wr_opcode(__entry->opcode),
+		__entry->size,
+		__entry->avail,
+		__entry->head,
+		__entry->last
+	)
+);
+
+#endif /* __RVT_TRACE_TX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_tx
+#include <trace/define_trace.h>
+
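
trace_rvt_post_one_wr() is meant to fire once per posted send WQE, after the WQE has been filled in and before the head index advances. A compressed sketch of where it sits in a post-send path; the surrounding bookkeeping is abbreviated and approximate, not the actual rdmavt code.

/* Sketch: the tracepoint call is the point; everything else is abbreviated. */
static int demo_post_one_wr(struct rvt_qp *qp, struct ib_send_wr *wr)
{
	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_head);
	u32 next = qp->s_head + 1;

	if (next >= qp->s_size)
		next = 0;

	/* ... validate *wr, copy it into *wqe, reference the SGE MRs ... */

	trace_rvt_post_one_wr(qp, wqe);

	smp_wmb();		/* publish the wqe before advancing s_head */
	qp->s_avail--;
	qp->s_head = next;
	return 0;
}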

+ 24 - 22
include/rdma/rdma_vt.h

@@ -185,6 +185,27 @@ struct rvt_driver_provided {
 	 * check_support() for details.
 	 */
 
+	/* hot path calldowns in a single cacheline */
+
+	/*
+	 * Give the driver notice that there is send work to do. It is up to
+	 * the driver to generally push the packets out; this just queues the
+	 * work with the driver. There are two variants here. The no_lock
+	 * version requires the s_lock not to be held. The other assumes the
+	 * s_lock is held.
+	 */
+	void (*schedule_send)(struct rvt_qp *qp);
+	void (*schedule_send_no_lock)(struct rvt_qp *qp);
+
+	/* Driver specific work request checking */
+	int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+	/*
+	 * Sometimes rdmavt needs to kick the driver's send progress. That is
+	 * done by this call back.
+	 */
+	void (*do_send)(struct rvt_qp *qp);
+
 	/* Passed to ib core registration. Callback to create sysfs files */
 	int (*port_callback)(struct ib_device *, u8, struct kobject *);
 
@@ -222,22 +243,6 @@ struct rvt_driver_provided {
 	 */
 	void (*notify_qp_reset)(struct rvt_qp *qp);
 
-	/*
-	 * Give the driver a notice that there is send work to do. It is up to
-	 * the driver to generally push the packets out, this just queues the
-	 * work with the driver. There are two variants here. The no_lock
-	 * version requires the s_lock not to be held. The other assumes the
-	 * s_lock is held.
-	 */
-	void (*schedule_send)(struct rvt_qp *qp);
-	void (*schedule_send_no_lock)(struct rvt_qp *qp);
-
-	/*
-	 * Sometimes rdmavt needs to kick the driver's send progress. That is
-	 * done by this call back.
-	 */
-	void (*do_send)(struct rvt_qp *qp);
-
 	/*
 	 * Get a path mtu from the driver based on qp attributes.
 	 */
@@ -324,9 +329,6 @@ struct rvt_driver_provided {
 	void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr,
 			  int attr_mask, struct ib_udata *udata);
 
-	/* Driver specific work request checking */
-	int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe);
-
 	/* Notify driver a mad agent has been created */
 	void (*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx);
 
@@ -355,12 +357,12 @@ struct rvt_dev_info {
 	/* post send table */
 	const struct rvt_operation_params *post_parms;
 
-	struct rvt_mregion __rcu *dma_mr;
-	struct rvt_lkey_table lkey_table;
-
 	/* Driver specific helper functions */
 	struct rvt_driver_provided driver_f;
 
+	struct rvt_mregion __rcu *dma_mr;
+	struct rvt_lkey_table lkey_table;
+
 	/* Internal use */
 	int n_pds_allocated;
 	spinlock_t n_pds_lock; /* Protect pd allocated count */
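
The reorder above groups the four hot-path calldowns (schedule_send, schedule_send_no_lock, check_send_wqe, do_send) at the front of rvt_driver_provided and moves driver_f ahead of dma_mr/lkey_table in rvt_dev_info, so the send path's indirect calls share a cacheline. Drivers wire them up exactly as before; a sketch with placeholder function names:

/* Placeholder names: stand-ins for a real driver's (e.g. hfi1) callbacks. */
void demo_schedule_send(struct rvt_qp *qp);
void demo_schedule_send_no_lock(struct rvt_qp *qp);
int  demo_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
void demo_do_send(struct rvt_qp *qp);

static void demo_register_calldowns(struct rvt_dev_info *rdi)
{
	rdi->driver_f.schedule_send         = demo_schedule_send;
	rdi->driver_f.schedule_send_no_lock = demo_schedule_send_no_lock;
	rdi->driver_f.check_send_wqe        = demo_check_send_wqe;
	rdi->driver_f.do_send               = demo_do_send;
}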

+ 7 - 3
include/rdma/rdmavt_mr.h

@@ -90,11 +90,15 @@ struct rvt_mregion {
 #define RVT_MAX_LKEY_TABLE_BITS 23
 
 struct rvt_lkey_table {
-	spinlock_t lock; /* protect changes in this struct */
-	u32 next;               /* next unused index (speeds search) */
-	u32 gen;                /* generation count */
+	/* read mostly fields */
 	u32 max;                /* size of the table */
+	u32 shift;              /* lkey/rkey shift */
 	struct rvt_mregion __rcu **table;
+	/* writeable fields */
+	/* protect changes in this struct */
+	spinlock_t lock ____cacheline_aligned_in_smp;
+	u32 next;               /* next unused index (speeds search) */
+	u32 gen;                /* generation count */
 };
 
 /*
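
The split keeps the fields the lookup fast path reads (max, shift, table) away from the lock-protected allocator state (lock, next, gen), so lkey lookups do not bounce the allocator's cacheline. A hedged sketch of such a lookup is below; indexing the table by lkey >> shift is an assumption about how the new shift field is used.

/* Sketch under the stated assumption about rkt->shift; not the rdmavt code. */
#include <rdma/rdmavt_mr.h>

static struct rvt_mregion *demo_lkey_lookup(struct rvt_lkey_table *rkt, u32 lkey)
{
	struct rvt_mregion *mr;

	rcu_read_lock();
	mr = rcu_dereference(rkt->table[lkey >> rkt->shift]);
	if (mr && mr->lkey == lkey)
		rvt_get_mr(mr);		/* pin it before leaving the RCU section */
	else
		mr = NULL;
	rcu_read_unlock();
	return mr;
}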

+ 77 - 0
include/rdma/rdmavt_qp.h

@@ -51,6 +51,7 @@
 #include <rdma/rdma_vt.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_cq.h>
 /*
  * Atomic bit definitions for r_aflags.
  */
@@ -484,6 +485,23 @@ static inline void rvt_put_qp(struct rvt_qp *qp)
 		wake_up(&qp->wait);
 }
 
+/**
+ * rvt_put_swqe - drop mr refs held by swqe
+ * @wqe - the send wqe
+ *
+ * This drops any mr references held by the swqe
+ */
+static inline void rvt_put_swqe(struct rvt_swqe *wqe)
+{
+	int i;
+
+	for (i = 0; i < wqe->wr.num_sge; i++) {
+		struct rvt_sge *sge = &wqe->sg_list[i];
+
+		rvt_put_mr(sge->mr);
+	}
+}
+
 /**
  * rvt_qp_wqe_reserve - reserve operation
  * @qp - the rvt qp
@@ -527,6 +545,65 @@ static inline void rvt_qp_wqe_unreserve(
 	}
 }
 
+extern const enum ib_wc_opcode ib_rvt_wc_opcode[];
+
+/**
+ * rvt_qp_swqe_complete() - insert send completion
+ * @qp - the qp
+ * @wqe - the send wqe
+ * @status - completion status
+ *
+ * Insert a send completion into the completion
+ * queue if the qp indicates it should be done.
+ *
+ * See IBTA 10.7.3.1 for info on completion
+ * control.
+ */
+static inline void rvt_qp_swqe_complete(
+	struct rvt_qp *qp,
+	struct rvt_swqe *wqe,
+	enum ib_wc_status status)
+{
+	if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED))
+		return;
+	if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+	     status != IB_WC_SUCCESS) {
+		struct ib_wc wc;
+
+		memset(&wc, 0, sizeof(wc));
+		wc.wr_id = wqe->wr.wr_id;
+		wc.status = status;
+		wc.opcode = ib_rvt_wc_opcode[wqe->wr.opcode];
+		wc.qp = &qp->ibqp;
+		wc.byte_len = wqe->length;
+		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
+			     status != IB_WC_SUCCESS);
+	}
+}
+
+/**
+ * rvt_div_round_up_mtu - round up divide by the mtu
+ * @qp - the qp pair
+ * @len - the length
+ *
+ * Perform a shift based mtu round up divide
+ */
+static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len)
+{
+	return (len + qp->pmtu - 1) >> qp->log_pmtu;
+}
+
+/**
+ * rvt_div_mtu - divide by the mtu
+ * @qp - the qp pair
+ * @len - the length
+ *
+ * Perform a shift based mtu divide
+ */
+static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len)
+{
+	return len >> qp->log_pmtu;
+}
+
 extern const int  ib_rvt_state_ops[];
 
 struct rvt_dev_info;
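
Taken together, the new helpers let a driver's send-completion path be written almost entirely in terms of rdmavt: drop the WQE's MR references, post a CQE only when completion control requires it, and account for packets with the mtu divide helpers. A sketch with placeholder names:

/* Sketch only: the function name and accounting around it are placeholders. */
static void demo_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
			       enum ib_wc_status status)
{
	u32 npkts = rvt_div_round_up_mtu(qp, wqe->length);

	(void)npkts;			/* e.g. feed packet/credit accounting */
	rvt_put_swqe(wqe);		/* release the MRs referenced by the sges */
	rvt_qp_swqe_complete(qp, wqe, status);	/* CQE per IBTA 10.7.3.1 rules */
	if (++qp->s_last >= qp->s_size)
		qp->s_last = 0;
}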

+ 1 - 1
include/uapi/rdma/hfi/hfi1_user.h

@@ -75,7 +75,7 @@
  * may not be implemented; the user code must deal with this if it
  * cares, or it must abort after initialization reports the difference.
  */
-#define HFI1_USER_SWMINOR 2
+#define HFI1_USER_SWMINOR 3
 
 /*
  * We will encode the major/minor inside a single 32bit version number.