@@ -125,6 +125,13 @@ unsigned short piothreshold;
 module_param(piothreshold, ushort, S_IRUGO);
 MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
 
+#define COPY_CACHELESS 1
+#define COPY_ADAPTIVE  2
+static unsigned int sge_copy_mode;
+module_param(sge_copy_mode, uint, S_IRUGO);
+MODULE_PARM_DESC(sge_copy_mode,
+		 "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
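+/*
+ * Illustrative usage (editor's note, not part of the patch): e.g.
+ * "modprobe hfi1 sge_copy_mode=2" would select the adaptive mode at
+ * load time; the default of 0 keeps plain memcpy.
+ */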
+
 static void verbs_sdma_complete(
 	struct sdma_txreq *cookie,
 	int status);
@@ -137,6 +144,159 @@ static int pio_wait(struct rvt_qp *qp,
 /* Length of buffer to create verbs txreq cache name */
 #define TXREQ_NAME_LEN 24
 
+static uint wss_threshold;
+module_param(wss_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
+static uint wss_clean_period = 256;
+module_param(wss_clean_period, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
+
+/* memory working set size */
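+/*
+ * One bit per page of an LLC-sized window: entries[] is an array of
+ * num_entries longs, total_count approximates the number of bits
+ * currently set, and clean_counter/clean_entry drive the periodic
+ * clearing of one entry per clean period.
+ */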
+struct hfi1_wss {
+	unsigned long *entries;
+	atomic_t total_count;
+	atomic_t clean_counter;
+	atomic_t clean_entry;
+
+	int threshold;
+	int num_entries;
+	long pages_mask;
+};
+
+static struct hfi1_wss wss;
+
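+/*
+ * Set up the working-set tracking table; hfi1_wss_exit() below is the
+ * matching teardown (both are presumably invoked from the driver's
+ * init/exit paths elsewhere in this patch).
+ */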
+int hfi1_wss_init(void)
+{
+	long llc_size;
+	long llc_bits;
+	long table_size;
+	long table_bits;
+
+	/* check for a valid percent range - default to 80 if none or invalid */
+	if (wss_threshold < 1 || wss_threshold > 100)
+		wss_threshold = 80;
+	/* reject a wildly large period */
+	if (wss_clean_period > 1000000)
+		wss_clean_period = 256;
+	/* reject a zero period */
+	if (wss_clean_period == 0)
+		wss_clean_period = 1;
+
+	/*
+	 * Calculate the table size - the next power of 2 larger than the
+	 * LLC size.  LLC size is in KiB.
+	 */
+	llc_size = wss_llc_size() * 1024;
+	table_size = roundup_pow_of_two(llc_size);
+
+	/* one bit per page in rounded up table */
+	llc_bits = llc_size / PAGE_SIZE;
+	table_bits = table_size / PAGE_SIZE;
+	wss.pages_mask = table_bits - 1;
+	wss.num_entries = table_bits / BITS_PER_LONG;
+
+	wss.threshold = (llc_bits * wss_threshold) / 100;
+	if (wss.threshold == 0)
+		wss.threshold = 1;
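+
+	/*
+	 * Worked example (hypothetical numbers): a 32 MiB LLC with 4 KiB
+	 * pages and 64-bit longs gives llc_bits = table_bits = 8192,
+	 * pages_mask = 8191, num_entries = 128, and, at the default 80%
+	 * threshold, wss.threshold = 6553 pages.
+	 */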
+
+	atomic_set(&wss.clean_counter, wss_clean_period);
+
+	wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
+			      GFP_KERNEL);
+	if (!wss.entries) {
+		hfi1_wss_exit();
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void hfi1_wss_exit(void)
+{
+	/* coded to handle partially initialized and repeat callers */
+	kfree(wss.entries);
+	wss.entries = NULL;
+}
+
+/*
+ * Advance the clean counter.  When the clean period has expired,
+ * clean an entry.
+ *
+ * This is implemented in atomics to avoid locking.  Because multiple
+ * variables are involved, it can be racy which can lead to slightly
+ * inaccurate information.  Since this is only a heuristic, this is
+ * OK.  Any inaccuracies will clean themselves out as the counter
+ * advances.  That said, it is unlikely the entry clean operation will
+ * race - the next possible racer will not start until the next clean
+ * period.
+ *
+ * The clean counter is implemented as a decrement to zero.  When zero
+ * is reached an entry is cleaned.
+ */
+static void wss_advance_clean_counter(void)
+{
+	int entry;
+	int weight;
+	unsigned long bits;
+
+	/* become the cleaner if we decrement the counter to zero */
+	if (atomic_dec_and_test(&wss.clean_counter)) {
+		/*
+		 * Set, not add, the clean period.  This avoids an issue
+		 * where the counter could decrement below the clean period.
+		 * Doing a set can result in lost decrements, slowing the
+		 * clean advance.  Since this is a heuristic, this possible
+		 * slowdown is OK.
+		 *
+		 * An alternative is to loop, advancing the counter by a
+		 * clean period until the result is > 0.  However, this could
+		 * lead to several threads keeping another in the clean loop.
+		 * This could be mitigated by limiting the number of times
+		 * we stay in the loop.
+		 */
+		atomic_set(&wss.clean_counter, wss_clean_period);
+
+		/*
+		 * Uniquely grab the entry to clean and move to next.
+		 * The current entry is always the lower bits of
+		 * wss.clean_entry.  The table size, wss.num_entries,
+		 * is always a power-of-2.
+		 */
+		entry = (atomic_inc_return(&wss.clean_entry) - 1)
+			& (wss.num_entries - 1);
+
+		/* clear the entry and count the bits */
+		bits = xchg(&wss.entries[entry], 0);
+		weight = hweight64((u64)bits);
+		/* only adjust the contended total count if needed */
+		if (weight)
+			atomic_sub(weight, &wss.total_count);
+	}
+}
+
+/*
+ * Insert the given address into the working set array.
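+ *
+ * Illustrative mapping (hypothetical numbers; assumes 64-bit longs and
+ * 4 KiB pages): with pages_mask = 8191, an address whose masked page
+ * index is 837 sets bit 837 % 64 = 5 in entries[837 / 64] = entries[13].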
+ */
+static void wss_insert(void *address)
+{
+	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
+	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
+	u32 nr = page & (BITS_PER_LONG - 1);
+
+	if (!test_and_set_bit(nr, &wss.entries[entry]))
+		atomic_inc(&wss.total_count);
+
+	wss_advance_clean_counter();
+}
+
+/*
+ * Is the working set larger than the threshold?
+ */
+static inline int wss_exceeds_threshold(void)
+{
+	return atomic_read(&wss.total_count) >= wss.threshold;
+}
+
 /*
  * Translate ib_wr_opcode into ib_wc_opcode.
  */
@@ -258,7 +418,26 @@ void hfi1_copy_sge(
 	struct rvt_sge *sge = &ss->sge;
 	int in_last = 0;
 	int i;
+	int cacheless_copy = 0;
 
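+	/*
+	 * Decide whether this copy should bypass the CPU cache: in
+	 * cacheless mode any copy of at least a page qualifies; in
+	 * adaptive mode the destination pages are first recorded in the
+	 * WSS bitmap and the cacheless path is taken only once the
+	 * tracked working set exceeds the LLC-based threshold.
+	 */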
+	if (sge_copy_mode == COPY_CACHELESS) {
+		cacheless_copy = length >= PAGE_SIZE;
+	} else if (sge_copy_mode == COPY_ADAPTIVE) {
+		if (length >= PAGE_SIZE) {
+			/*
+			 * NOTE: this *assumes*:
+			 * o The first vaddr is the dest.
+			 * o If multiple pages, then vaddr is sequential.
+			 */
+			wss_insert(sge->vaddr);
+			if (length >= (2 * PAGE_SIZE))
+				wss_insert(sge->vaddr + PAGE_SIZE);
+
+			cacheless_copy = wss_exceeds_threshold();
+		} else {
+			wss_advance_clean_counter();
+		}
+	}
 	if (copy_last) {
 		if (length > 8) {
 			length -= 8;
@@ -277,10 +456,12 @@ again:
 		if (len > sge->sge_length)
 			len = sge->sge_length;
 		WARN_ON_ONCE(len == 0);
-		if (in_last) {
-			/* enforce byte transer ordering */
+		if (unlikely(in_last)) {
+			/* enforce byte transfer ordering */
 			for (i = 0; i < len; i++)
 				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
+		} else if (cacheless_copy) {
+			cacheless_memcpy(sge->vaddr, data, len);
 		} else {
 			memcpy(sge->vaddr, data, len);
 		}