@@ -129,8 +129,6 @@ unsigned short piothreshold = 256;
 module_param(piothreshold, ushort, S_IRUGO);
 MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
 
-#define COPY_CACHELESS 1
-#define COPY_ADAPTIVE 2
 static unsigned int sge_copy_mode;
 module_param(sge_copy_mode, uint, S_IRUGO);
 MODULE_PARM_DESC(sge_copy_mode,
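Note on the two dropped #defines: the copy-mode constants leave hfi1 because the copy-strategy decision itself moves into rdmavt; the sge_copy_mode parameter keeps its meaning. A minimal sketch of what the three values select, distilled from the hfi1_copy_sge() logic removed further down (the enum name and the user-space framing are illustrative, not part of this patch):

enum copy_mode { COPY_MEMCPY = 0, COPY_CACHELESS = 1, COPY_ADAPTIVE = 2 };

/*
 * Pick a copy strategy for one transfer of 'len' bytes.
 * 'wss_hot' is the adaptive heuristic's verdict: the recent
 * working set already exceeds its share of the last-level cache.
 */
static int use_cacheless_copy(enum copy_mode mode, unsigned long len,
			      unsigned long page_size, int wss_hot)
{
	if (mode == COPY_CACHELESS)
		return len >= page_size;            /* big copies bypass cache */
	if (mode == COPY_ADAPTIVE)
		return len >= page_size && wss_hot; /* only when cache-unfriendly */
	return 0;                                   /* mode 0: plain memcpy() */
}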
@@ -151,159 +149,13 @@ static int pio_wait(struct rvt_qp *qp,
 /* 16B trailing buffer */
 static const u8 trail_buf[MAX_16B_PADDING];
 
-static uint wss_threshold;
+static uint wss_threshold = 80;
 module_param(wss_threshold, uint, S_IRUGO);
 MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
 static uint wss_clean_period = 256;
 module_param(wss_clean_period, uint, S_IRUGO);
 MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
 
-/* memory working set size */
-struct hfi1_wss {
-	unsigned long *entries;
-	atomic_t total_count;
-	atomic_t clean_counter;
-	atomic_t clean_entry;
-
-	int threshold;
-	int num_entries;
-	long pages_mask;
-};
-
-static struct hfi1_wss wss;
-
-int hfi1_wss_init(void)
-{
-	long llc_size;
-	long llc_bits;
-	long table_size;
-	long table_bits;
-
-	/* check for a valid percent range - default to 80 if none or invalid */
-	if (wss_threshold < 1 || wss_threshold > 100)
-		wss_threshold = 80;
-	/* reject a wildly large period */
-	if (wss_clean_period > 1000000)
-		wss_clean_period = 256;
-	/* reject a zero period */
-	if (wss_clean_period == 0)
-		wss_clean_period = 1;
-
-	/*
-	 * Calculate the table size - the next power of 2 larger than the
-	 * LLC size.  LLC size is in KiB.
-	 */
-	llc_size = wss_llc_size() * 1024;
-	table_size = roundup_pow_of_two(llc_size);
-
-	/* one bit per page in rounded up table */
-	llc_bits = llc_size / PAGE_SIZE;
-	table_bits = table_size / PAGE_SIZE;
-	wss.pages_mask = table_bits - 1;
-	wss.num_entries = table_bits / BITS_PER_LONG;
-
-	wss.threshold = (llc_bits * wss_threshold) / 100;
-	if (wss.threshold == 0)
-		wss.threshold = 1;
-
-	atomic_set(&wss.clean_counter, wss_clean_period);
-
-	wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
-			      GFP_KERNEL);
-	if (!wss.entries) {
-		hfi1_wss_exit();
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-void hfi1_wss_exit(void)
-{
-	/* coded to handle partially initialized and repeat callers */
-	kfree(wss.entries);
-	wss.entries = NULL;
-}
-
-/*
- * Advance the clean counter.  When the clean period has expired,
- * clean an entry.
- *
- * This is implemented in atomics to avoid locking.  Because multiple
- * variables are involved, it can be racy which can lead to slightly
- * inaccurate information.  Since this is only a heuristic, this is
- * OK.  Any innaccuracies will clean themselves out as the counter
- * advances.  That said, it is unlikely the entry clean operation will
- * race - the next possible racer will not start until the next clean
- * period.
- *
- * The clean counter is implemented as a decrement to zero.  When zero
- * is reached an entry is cleaned.
- */
-static void wss_advance_clean_counter(void)
-{
-	int entry;
-	int weight;
-	unsigned long bits;
-
-	/* become the cleaner if we decrement the counter to zero */
-	if (atomic_dec_and_test(&wss.clean_counter)) {
-		/*
-		 * Set, not add, the clean period.  This avoids an issue
-		 * where the counter could decrement below the clean period.
-		 * Doing a set can result in lost decrements, slowing the
-		 * clean advance.  Since this a heuristic, this possible
-		 * slowdown is OK.
-		 *
-		 * An alternative is to loop, advancing the counter by a
-		 * clean period until the result is > 0. However, this could
-		 * lead to several threads keeping another in the clean loop.
-		 * This could be mitigated by limiting the number of times
-		 * we stay in the loop.
-		 */
-		atomic_set(&wss.clean_counter, wss_clean_period);
-
-		/*
-		 * Uniquely grab the entry to clean and move to next.
-		 * The current entry is always the lower bits of
-		 * wss.clean_entry.  The table size, wss.num_entries,
-		 * is always a power-of-2.
-		 */
-		entry = (atomic_inc_return(&wss.clean_entry) - 1)
-			& (wss.num_entries - 1);
-
-		/* clear the entry and count the bits */
-		bits = xchg(&wss.entries[entry], 0);
-		weight = hweight64((u64)bits);
-		/* only adjust the contended total count if needed */
-		if (weight)
-			atomic_sub(weight, &wss.total_count);
-	}
-}
-
-/*
- * Insert the given address into the working set array.
- */
-static void wss_insert(void *address)
-{
-	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
-	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
-	u32 nr = page & (BITS_PER_LONG - 1);
-
-	if (!test_and_set_bit(nr, &wss.entries[entry]))
-		atomic_inc(&wss.total_count);
-
-	wss_advance_clean_counter();
-}
-
-/*
- * Is the working set larger than the threshold?
- */
-static inline bool wss_exceeds_threshold(void)
-{
-	return atomic_read(&wss.total_count) >= wss.threshold;
-}
-
 /*
  * Translate ib_wr_opcode into ib_wc_opcode.
  */
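The sizing arithmetic being deleted from hfi1_wss_init() is worth a worked example: one bit per page, with the table rounded up to a power of two so indexing can mask instead of divide. A stand-alone sketch under assumed values (30 MiB LLC, 4 KiB pages; wss_llc_size() is stubbed, and the kernel helpers are replaced by their plain-C equivalents):

#include <stdio.h>

static long wss_llc_size(void) { return 30720; }  /* stub: LLC size in KiB */

int main(void)
{
	const long page_size = 4096, bits_per_long = 64;
	long llc_size = wss_llc_size() * 1024;   /* 31457280 bytes */
	long table_size = 1;

	while (table_size < llc_size)            /* roundup_pow_of_two() */
		table_size <<= 1;                /* -> 33554432 (32 MiB) */

	{
		long llc_bits = llc_size / page_size;     /* 7680 pages in the LLC */
		long table_bits = table_size / page_size; /* 8192 bits in the table */
		long pages_mask = table_bits - 1;         /* 0x1fff; valid: power of 2 */
		long num_entries = table_bits / bits_per_long; /* 128 longs */
		long threshold = llc_bits * 80 / 100;     /* 6144 pages at default 80% */

		printf("mask=%#lx entries=%ld threshold=%ld\n",
		       pages_mask, num_entries, threshold);
	}
	return 0;
}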
@@ -438,79 +290,6 @@ static const u32 pio_opmask[BIT(3)] = {
  */
 __be64 ib_hfi1_sys_image_guid;
 
-/**
- * hfi1_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- * @release: boolean to release MR
- * @copy_last: do a separate copy of the last 8 bytes
- */
-void hfi1_copy_sge(
-	struct rvt_sge_state *ss,
-	void *data, u32 length,
-	bool release,
-	bool copy_last)
-{
-	struct rvt_sge *sge = &ss->sge;
-	int i;
-	bool in_last = false;
-	bool cacheless_copy = false;
-
-	if (sge_copy_mode == COPY_CACHELESS) {
-		cacheless_copy = length >= PAGE_SIZE;
-	} else if (sge_copy_mode == COPY_ADAPTIVE) {
-		if (length >= PAGE_SIZE) {
-			/*
-			 * NOTE: this *assumes*:
-			 * o The first vaddr is the dest.
-			 * o If multiple pages, then vaddr is sequential.
-			 */
-			wss_insert(sge->vaddr);
-			if (length >= (2 * PAGE_SIZE))
-				wss_insert(sge->vaddr + PAGE_SIZE);
-
-			cacheless_copy = wss_exceeds_threshold();
-		} else {
-			wss_advance_clean_counter();
-		}
-	}
-	if (copy_last) {
-		if (length > 8) {
-			length -= 8;
-		} else {
-			copy_last = false;
-			in_last = true;
-		}
-	}
-
-again:
-	while (length) {
-		u32 len = rvt_get_sge_length(sge, length);
-
-		WARN_ON_ONCE(len == 0);
-		if (unlikely(in_last)) {
-			/* enforce byte transfer ordering */
-			for (i = 0; i < len; i++)
-				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
-		} else if (cacheless_copy) {
-			cacheless_memcpy(sge->vaddr, data, len);
-		} else {
-			memcpy(sge->vaddr, data, len);
-		}
-		rvt_update_sge(ss, len, release);
-		data += len;
-		length -= len;
-	}
-
-	if (copy_last) {
-		copy_last = false;
-		in_last = true;
-		length = 8;
-		goto again;
-	}
-}
-
 /*
  * Make sure the QP is ready and able to accept the given opcode.
  */
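The heart of the removed adaptive path is wss_insert(): hash the destination address down to one bit per page and count newly set bits toward the working-set total. A user-space rendering of the same indexing (the clean counter and atomics are omitted; the kernel uses test_and_set_bit() where this sketch does a plain read-modify-write):

#include <stdint.h>

#define PAGE_SHIFT    12
#define BITS_PER_LONG 64

static unsigned long entries[128];                /* 8192 bits, one per page */
static const unsigned long pages_mask = 8192 - 1; /* table is a power of two */

/* Returns 1 if the page was newly marked; caller bumps the total count. */
static int wss_mark_page(const void *address)
{
	uint32_t page  = ((uintptr_t)address >> PAGE_SHIFT) & pages_mask;
	uint32_t entry = page / BITS_PER_LONG;    /* which long in the table */
	uint32_t nr    = page % BITS_PER_LONG;    /* which bit in that long */
	unsigned long bit = 1UL << nr;

	if (entries[entry] & bit)
		return 0;                         /* page already in working set */
	entries[entry] |= bit;
	return 1;
}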
@@ -1949,6 +1728,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
 	dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
 	dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
 	dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+	dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
+	dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
+	dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
 
 	/* post send table */
 	dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
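With the three knobs now handed to rdmavt through dparms, the range checking that hfi1_wss_init() used to do must live on the rdmavt side of the move. A sketch of that validation, mirroring the removed checks one for one (the function name and the choice to clamp rather than reject are assumptions about the rdmavt counterpart, which is not part of this hunk):

/* Hypothetical rdmavt-side clamping, mirroring the removed hfi1 checks. */
static void rvt_wss_clamp_params(unsigned int *threshold,
				 unsigned int *clean_period)
{
	/* check for a valid percent range - default to 80 if none or invalid */
	if (*threshold < 1 || *threshold > 100)
		*threshold = 80;
	/* reject a wildly large period */
	if (*clean_period > 1000000)
		*clean_period = 256;
	/* reject a zero period */
	if (*clean_period == 0)
		*clean_period = 1;
}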