@@ -97,8 +97,7 @@ struct tid_pageset {
 static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
			    struct rb_root *);
-static u32 find_phys_blocks(struct page **, unsigned,
-			    struct tid_pageset *) __maybe_unused;
+static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
 static int set_rcvarray_entry(struct file *, unsigned long, u32,
			      struct tid_group *, struct page **, unsigned);
 static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long,
@@ -119,7 +118,7 @@ static inline void mmu_notifier_range_start(struct mmu_notifier *,
				     unsigned long, unsigned long);
 static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
			    struct tid_pageset *, unsigned, u16, struct page **,
-			    u32 *, unsigned *, unsigned *) __maybe_unused;
+			    u32 *, unsigned *, unsigned *);
 static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
 static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *);
 
@@ -339,9 +338,265 @@ static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
	writeq(0, dd->rcvarray_wc + (index * 8));
 }
 
+/*
+ * RcvArray entry allocation for Expected Receives is done by the
+ * following algorithm:
+ *
+ * The context keeps 3 lists of groups of RcvArray entries:
+ *   1. List of empty groups - tid_group_list
+ *      This list is created during user context creation and
+ *      contains elements which describe sets (of 8) of empty
+ *      RcvArray entries.
+ *   2. List of partially used groups - tid_used_list
+ *      This list contains sets of RcvArray entries which are
+ *      not completely used up. Another mapping request could
+ *      use some or all of the remaining entries.
+ *   3. List of full groups - tid_full_list
+ *      This is the list where sets that are completely used
+ *      up go.
+ *
+ * An attempt to optimize the usage of RcvArray entries is
+ * made by finding all sets of physically contiguous pages in a
+ * user's buffer.
+ * These physically contiguous sets are further split into
+ * sizes supported by the receive engine of the HFI. The
+ * resulting sets of pages are stored in struct tid_pageset,
+ * which describes the sets as:
+ *    * .count - number of pages in this set
+ *    * .idx - starting index into struct page ** array
+ *             of this set
+ *
+ * From this point on, the algorithm deals with the page sets
+ * described above. The number of pagesets is divided by the
+ * RcvArray group size to produce the number of full groups
+ * needed.
+ *
+ * Groups from the 3 lists are manipulated using the following
+ * rules:
+ *   1. For each set of 8 pagesets, a complete group from
+ *      tid_group_list is taken, programmed, and moved to
+ *      the tid_full_list list.
+ *   2. For all remaining pagesets:
+ *      2.1 If the tid_used_list is empty and the tid_group_list
+ *          is empty, stop processing pagesets and return only
+ *          what has been programmed up to this point.
+ *      2.2 If the tid_used_list is empty and the tid_group_list
+ *          is not empty, move a group from tid_group_list to
+ *          tid_used_list.
+ *      2.3 For each group in tid_used_list, program as much as
+ *          can fit into the group. If the group becomes fully
+ *          used, move it to tid_full_list.
+ */
 int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 {
-	return -EINVAL;
+	int ret = 0, need_group = 0, pinned;
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
+		tididx = 0, mapped, mapped_pages = 0;
+	unsigned long vaddr = tinfo->vaddr;
+	struct page **pages = NULL;
+	u32 *tidlist = NULL;
+	struct tid_pageset *pagesets = NULL;
+
+	/* Get the number of pages the user buffer spans */
+	npages = num_user_pages(vaddr, tinfo->length);
+	if (!npages)
+		return -EINVAL;
+
+	if (npages > uctxt->expected_count) {
+		dd_dev_err(dd, "Expected buffer too big\n");
+		return -EINVAL;
+	}
+
+	/* Verify that access is OK for the user buffer */
+	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+		       npages * PAGE_SIZE)) {
+		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+			   (void *)vaddr, npages);
+		return -EFAULT;
+	}
+
+	pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
+			   GFP_KERNEL);
+	if (!pagesets)
+		return -ENOMEM;
+
+	/* Allocate the array of struct page pointers needed for pinning */
+	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	/*
+	 * Pin all the pages of the user buffer. If we can't pin all the
+	 * pages, accept the amount pinned so far and program only that.
+	 * User space knows how to deal with partially programmed buffers.
+	 */
+	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
+	if (pinned <= 0) {
+		ret = pinned;
+		goto bail;
+	}
+
+	/* Find sets of physically contiguous pages */
+	npagesets = find_phys_blocks(pages, pinned, pagesets);
+
+	/*
+	 * We don't need to access this under a lock since tid_used is per
+	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
+	 * and hfi1_user_exp_rcv_setup() at the same time.
+	 */
+	spin_lock(&fd->tid_lock);
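+	/* Cap the request at the TID entries this context still has available */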
+	if (fd->tid_used + npagesets > fd->tid_limit)
+		pageset_count = fd->tid_limit - fd->tid_used;
+	else
+		pageset_count = npagesets;
+	spin_unlock(&fd->tid_lock);
+
+	if (!pageset_count)
+		goto bail;
+
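+	/* Number of complete RcvArray groups the pagesets can fill */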
+	ngroups = pageset_count / dd->rcv_entries.group_size;
+	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
+	if (!tidlist) {
+		ret = -ENOMEM;
+		goto nomem;
+	}
+
+	tididx = 0;
+
+	/*
+	 * From this point on, we are going to be using shared (between master
+	 * and subcontexts) context resources. We need to take the lock.
+	 */
+	mutex_lock(&uctxt->exp_lock);
+	/*
+	 * The first step is to program the RcvArray entries which are complete
+	 * groups.
+	 */
+	while (ngroups && uctxt->tid_group_list.count) {
+		struct tid_group *grp =
+			tid_group_pop(&uctxt->tid_group_list);
+
+		ret = program_rcvarray(fp, vaddr, grp, pagesets,
+				       pageidx, dd->rcv_entries.group_size,
+				       pages, tidlist, &tididx, &mapped);
+		/*
+		 * If there was a failure to program the RcvArray
+		 * entries for the entire group, reset the grp fields
+		 * and add the grp back to the free group list.
+		 */
+		if (ret <= 0) {
+			tid_group_add_tail(grp, &uctxt->tid_group_list);
+			hfi1_cdbg(TID,
+				  "Failed to program RcvArray group %d", ret);
+			goto unlock;
+		}
+
+		tid_group_add_tail(grp, &uctxt->tid_full_list);
+		ngroups--;
+		pageidx += ret;
+		mapped_pages += mapped;
+	}
+
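+	/* Program any remaining pagesets into partially used groups */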
+	while (pageidx < pageset_count) {
+		struct tid_group *grp, *ptr;
+		/*
+		 * If we don't have any partially used tid groups, check
+		 * if we have empty groups. If so, take one from there and
+		 * put in the partially used list.
+		 */
+		if (!uctxt->tid_used_list.count || need_group) {
+			if (!uctxt->tid_group_list.count)
+				goto unlock;
+
+			grp = tid_group_pop(&uctxt->tid_group_list);
+			tid_group_add_tail(grp, &uctxt->tid_used_list);
+			need_group = 0;
+		}
+		/*
+		 * There is an optimization opportunity here - instead of
+		 * fitting as many page sets as we can, check for a group
+		 * later on in the list that could fit all of them.
+		 */
+		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
+					 list) {
+			unsigned use = min_t(unsigned, pageset_count - pageidx,
+					     grp->size - grp->used);
+
+			ret = program_rcvarray(fp, vaddr, grp, pagesets,
+					       pageidx, use, pages, tidlist,
+					       &tididx, &mapped);
+			if (ret < 0) {
+				hfi1_cdbg(TID,
+					  "Failed to program RcvArray entries %d",
+					  ret);
+				ret = -EFAULT;
+				goto unlock;
+			} else if (ret > 0) {
+				if (grp->used == grp->size)
+					tid_group_move(grp,
+						       &uctxt->tid_used_list,
+						       &uctxt->tid_full_list);
+				pageidx += ret;
+				mapped_pages += mapped;
+				need_group = 0;
+				/* Check if we are done so we break out early */
+				if (pageidx >= pageset_count)
+					break;
+			} else if (WARN_ON(ret == 0)) {
+				/*
+				 * If ret is 0, we did not program any entries
+				 * into this group, which can only happen if
+				 * we've screwed up the accounting somewhere.
+				 * Warn and try to continue.
+				 */
+				need_group = 1;
+			}
+		}
+	}
+unlock:
+	mutex_unlock(&uctxt->exp_lock);
+nomem:
+	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
+		  mapped_pages, ret);
+	if (tididx) {
+		spin_lock(&fd->tid_lock);
+		fd->tid_used += tididx;
+		spin_unlock(&fd->tid_lock);
+		tinfo->tidcnt = tididx;
+		tinfo->length = mapped_pages * PAGE_SIZE;
+
+		if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+				 tidlist, sizeof(tidlist[0]) * tididx)) {
+			/*
+			 * On failure to copy to the user level, we need to undo
+			 * everything done so far so we don't leak resources.
+			 */
+			tinfo->tidlist = (unsigned long)&tidlist;
+			hfi1_user_exp_rcv_clear(fp, tinfo);
+			tinfo->tidlist = 0;
+			ret = -EFAULT;
+			goto bail;
+		}
+	}
+
+	/*
+	 * If not everything was mapped (due to insufficient RcvArray entries,
+	 * for example), unpin all unmapped pages so we can pin them next time.
+	 */
+	if (mapped_pages != pinned)
+		hfi1_release_user_pages(&pages[mapped_pages],
+					pinned - mapped_pages,
+					false);
+bail:
+	kfree(pagesets);
+	kfree(pages);
+	kfree(tidlist);
+	return ret > 0 ? 0 : ret;
 }
 
 int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)