@@ -37,8 +37,78 @@
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
+/* Timeout in ms to wait for an active mmu notifier to complete when handling
+ * a pagefault. */
+#define MMU_NOTIFIER_TIMEOUT 1000
+
 struct workqueue_struct *mlx5_ib_page_fault_wq;
 
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+			      unsigned long end)
+{
+	struct mlx5_ib_mr *mr;
+	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+	u64 idx = 0, blk_start_idx = 0;
+	int in_block = 0;
+	u64 addr;
+
+	if (!umem || !umem->odp_data) {
+		pr_err("invalidation called on NULL umem or non-ODP umem\n");
+		return;
+	}
+
+	mr = umem->odp_data->private;
+
+	if (!mr || !mr->ibmr.pd)
+		return;
+
+	start = max_t(u64, ib_umem_start(umem), start);
+	end = min_t(u64, ib_umem_end(umem), end);
+
+	/*
+	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
+	 * while we are doing the invalidation, no page fault will attempt to
+	 * overwrite the same MTTs. Concurrent invalidations might race us,
+	 * but they will write 0s as well, so no difference in the end result.
+	 */
+
+	for (addr = start; addr < end; addr += (u64)umem->page_size) {
+		idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+		/*
+		 * Strive to write the MTTs in chunks, but avoid overwriting
+		 * non-existing MTTs. The heuristic here can be improved to
+		 * estimate the cost of another UMR vs. the cost of a bigger
+		 * UMR.
+		 */
+		if (umem->odp_data->dma_list[idx] &
+		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+			if (!in_block) {
+				blk_start_idx = idx;
+				in_block = 1;
+			}
+		} else {
+			u64 umr_offset = idx & umr_block_mask;
+
+			if (in_block && umr_offset == 0) {
+				mlx5_ib_update_mtt(mr, blk_start_idx,
+						   idx - blk_start_idx, 1);
+				in_block = 0;
+			}
+		}
+	}
+	if (in_block)
+		mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
+				   1);
+
+	/*
+	 * We are now sure that the device will not access the
+	 * memory. We can safely unmap it, and mark it as dirty if
+	 * needed.
+	 */
+
+	ib_umem_odp_unmap_dma_pages(umem, start, end);
+}
+
 #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name)	do {	\
 	if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)	\
 		ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;	\
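
The invalidation loop above batches MTT zapping: contiguous present entries are collected into one UMR, and a batch is only closed at a block-aligned hole, since a UMR covers whole aligned blocks and zeroing an entry that was never mapped is harmless. Below is a minimal standalone userspace model of that batching, not part of the patch; zap_range, update_window, BLOCK_ENTRIES and PRESENT are invented stand-ins for the driver's helpers and permission bits.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_ENTRIES	8	/* stand-in for MLX5_UMR_MTT_ALIGNMENT / sizeof(u64) */
#define PRESENT		0x1	/* stand-in for the ODP_*_ALLOWED bits in dma_list[] */

/* Stand-in for one batched update ("UMR"): zero nentries entries at start. */
static void update_window(uint64_t start, uint64_t nentries)
{
	printf("zap entries [%llu, %llu)\n",
	       (unsigned long long)start,
	       (unsigned long long)(start + nentries));
}

/*
 * Batch contiguous present entries; a batch is closed only when a hole falls
 * on a block boundary. Holes inside a block are zapped together with the
 * block, mirroring the structure of the loop in the hunk above.
 */
static void zap_range(const uint8_t *entries, uint64_t nentries)
{
	uint64_t idx, blk_start = 0;
	int in_block = 0;

	for (idx = 0; idx < nentries; idx++) {
		if (entries[idx] & PRESENT) {
			if (!in_block) {
				blk_start = idx;
				in_block = 1;
			}
		} else if (in_block && (idx % BLOCK_ENTRIES) == 0) {
			update_window(blk_start, idx - blk_start);
			in_block = 0;
		}
	}
	if (in_block)
		update_window(blk_start, idx - blk_start);
}

int main(void)
{
	uint8_t entries[24] = {
		1, 1, 1, 0, 1, 1, 1, 1,	/* hole at 3 is absorbed into the batch */
		0, 0, 0, 0, 0, 0, 0, 0,	/* aligned hole at 8 closes the batch   */
		1, 1, 0, 0, 0, 0, 0, 0,	/* tail batch is flushed after the loop */
	};

	zap_range(entries, sizeof(entries));
	return 0;
}

The model prints two windows, [0, 8) and [16, 24): fewer, larger updates cost fewer UMR commands, and over-zeroing inside a block never grants access, so the heuristic only trades bandwidth, not correctness.
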
@@ -59,9 +129,18 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
 	if (err)
 		goto out;
 
-	/* At this point we would copy the capability bits that the driver
-	 * supports from the hw_caps struct to the caps struct. However, no
-	 * such capabilities are supported so far. */
+	caps->general_caps = IB_ODP_SUPPORT;
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
+			       SEND);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       SEND);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       RECV);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       WRITE);
+	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+			       READ);
+
 out:
 	return err;
 }
@@ -71,8 +150,9 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 {
 	u32 base_key = mlx5_base_mkey(key);
 	struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+	struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
 
-	if (!mmr || mmr->key != key)
+	if (!mmr || mmr->key != key || !mr->live)
 		return NULL;
 
 	return container_of(mmr, struct mlx5_ib_mr, mmr);
@@ -143,6 +223,11 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 	}
 
 	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
+	/*
+	 * Ensure the sequence number is valid for some time before we call
+	 * gup.
+	 */
+	smp_rmb();
 
 	/*
 	 * Avoid branches - this code will perform correctly
@@ -165,15 +250,20 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 
 	if (npages > 0) {
 		mutex_lock(&mr->umem->odp_data->umem_mutex);
-		/*
-		 * No need to check whether the MTTs really belong to
-		 * this MR, since ib_umem_odp_map_dma_pages already
-		 * checks this.
-		 */
-		ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+			/*
+			 * No need to check whether the MTTs really belong to
+			 * this MR, since ib_umem_odp_map_dma_pages already
+			 * checks this.
+			 */
+			ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+		} else {
+			ret = -EAGAIN;
+		}
 		mutex_unlock(&mr->umem->odp_data->umem_mutex);
 		if (ret < 0) {
-			pr_err("Failed to update mkey page tables\n");
+			if (ret != -EAGAIN)
+				pr_err("Failed to update mkey page tables\n");
 			goto srcu_unlock;
 		}
 
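
The two hunks above implement the fault side of the notifier handshake: the handler samples notifiers_seq before pinning pages (with smp_rmb() ordering the read), and only commits the MTTs if no invalidation ran, or is still running, in between; otherwise it returns -EAGAIN and the fault is retried. The following is a simplified, single-threaded model of that check, not part of the patch; odp_state, fault_path, invalidate_start/end and retry_needed are invented stand-ins, and the real check is ib_umem_mmu_notifier_retry() called under the umem mutex.

#include <stdio.h>

struct odp_state {
	unsigned long notifiers_seq;	/* bumped when an invalidation finishes */
	int notifiers_count;		/* nonzero while an invalidation is running */
};

/* What the mmu notifier start/end callbacks would do on the invalidation side. */
static void invalidate_start(struct odp_state *s) { s->notifiers_count++; }
static void invalidate_end(struct odp_state *s)
{
	s->notifiers_seq++;
	s->notifiers_count--;
}

/* Model of the retry test: the fault must be redone if an invalidation is in
 * flight, or if one completed since the sequence number was sampled. */
static int retry_needed(const struct odp_state *s, unsigned long sampled_seq)
{
	return s->notifiers_count || s->notifiers_seq != sampled_seq;
}

/* Fault path: sample the seq, "map" the pages, then decide whether the
 * mapping may be committed. The real code additionally needs the read
 * barrier after the sample and holds the umem mutex around the final
 * check and the MTT update. */
static void fault_path(struct odp_state *s, int race)
{
	unsigned long seq = s->notifiers_seq;

	/* ... get_user_pages() and DMA mapping would happen here ... */
	if (race) {			/* an invalidation slips in meanwhile */
		invalidate_start(s);
		invalidate_end(s);
	}

	if (retry_needed(s, seq))
		printf("stale mapping, return -EAGAIN and refault\n");
	else
		printf("mapping still valid, write the MTTs\n");
}

int main(void)
{
	struct odp_state s = { 0, 0 };

	fault_path(&s, 0);	/* no concurrent invalidation: commit */
	fault_path(&s, 1);	/* racing invalidation: retry */
	return 0;
}

The design mirrors KVM's mmu_notifier_retry pattern: the fault never blocks the invalidation, it simply throws away work that may have been invalidated underneath it.
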
@@ -185,6 +275,22 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 	}
 
 srcu_unlock:
+	if (ret == -EAGAIN) {
+		if (!mr->umem->odp_data->dying) {
+			struct ib_umem_odp *odp_data = mr->umem->odp_data;
+			unsigned long timeout =
+				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+			if (!wait_for_completion_timeout(
+					&odp_data->notifier_completion,
+					timeout)) {
+				pr_warn("timeout waiting for mmu notifier completion\n");
+			}
+		} else {
+			/* The MR is being killed, kill the QP as well. */
+			ret = -EFAULT;
+		}
+	}
 	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
 	pfault->mpfault.bytes_committed = 0;
 	return ret ? ret : npages;
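
When the fault path gives up with -EAGAIN because an invalidation is in flight, the hunk above waits for the notifier to signal completion, bounded by MMU_NOTIFIER_TIMEOUT, and only converts the error to -EFAULT (killing the QP) when the umem is being torn down. Below is a rough userspace model of that bounded wait, using pthreads in place of the kernel's completion API; it is not driver code and every name is a stand-in. Compile with "cc -pthread".

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Userspace stand-in for the kernel's struct completion. */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void init_completion(struct completion *c)
{
	pthread_mutex_init(&c->lock, NULL);
	pthread_cond_init(&c->cond, NULL);
	c->done = 0;
}

static void complete_all(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_broadcast(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

/* Stand-in for wait_for_completion_timeout(): 1 if completed, 0 on timeout. */
static int wait_completion_timeout_ms(struct completion *c, int timeout_ms)
{
	struct timespec ts;
	int done;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += timeout_ms / 1000;
	ts.tv_nsec += (long)(timeout_ms % 1000) * 1000000L;
	if (ts.tv_nsec >= 1000000000L) {
		ts.tv_sec++;
		ts.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&c->lock);
	while (!c->done) {
		if (pthread_cond_timedwait(&c->cond, &c->lock, &ts) == ETIMEDOUT)
			break;
	}
	done = c->done;
	pthread_mutex_unlock(&c->lock);
	return done;
}

/* "Invalidation" thread: signals completion after pretending to work 100 ms. */
static void *notifier_thread(void *arg)
{
	usleep(100 * 1000);
	complete_all(arg);
	return NULL;
}

int main(void)
{
	struct completion notifier_completion;
	pthread_t t;

	init_completion(&notifier_completion);
	pthread_create(&t, NULL, notifier_thread, &notifier_completion);

	/* Fault side after -EAGAIN: wait for the invalidation to finish
	 * before refaulting, but never longer than the 1000 ms budget. */
	if (wait_completion_timeout_ms(&notifier_completion, 1000))
		printf("notifier finished, refault\n");
	else
		printf("timeout waiting for mmu notifier completion\n");

	pthread_join(t, NULL);
	return 0;
}

Waiting before the refault avoids a busy loop of faults that would keep losing the race against a long-running invalidation, while the timeout keeps a stuck notifier from blocking the page-fault workqueue forever.
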