On 12/9/2020 12:39 AM, Jianxin Xiong wrote:
Implement the new driver method 'reg_user_mr_dmabuf'. Utilize the core functions to import dma-buf based memory regions and update the mappings.
Add code to handle dma-buf related page faults.
Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
 drivers/infiniband/hw/mlx5/main.c    |   2 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  18 +++++
 drivers/infiniband/hw/mlx5/mr.c      | 128 +++++++++++++++++++++++++++++++++--
 drivers/infiniband/hw/mlx5/odp.c     |  86 +++++++++++++++++++++--
 4 files changed, 225 insertions(+), 9 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 4a054eb..c025746 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
  * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  */
 
 #include <linux/debugfs.h>
@@ -4069,6 +4070,7 @@ static int mlx5_ib_enable_driver(struct ib_device *dev)
 	.query_srq = mlx5_ib_query_srq,
 	.query_ucontext = mlx5_ib_query_ucontext,
 	.reg_user_mr = mlx5_ib_reg_user_mr,
+	.reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf,
 	.req_notify_cq = mlx5_ib_arm_cq,
 	.rereg_user_mr = mlx5_ib_rereg_user_mr,
 	.resize_cq = mlx5_ib_resize_cq,
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 718e59f..6f4d1b4 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  */
 
 #ifndef MLX5_IB_H
@@ -704,6 +705,12 @@ static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
 	       mr->umem->is_odp;
 }
 
+static inline bool is_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
+	       mr->umem->is_dmabuf;
+}
+
Didn't we agree that IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) should be checked earlier, in mlx5_ib_reg_user_mr_dmabuf(), to fully prevent the functionality when ODP is not supported? See my note on the previous versions.
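Something like this at the top of mlx5_ib_reg_user_mr_dmabuf(), next to the existing CONFIG_INFINIBAND_USER_MEM check, is what I had in mind (untested sketch, only to illustrate):

	/* sketch: bail out early when ODP is compiled out, since the dma-buf
	 * MR relies on the ODP machinery for invalidation and page faults
	 */
	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);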
 struct mlx5_ib_mw {
 	struct ib_mw ibmw;
 	struct mlx5_core_mkey mmkey;
@@ -1239,6 +1246,10 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata);
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
+					  u64 length, u64 virt_addr,
+					  int fd, int access_flags,
+					  struct ib_udata *udata);
 int mlx5_ib_advise_mr(struct ib_pd *pd,
 		      enum ib_uverbs_advise_mr_advice advice,
 		      u32 flags,
@@ -1249,11 +1260,13 @@ int mlx5_ib_advise_mr(struct ib_pd *pd,
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags);
+int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 					     struct ib_udata *udata,
 					     int access_flags);
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
+void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
@@ -1341,6 +1354,7 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 			       enum ib_uverbs_advise_mr_advice advice,
 			       u32 flags, struct ib_sge *sg_list, u32 num_sge);
 int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable);
+int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -1366,6 +1380,10 @@ static inline int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
 {
 	return -EOPNOTSUPP;
 }
+static inline int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b6116f6..e3be1f5 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -36,6 +37,8 @@
 #include <linux/debugfs.h>
 #include <linux/export.h>
 #include <linux/delay.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 #include <rdma/ib_verbs.h>
@@ -957,6 +960,16 @@ static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
 	return &cache->ent[order];
 }
 
+static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
+						   u64 iova)
+{
+	if ((iova ^ umem->address) & (PAGE_SIZE - 1))
+		return 0;
Also here, see my note from the previous versions: can user space trigger this by supplying some invalid input? If so, this may cause a WARN_ON() in the caller (i.e. alloc_mr_from_cache()) that we should prevent from being triggered by a user space application.
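For example, the dma-buf branch in alloc_mr_from_cache() could fail cleanly before reaching the WARN_ON() (rough, untested sketch only):

	if (umem->is_dmabuf) {
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
		/* a misaligned offset/iova coming from user space lands here;
		 * return an error instead of tripping the WARN_ON() below
		 */
		if (!page_size)
			return ERR_PTR(-EINVAL);
	} else {
		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
						     0, iova);
	}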
+
+	umem->iova = iova;
+	return PAGE_SIZE;
+}
+
 static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
 					      struct ib_umem *umem, u64 iova,
 					      int access_flags)
@@ -966,7 +979,11 @@ static struct mlx5_ib_mr *alloc_mr_from_cache(struct ib_pd *pd,
 	struct mlx5_ib_mr *mr;
 	unsigned int page_size;
 
-	page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
+	if (umem->is_dmabuf)
+		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
+	else
+		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
+						     0, iova);
 	if (WARN_ON(!page_size))
 		return ERR_PTR(-EINVAL);
 	ent = mr_cache_ent_from_order(
@@ -1212,8 +1229,10 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 
 /*
  * Send the DMA list to the HW for a normal MR using UMR.
+ * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
+ * flag may be used.
  */
-static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
 {
 	struct mlx5_ib_dev *dev = mr->dev;
 	struct device *ddev = &dev->mdev->pdev->dev;
@@ -1255,6 +1274,10 @@ static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
 		cur_mtt->ptag =
 			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
 				    MLX5_IB_MTT_PRESENT);
+
+		if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
+			cur_mtt->ptag = 0;
+
 		cur_mtt++;
 	}
@@ -1291,8 +1314,11 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 	int err;
 	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
 
-	page_size =
-		mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
+	if (umem->is_dmabuf)
+		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
+	else
+		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
+						     0, iova);
 	if (WARN_ON(!page_size))
 		return ERR_PTR(-EINVAL);
@@ -1572,6 +1598,96 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	return ERR_PTR(err);
 }
 
+static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
+	struct mlx5_ib_mr *mr = umem_dmabuf->private;
+
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	if (!umem_dmabuf->sgt)
+		return;
+
+	mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
+	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+}
+
+static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
+	.allow_peer2peer = 1,
+	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
+};
+
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
+					  u64 length, u64 virt_addr,
+					  int fd, int access_flags,
+					  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr = NULL;
+	struct ib_umem *umem;
+	int err;
+
+	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	mlx5_ib_dbg(dev,
+		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
+		    offset, virt_addr, length, fd, access_flags);
+
+	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
+		return ERR_PTR(-EINVAL);
+
+	umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags,
+				  &mlx5_ib_dmabuf_attach_ops);
+	if (IS_ERR(umem)) {
+		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
+		return ERR_PTR(PTR_ERR(umem));
+	}
+
+	mr = alloc_mr_from_cache(pd, umem, virt_addr, access_flags);
+	if (IS_ERR(mr))
+		mr = NULL;
+
+	if (!mr) {
+		mutex_lock(&dev->slow_path_mutex);
+		mr = reg_create(NULL, pd, umem, virt_addr, access_flags,
+				false);
+		mutex_unlock(&dev->slow_path_mutex);
+	}
+
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto error;
+	}
+
+	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
+
+	mr->umem = umem;
+	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
+	set_mr_fields(dev, mr, length, access_flags);
+	to_ib_umem_dmabuf(umem)->private = mr;
+	init_waitqueue_head(&mr->q_deferred_work);
+	atomic_set(&mr->num_deferred_work, 0);
+	err = xa_err(xa_store(&dev->odp_mkeys,
+			      mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
+			      GFP_KERNEL));
+	if (err) {
+		dereg_mr(dev, mr);
+		return ERR_PTR(err);
+	}
+
+	err = mlx5_ib_init_dmabuf_mr(mr);
+	if (err) {
+		dereg_mr(dev, mr);
+		return ERR_PTR(err);
+	}
+	return &mr->ibmr;
+error:
+	ib_umem_release(umem);
+	return ERR_PTR(err);
+}
+
 /**
  * mlx5_mr_cache_invalidate - Fence all DMA on the MR
  * @mr: The MR to fence
@@ -1640,7 +1756,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	if (!mr->umem)
 		return -EINVAL;
 
-	if (is_odp_mr(mr))
+	if (is_odp_mr(mr) || is_dmabuf_mr(mr))
 		return -EOPNOTSUPP;
 
 	if (flags & IB_MR_REREG_TRANS) {
@@ -1804,6 +1920,8 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	/* Stop all DMA */
 	if (is_odp_mr(mr))
 		mlx5_ib_fence_odp_mr(mr);
+	else if (is_dmabuf_mr(mr))
+		mlx5_ib_fence_dmabuf_mr(mr);
 	else
 		clean_mr(dev, mr);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 5c853ec..35d6770 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -33,6 +33,8 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 #include <linux/kernel.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -664,6 +666,37 @@ void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
 	dma_fence_odp_mr(mr);
 }
 
+/**
+ * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR
+ * @mr: to fence
+ *
+ * On return no parallel threads will be touching this MR and no DMA will be
+ * active.
+ */
+void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+
+	/* Prevent new page faults and prefetch requests from succeeding */
+	xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+	/* Wait for all running page-fault handlers to finish. */
+	synchronize_srcu(&mr->dev->odp_srcu);
+
+	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	mlx5_mr_cache_invalidate(mr);
+	umem_dmabuf->private = NULL;
+	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	if (!mr->cache_ent) {
+		mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
+		WARN_ON(mr->descs);
+	}
+}
+
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 #define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
 #define MLX5_PF_FLAGS_ENABLE BIT(3)
@@ -797,6 +830,41 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
 	return ret;
 }
 
+static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
+			       u32 *bytes_mapped, u32 flags)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+	u32 xlt_flags = 0;
+	int err;
+	unsigned int page_size;
+
+	if (flags & MLX5_PF_FLAGS_ENABLE)
+		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+	if (!err) {
+		page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
+						     log_page_size, 0,
+						     umem_dmabuf->umem.iova);
+		if (unlikely(page_size < PAGE_SIZE)) {
+			ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+			err = -EINVAL;
+		} else {
+			err = mlx5_ib_update_mr_pas(mr, xlt_flags);
+		}
+	}
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	if (err)
+		return err;
+
+	if (bytes_mapped)
+		*bytes_mapped += bcnt;
+
+	return ib_umem_num_pages(mr->umem);
+}
+
 /*
  * Returns:
  *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
@@ -815,6 +883,9 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 	if (unlikely(io_virt < mr->mmkey.iova))
 		return -EFAULT;
 
+	if (mr->umem->is_dmabuf)
+		return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
+
 	if (!odp->is_implicit_odp) {
 		u64 user_va;
@@ -845,6 +916,16 @@ int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
 	return ret >= 0 ? 0 : ret;
 }
 
+int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	int ret;
+
+	ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
+				  MLX5_PF_FLAGS_ENABLE);
+
+	return ret >= 0 ? 0 : ret;
+}
+
 struct pf_frame {
 	struct pf_frame *next;
 	u32 key;
@@ -1747,7 +1828,6 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_core_mkey *mmkey;
-	struct ib_umem_odp *odp;
 	struct mlx5_ib_mr *mr;
 
 	lockdep_assert_held(&dev->odp_srcu);
@@ -1761,11 +1841,9 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
 	if (mr->ibmr.pd != pd)
 		return NULL;
 
-	odp = to_ib_umem_odp(mr->umem);
-
 	/* prefetch with write-access must be supported by the MR */
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
-	    !odp->umem.writable)
+	    !mr->umem->writable)
 		return NULL;
 
 	return mr;