This is an RFC for a hacky idea I had to at least move the conversation forward.
The branch with this in it is: https://github.com/airlied/linux/tree/ttm-bounce
it won't apply to any other tree as it's based on all those patches I posted and some other refactorings.
The basic idea is if the driver gets a move request from the TTM core that requires it to bounce the buffer through another domain, it returns -EMULTIHOP and puts the domain details into the mem_type, the core code then just does the mem space for the new temp placement, and retries the final placement.
I've tested on nouveau that the code gets executed (a printk prints at least), and it all doesn't burn down, but it's very lightly tested.
It does allow getting rid of a lot of driver code to handle bouncing moves.
I'm sure this could be prettier or done in a very different way more effectively, but hey this was my chance to misuse an errno value.
Dave.
From: Dave Airlie airlied@redhat.com
--- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 128 +++--------------------- drivers/gpu/drm/nouveau/nouveau_bo.c | 106 ++------------------ drivers/gpu/drm/radeon/radeon_ttm.c | 110 ++------------------ drivers/gpu/drm/ttm/ttm_bo.c | 44 +++++++- 4 files changed, 77 insertions(+), 311 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 872795affe87..51d55e3fd8c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -515,109 +515,6 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo, return r; }
-static int amdgpu_move_setup_tt(struct ttm_buffer_object *bo, - struct ttm_operation_ctx *ctx, - struct ttm_resource *tmp_mem) -{ - struct ttm_place placements; - struct ttm_placement placement; - int r; - - placement.num_placement = 1; - placement.placement = &placements; - placement.num_busy_placement = 1; - placement.busy_placement = &placements; - placements.fpfn = 0; - placements.lpfn = 0; - placements.mem_type = TTM_PL_TT; - placements.flags = 0; - r = ttm_bo_mem_space(bo, &placement, tmp_mem, ctx); - if (unlikely(r)) { - pr_err("Failed to find GTT space for blit from VRAM\n"); - return r; - } - - r = ttm_tt_populate(bo->bdev, bo->ttm, ctx); - if (unlikely(r)) - goto out_cleanup; - - /* Bind the memory to the GTT space */ - r = amdgpu_ttm_backend_bind(bo->bdev, bo->ttm, tmp_mem); - if (unlikely(r)) { - goto out_cleanup; - } - return 0; -out_cleanup: - ttm_resource_free(bo, tmp_mem); - return r; -} -/** - * amdgpu_move_vram_ram - Copy VRAM buffer to RAM buffer - * - * Called by amdgpu_bo_move(). - */ -static int amdgpu_move_vram_ram(struct ttm_buffer_object *bo, bool evict, - struct ttm_operation_ctx *ctx, - struct ttm_resource *new_mem) -{ - struct ttm_resource *old_mem = &bo->mem; - struct ttm_resource tmp_mem; - int r; - - /* create space/pages for new_mem in GTT space */ - tmp_mem = *new_mem; - tmp_mem.mm_node = NULL; - - r = amdgpu_move_setup_tt(bo, ctx, &tmp_mem); - if (unlikely(r)) - return r; - - /* blit VRAM to GTT */ - r = amdgpu_move_blit(bo, evict, &tmp_mem, old_mem); - if (unlikely(r)) { - goto out_cleanup; - } - - r = ttm_bo_wait_ctx(bo, ctx); - if (unlikely(r)) - goto out_cleanup; - - amdgpu_ttm_backend_unbind(bo->bdev, bo->ttm); - ttm_resource_free(bo, &bo->mem); - ttm_bo_assign_mem(bo, new_mem); -out_cleanup: - ttm_resource_free(bo, &tmp_mem); - return r; -} - -/** - * amdgpu_move_ram_vram - Copy buffer from RAM to VRAM - * - * Called by amdgpu_bo_move(). 
- */ -static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict, - struct ttm_operation_ctx *ctx, - struct ttm_resource *new_mem) -{ - struct ttm_resource *old_mem = &bo->mem; - struct ttm_resource tmp_mem; - int r; - - /* make space in GTT for old_mem buffer */ - tmp_mem = *new_mem; - tmp_mem.mm_node = NULL; - - r = amdgpu_move_setup_tt(bo, ctx, &tmp_mem); - if (unlikely(r)) - return r; - - ttm_bo_assign_mem(bo, &tmp_mem); - /* copy to VRAM */ - r = amdgpu_move_blit(bo, evict, new_mem, old_mem); - ttm_resource_free(bo, &tmp_mem); - return r; -} - /** * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy * @@ -656,6 +553,19 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, struct ttm_resource *old_mem = &bo->mem; int r;
+ /* don't go from system->vram in one hop, + report this back to the caller tell it + where to bounce this buffer through. */ + + if ((old_mem->mem_type == TTM_PL_SYSTEM && + new_mem->mem_type == TTM_PL_VRAM) || + (old_mem->mem_type == TTM_PL_VRAM && + new_mem->mem_type == TTM_PL_SYSTEM)) { + new_mem->mem_type = TTM_PL_TT; + new_mem->placement = 0; + return -EMULTIHOP; + } + if (new_mem->mem_type == TTM_PL_TT) { r = amdgpu_ttm_backend_bind(bo->bdev, bo->ttm, new_mem); if (r) @@ -709,16 +619,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict, goto memcpy; }
- if (old_mem->mem_type == TTM_PL_VRAM && - new_mem->mem_type == TTM_PL_SYSTEM) { - r = amdgpu_move_vram_ram(bo, evict, ctx, new_mem); - } else if (old_mem->mem_type == TTM_PL_SYSTEM && - new_mem->mem_type == TTM_PL_VRAM) { - r = amdgpu_move_ram_vram(bo, evict, ctx, new_mem); - } else { - r = amdgpu_move_blit(bo, evict, - new_mem, old_mem); - } + r = amdgpu_move_blit(bo, evict, + new_mem, old_mem);
if (r) { memcpy: diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 038974d9a0d6..c08b3da804fb 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -862,92 +862,6 @@ nouveau_bo_move_init(struct nouveau_drm *drm) NV_INFO(drm, "MM: using %s for buffer copies\n", name); }
-static int -nouveau_move_setup_tt(struct ttm_buffer_object *bo, - struct ttm_operation_ctx *ctx, - struct ttm_resource *tmp_reg) -{ - struct ttm_place placement_memtype = { - .fpfn = 0, - .lpfn = 0, - .mem_type = TTM_PL_TT, - .flags = 0 - }; - struct ttm_placement placement; - int ret; - - placement.num_placement = placement.num_busy_placement = 1; - placement.placement = placement.busy_placement = &placement_memtype; - - ret = ttm_bo_mem_space(bo, &placement, tmp_reg, ctx); - if (ret) - return ret; - - ret = ttm_tt_populate(bo->bdev, bo->ttm, ctx); - if (ret) - goto out_cleanup; - - ret = nouveau_ttm_tt_bind(bo->bdev, bo->ttm, tmp_reg); - if (ret) - goto out_cleanup; - return 0; -out_cleanup: - ttm_resource_free(bo, tmp_reg); - return ret; -} - -static int -nouveau_bo_move_flipd(struct ttm_buffer_object *bo, bool evict, - struct ttm_operation_ctx *ctx, - struct ttm_resource *new_reg) -{ - struct ttm_resource tmp_reg; - int ret; - - tmp_reg = *new_reg; - tmp_reg.mm_node = NULL; - - ret = nouveau_move_setup_tt(bo, ctx, &tmp_reg); - if (ret) - return ret; - - ret = nouveau_bo_move_m2mf(bo, true, ctx, &tmp_reg); - if (ret) - goto out; - - ret = ttm_bo_wait_ctx(bo, ctx); - if (ret) - goto out; - - nouveau_ttm_tt_unbind(bo->bdev, bo->ttm); - ttm_resource_free(bo, &bo->mem); - ttm_bo_assign_mem(bo, &tmp_reg); -out: - ttm_resource_free(bo, &tmp_reg); - return ret; -} - -static int -nouveau_bo_move_flips(struct ttm_buffer_object *bo, bool evict, - struct ttm_operation_ctx *ctx, - struct ttm_resource *new_reg) -{ - struct ttm_resource tmp_reg; - int ret; - - tmp_reg = *new_reg; - tmp_reg.mm_node = NULL; - - ret = nouveau_move_setup_tt(bo, ctx, &tmp_reg); - if (ret) - return ret; - - ttm_bo_assign_mem(bo, &tmp_reg); - ret = nouveau_bo_move_m2mf(bo, true, ctx, new_reg); - ttm_resource_free(bo, &tmp_reg); - return ret; -} - static void nouveau_bo_move_ntfy(struct ttm_buffer_object *bo, bool evict, struct ttm_resource *new_reg) @@ -1028,6 +942,15 @@ nouveau_bo_move(struct 
ttm_buffer_object *bo, bool evict, struct nouveau_drm_tile *new_tile = NULL; int ret = 0;
+ if ((old_reg->mem_type == TTM_PL_SYSTEM && + new_reg->mem_type == TTM_PL_VRAM) || + (old_reg->mem_type == TTM_PL_VRAM && + new_reg->mem_type == TTM_PL_SYSTEM)) { + new_reg->mem_type = TTM_PL_TT; + new_reg->placement = 0; + return -EMULTIHOP; + } + if (new_reg->mem_type == TTM_PL_TT) { ret = nouveau_ttm_tt_bind(bo->bdev, bo->ttm, new_reg); if (ret) @@ -1070,15 +993,8 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict,
/* Hardware assisted copy. */ if (drm->ttm.move) { - if (new_reg->mem_type == TTM_PL_SYSTEM) - ret = nouveau_bo_move_flipd(bo, evict, ctx, - new_reg); - else if (old_reg->mem_type == TTM_PL_SYSTEM) - ret = nouveau_bo_move_flips(bo, evict, ctx, - new_reg); - else - ret = nouveau_bo_move_m2mf(bo, evict, ctx, - new_reg); + ret = nouveau_bo_move_m2mf(bo, evict, ctx, + new_reg); if (!ret) goto out; } diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index efe5046e3268..d36321b699e0 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -207,94 +207,6 @@ static int radeon_move_blit(struct ttm_buffer_object *bo, return r; }
-static int radeon_move_setup_tt(struct ttm_buffer_object *bo, - struct ttm_operation_ctx *ctx, - struct ttm_resource *tmp_mem) -{ - struct ttm_place placements; - struct ttm_placement placement; - int r; - - placement.num_placement = 1; - placement.placement = &placements; - placement.num_busy_placement = 1; - placement.busy_placement = &placements; - placements.fpfn = 0; - placements.lpfn = 0; - placements.mem_type = TTM_PL_TT; - placements.flags = 0; - - r = ttm_bo_mem_space(bo, &placement, tmp_mem, ctx); - if (unlikely(r)) - return r; - - r = ttm_tt_populate(bo->bdev, bo->ttm, ctx); - if (unlikely(r)) - goto out_cleanup; - - r = radeon_ttm_tt_bind(bo->bdev, bo->ttm, tmp_mem); - if (unlikely(r)) - goto out_cleanup; - return 0; -out_cleanup: - ttm_resource_free(bo, tmp_mem); - return r; -} - -static int radeon_move_vram_ram(struct ttm_buffer_object *bo, - bool evict, - struct ttm_operation_ctx *ctx, - struct ttm_resource *new_mem) -{ - struct ttm_resource *old_mem = &bo->mem; - struct ttm_resource tmp_mem; - int r; - - tmp_mem = *new_mem; - tmp_mem.mm_node = NULL; - - r = radeon_move_setup_tt(bo, ctx, &tmp_mem); - if (unlikely(r)) - return r; - - r = radeon_move_blit(bo, true, &tmp_mem, old_mem); - if (unlikely(r)) { - goto out_cleanup; - } - r = ttm_bo_wait_ctx(bo, ctx); - if (unlikely(r)) - goto out_cleanup; - - radeon_ttm_tt_unbind(bo->bdev, bo->ttm); - ttm_resource_free(bo, &bo->mem); - ttm_bo_assign_mem(bo, new_mem); -out_cleanup: - ttm_resource_free(bo, &tmp_mem); - return r; -} - -static int radeon_move_ram_vram(struct ttm_buffer_object *bo, - bool evict, - struct ttm_operation_ctx *ctx, - struct ttm_resource *new_mem) -{ - struct ttm_resource *old_mem = &bo->mem; - struct ttm_resource tmp_mem; - int r; - - tmp_mem = *new_mem; - tmp_mem.mm_node = NULL; - - r = radeon_move_setup_tt(bo, ctx, &tmp_mem); - if (unlikely(r)) - return r; - - ttm_bo_assign_mem(bo, &tmp_mem); - r = radeon_move_blit(bo, true, new_mem, old_mem); - ttm_resource_free(bo, &tmp_mem); - 
return r; -} - static int radeon_bo_move(struct ttm_buffer_object *bo, bool evict, struct ttm_operation_ctx *ctx, struct ttm_resource *new_mem) @@ -304,6 +216,15 @@ static int radeon_bo_move(struct ttm_buffer_object *bo, bool evict, struct ttm_resource *old_mem = &bo->mem; int r;
+ if ((old_mem->mem_type == TTM_PL_SYSTEM && + new_mem->mem_type == TTM_PL_VRAM) || + (old_mem->mem_type == TTM_PL_VRAM && + new_mem->mem_type == TTM_PL_SYSTEM)) { + new_mem->mem_type = TTM_PL_TT; + new_mem->placement = 0; + return -EMULTIHOP; + } + if (new_mem->mem_type == TTM_PL_TT) { r = radeon_ttm_tt_bind(bo->bdev, bo->ttm, new_mem); if (r) @@ -344,17 +265,8 @@ static int radeon_bo_move(struct ttm_buffer_object *bo, bool evict, goto memcpy; }
- if (old_mem->mem_type == TTM_PL_VRAM && - new_mem->mem_type == TTM_PL_SYSTEM) { - r = radeon_move_vram_ram(bo, evict, ctx, new_mem); - } else if (old_mem->mem_type == TTM_PL_SYSTEM && - new_mem->mem_type == TTM_PL_VRAM) { - r = radeon_move_ram_vram(bo, evict, ctx, new_mem); - } else { - r = radeon_move_blit(bo, evict, - new_mem, old_mem); - } - + r = radeon_move_blit(bo, evict, + new_mem, old_mem); if (r) { memcpy: r = ttm_bo_move_memcpy(bo, ctx, new_mem); diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 9b2be4e44bea..f4a6d5f29be8 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -260,8 +260,11 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo, }
ret = bdev->driver->move(bo, evict, ctx, mem); - if (ret) + if (ret) { + if (ret == -EMULTIHOP) + return ret; goto out_err; + }
ctx->bytes_moved += bo->mem.num_pages << PAGE_SHIFT; return 0; @@ -936,6 +939,33 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo, } EXPORT_SYMBOL(ttm_bo_mem_space);
+static int ttm_bo_bounce_temp_buffer(struct ttm_buffer_object *bo, + struct ttm_resource *mem, + struct ttm_operation_ctx *ctx) +{ + struct ttm_place placement_memtype = { + .fpfn = 0, + .lpfn = 0, + .mem_type = mem->mem_type, + .flags = mem->placement, + }; + struct ttm_placement bounce_placement; + int ret; + + bounce_placement.num_placement = bounce_placement.num_busy_placement = 1; + bounce_placement.placement = bounce_placement.busy_placement = &placement_memtype; + + /* find space in the bounce domain */ + ret = ttm_bo_mem_space(bo, &bounce_placement, mem, ctx); + if (ret) + return ret; + /* move to the bounce domain */ + ret = ttm_bo_handle_move_mem(bo, mem, false, ctx); + if (ret) + return ret; + return 0; +} + static int ttm_bo_move_buffer(struct ttm_buffer_object *bo, struct ttm_placement *placement, struct ttm_operation_ctx *ctx) @@ -954,11 +984,18 @@ static int ttm_bo_move_buffer(struct ttm_buffer_object *bo, /* * Determine where to move the buffer. */ +bounce: ret = ttm_bo_mem_space(bo, placement, &mem, ctx); if (ret) - goto out_unlock; + return ret; ret = ttm_bo_handle_move_mem(bo, &mem, false, ctx); -out_unlock: + if (ret == -EMULTIHOP) { + ret = ttm_bo_bounce_temp_buffer(bo, &mem, ctx); + if (ret) + return ret; + /* try and move to final place now. */ + goto bounce; + } if (ret) ttm_resource_free(bo, &mem); return ret; @@ -1478,4 +1515,3 @@ void ttm_bo_tt_destroy(struct ttm_buffer_object *bo) ttm_tt_destroy(bo->bdev, bo->ttm); bo->ttm = NULL; } -
Yes, please! That approach looks even better than what I had in mind.
A few comments below:
Am 20.10.20 um 06:16 schrieb Dave Airlie:
From: Dave Airlie airlied@redhat.com
[SNIP]
- /* don't go from system->vram in one hop,
report this back to the caller tell it
where to bounce this buffer through. */
- if ((old_mem->mem_type == TTM_PL_SYSTEM &&
new_mem->mem_type == TTM_PL_VRAM) ||
(old_mem->mem_type == TTM_PL_VRAM &&
new_mem->mem_type == TTM_PL_SYSTEM)) {
new_mem->mem_type = TTM_PL_TT;
Not sure if that is such a good idea, new_mem can be some allocated memory in the VRAM domain at this moment.
Maybe instead give the callback a separate bounce argument instead.
new_mem->placement = 0;
return -EMULTIHOP;
Using EMULTIHOP here is a really nice idea.
[SNIP]
+static int ttm_bo_bounce_temp_buffer(struct ttm_buffer_object *bo,
struct ttm_resource *mem,
struct ttm_operation_ctx *ctx)
+{
- struct ttm_place placement_memtype = {
.fpfn = 0,
.lpfn = 0,
.mem_type = mem->mem_type,
.flags = mem->placement,
- };
- struct ttm_placement bounce_placement;
- int ret;
- bounce_placement.num_placement = bounce_placement.num_busy_placement = 1;
- bounce_placement.placement = bounce_placement.busy_placement = &placement_memtype;
- /* find space in the bounce domain */
- ret = ttm_bo_mem_space(bo, &bounce_placement, mem, ctx);
- if (ret)
return ret;
- /* move to the bounce domain */
- ret = ttm_bo_handle_move_mem(bo, mem, false, ctx);
Is this a recursion? I don't think it is, but I thought I better double check.
Regards, Christian.
- if (ret)
return ret;
- return 0;
+}
- static int ttm_bo_move_buffer(struct ttm_buffer_object *bo, struct ttm_placement *placement, struct ttm_operation_ctx *ctx)
@@ -954,11 +984,18 @@ static int ttm_bo_move_buffer(struct ttm_buffer_object *bo, /* * Determine where to move the buffer. */ +bounce: ret = ttm_bo_mem_space(bo, placement, &mem, ctx); if (ret)
goto out_unlock;
ret = ttm_bo_handle_move_mem(bo, &mem, false, ctx);return ret;
-out_unlock:
- if (ret == -EMULTIHOP) {
ret = ttm_bo_bounce_temp_buffer(bo, &mem, ctx);
if (ret)
return ret;
/* try and move to final place now. */
goto bounce;
- } if (ret) ttm_resource_free(bo, &mem); return ret;
@@ -1478,4 +1515,3 @@ void ttm_bo_tt_destroy(struct ttm_buffer_object *bo) ttm_tt_destroy(bo->bdev, bo->ttm); bo->ttm = NULL; }
On Tue, 20 Oct 2020 at 18:31, Christian König christian.koenig@amd.com wrote:
Yes, please! That approach looks even better than what I had in mind.
A few comments below:
Am 20.10.20 um 06:16 schrieb Dave Airlie:
From: Dave Airlie airlied@redhat.com
[SNIP]
/* don't go from system->vram in one hop,
report this back to the caller tell it
where to bounce this buffer through. */
if ((old_mem->mem_type == TTM_PL_SYSTEM &&
new_mem->mem_type == TTM_PL_VRAM) ||
(old_mem->mem_type == TTM_PL_VRAM &&
new_mem->mem_type == TTM_PL_SYSTEM)) {
new_mem->mem_type = TTM_PL_TT;
Not sure if that is such a good idea, new_mem can be some allocated memory in the VRAM domain at this moment.
Maybe instead give the callback a separate bounce argument instead.
I've changed this in the latest posting to take a ttm_place that the driver fills out if it needs a hop.
/* find space in the bounce domain */
ret = ttm_bo_mem_space(bo, &bounce_placement, mem, ctx);
if (ret)
return ret;
/* move to the bounce domain */
ret = ttm_bo_handle_move_mem(bo, mem, false, ctx);
Is this a recursion? I don't think it is, but I thought I better double check.
No it should never recurse because the driver should only return -EMULTIHOP once; even if a theoretical driver were to need more than one hop it should work, as long as it eventually returns something other than -EMULTIHOP. Of course a broken driver could cause bad things, but that is always true.
Dave.
dri-devel@lists.freedesktop.org