On Mon, 2021-07-19 at 13:51 +0200, Christian König wrote:
This way we finally fix the problem that new resources are not immediately evictable after allocation.
That has caused numerous problems, including OOM on GDS handling and not being able to use TTM as a general resource manager.
Signed-off-by: Christian König <christian.koenig@amd.com>
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |   8 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c |   2 +-
 drivers/gpu/drm/ttm/ttm_bo.c            | 101 ++-----------------
 drivers/gpu/drm/ttm/ttm_bo_util.c       |   1 -
 drivers/gpu/drm/ttm/ttm_device.c        |   4 +-
 drivers/gpu/drm/ttm/ttm_resource.c      | 127 ++++++++++++++++++++++++
 include/drm/ttm/ttm_bo_api.h            |  16 ---
 include/drm/ttm/ttm_bo_driver.h         |  29 +-----
 include/drm/ttm/ttm_resource.h          |  35 +++++++
 9 files changed, 177 insertions(+), 146 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 18246b5b6ee3..4b178a74b4e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -643,12 +643,12 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
 
 	if (vm->bulk_moveable) {
 		spin_lock(&adev->mman.bdev.lru_lock);
-		ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
+		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
 		spin_unlock(&adev->mman.bdev.lru_lock);
 		return;
 	}
 
-	memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
+	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
 
 	spin_lock(&adev->mman.bdev.lru_lock);
 	list_for_each_entry(bo_base, &vm->idle, vm_status) {
@@ -658,11 +658,9 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
 		if (!bo->parent)
 			continue;
 
-		ttm_bo_move_to_lru_tail(&bo->tbo, bo->tbo.resource,
-					&vm->lru_bulk_move);
+		ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
 		if (shadow)
 			ttm_bo_move_to_lru_tail(&shadow->tbo,
-						shadow->tbo.resource,
 						&vm->lru_bulk_move);
 	}
 	spin_unlock(&adev->mman.bdev.lru_lock);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index bf33724bed5c..b38eef37f1c8 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -472,7 +472,7 @@ static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj)
 		bo->priority = I915_TTM_PRIO_NO_PAGES;
 	}
 
-	ttm_bo_move_to_lru_tail(bo, bo->resource, NULL);
+	ttm_bo_move_to_lru_tail(bo, NULL);
 	spin_unlock(&bo->bdev->lru_lock);
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 5a2dc712c632..09a62ad06b9d 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -69,95 +69,15 @@ static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
 	}
 }
 
-static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
-{
-	struct ttm_device *bdev = bo->bdev;
-
-	list_del_init(&bo->lru);
-
-	if (bdev->funcs->del_from_lru_notify)
-		bdev->funcs->del_from_lru_notify(bo);
-}
-
-static void ttm_bo_bulk_move_set_pos(struct ttm_lru_bulk_move_pos *pos,
-				     struct ttm_buffer_object *bo)
-{
-	if (!pos->first)
-		pos->first = bo;
-	pos->last = bo;
-}
-
 void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
-			     struct ttm_resource *mem,
 			     struct ttm_lru_bulk_move *bulk)
 {
-	struct ttm_device *bdev = bo->bdev;
-	struct ttm_resource_manager *man;
-
-	if (!bo->deleted)
-		dma_resv_assert_held(bo->base.resv);
-
-	if (bo->pin_count) {
-		ttm_bo_del_from_lru(bo);
-		return;
-	}
-
-	man = ttm_manager_type(bdev, mem->mem_type);
-	list_move_tail(&bo->lru, &man->lru[bo->priority]);
-
-	if (bdev->funcs->del_from_lru_notify)
-		bdev->funcs->del_from_lru_notify(bo);
-
-	if (bulk && !bo->pin_count) {
-		switch (bo->resource->mem_type) {
-		case TTM_PL_TT:
-			ttm_bo_bulk_move_set_pos(&bulk->tt[bo->priority], bo);
-			break;
+	dma_resv_assert_held(bo->base.resv);
 
-		case TTM_PL_VRAM:
-			ttm_bo_bulk_move_set_pos(&bulk->vram[bo->priority], bo);
-			break;
-		}
-	}
+	ttm_resource_move_to_lru_tail(bo->resource, bulk);
 }
 EXPORT_SYMBOL(ttm_bo_move_to_lru_tail);
 
-void ttm_bo_bulk_move_lru_tail(struct ttm_lru_bulk_move *bulk)
-{
-	unsigned i;
-
-	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
-		struct ttm_lru_bulk_move_pos *pos = &bulk->tt[i];
-		struct ttm_resource_manager *man;
-
-		if (!pos->first)
-			continue;
-
-		dma_resv_assert_held(pos->first->base.resv);
-		dma_resv_assert_held(pos->last->base.resv);
-
-		man = ttm_manager_type(pos->first->bdev, TTM_PL_TT);
-		list_bulk_move_tail(&man->lru[i], &pos->first->lru,
-				    &pos->last->lru);
-	}
-
-	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
-		struct ttm_lru_bulk_move_pos *pos = &bulk->vram[i];
-		struct ttm_resource_manager *man;
-
-		if (!pos->first)
-			continue;
-
-		dma_resv_assert_held(pos->first->base.resv);
-		dma_resv_assert_held(pos->last->base.resv);
-
-		man = ttm_manager_type(pos->first->bdev, TTM_PL_VRAM);
-		list_bulk_move_tail(&man->lru[i], &pos->first->lru,
-				    &pos->last->lru);
-	}
-}
-EXPORT_SYMBOL(ttm_bo_bulk_move_lru_tail);
-
 static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
 				  struct ttm_resource *mem, bool evict,
 				  struct ttm_operation_ctx *ctx,
@@ -339,7 +259,6 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
 		return ret;
 	}
 
-	ttm_bo_del_from_lru(bo);
 	list_del_init(&bo->ddestroy);
 	spin_unlock(&bo->bdev->lru_lock);
 	ttm_bo_cleanup_memtype_use(bo);
@@ -440,7 +359,7 @@ static void ttm_bo_release(struct kref *kref)
 		 */
 		if (bo->pin_count) {
 			bo->pin_count = 0;
-			ttm_bo_move_to_lru_tail(bo, bo->resource, NULL);
+			ttm_resource_move_to_lru_tail(bo->resource, NULL);
 		}
 
 		kref_init(&bo->kref);
@@ -453,7 +372,6 @@ static void ttm_bo_release(struct kref *kref)
 	}
 
 	spin_lock(&bo->bdev->lru_lock);
-	ttm_bo_del_from_lru(bo);
 	list_del(&bo->ddestroy);
 	spin_unlock(&bo->bdev->lru_lock);
 
@@ -667,15 +585,17 @@ int ttm_mem_evict_first(struct ttm_device *bdev,
 			struct ww_acquire_ctx *ticket)
 {
 	struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
+	struct ttm_resource *res;
 	bool locked = false;
 	unsigned i;
 	int ret;
 
 	spin_lock(&bdev->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
-		list_for_each_entry(bo, &man->lru[i], lru) {
+		list_for_each_entry(res, &man->lru[i], lru) {
 			bool busy;
 
+			bo = res->bo;
Follow-up to the previous review: what happens here if someone now reassigns @res->bo and then kills @bo? At least it's not immediately clear what protects against that. Isn't a kref_get_unless_zero() on the bo needed here, with res->bo being assigned (and properly cleared on bo destruction) under the lru_lock where needed?
Admittedly, as you pointed out earlier, we can't kref_put() the bo under the lru lock, but (if all else fails) one could perhaps defer the put to a worker, or move the bo to the LRU tail and drop the lru lock iff kref_put() may hit a zero refcount.
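
Very roughly, and purely as an untested sketch of what I mean (it just reuses the names from the eviction loop in the hunk above and leaves out the busy/trylock handling, so don't read it as a finished proposal):

	spin_lock(&bdev->lru_lock);
	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
		list_for_each_entry(res, &man->lru[i], lru) {
			bool busy;

			/*
			 * res->bo could be about to be reassigned or
			 * destroyed, so only proceed if we can still grab a
			 * reference. This assumes res->bo is assigned and
			 * cleared under the lru_lock.
			 */
			bo = res->bo;
			if (!bo || !kref_get_unless_zero(&bo->kref))
				continue;

			/* ... evaluate and try-lock the bo as before ... */

			/*
			 * The awkward part: kref_put() can't be called with
			 * the lru_lock held, so the reference would either
			 * have to be dropped after unlocking or handed off
			 * to a worker when it might be the last one.
			 */
		}
	}
	spin_unlock(&bdev->lru_lock);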
/Thomas