On Mon, 2021-07-19 at 13:51 +0200, Christian König wrote:
This way we finally fix the problem that new resources are not immediately evictable after allocation.
That has caused numerous problems, including OOM on GDS handling and not being able to use TTM as a general resource manager.
Signed-off-by: Christian König <christian.koenig@amd.com>
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |   8 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c |   2 +-
 drivers/gpu/drm/ttm/ttm_bo.c            | 101 ++-----------------
 drivers/gpu/drm/ttm/ttm_bo_util.c       |   1 -
 drivers/gpu/drm/ttm/ttm_device.c        |   4 +-
 drivers/gpu/drm/ttm/ttm_resource.c      | 127 ++++++++++++++++++++++++
 include/drm/ttm/ttm_bo_api.h            |  16 ---
 include/drm/ttm/ttm_bo_driver.h         |  29 +-----
 include/drm/ttm/ttm_resource.h          |  35 +++++++
 9 files changed, 177 insertions(+), 146 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 18246b5b6ee3..4b178a74b4e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -643,12 +643,12 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
 
 	if (vm->bulk_moveable) {
 		spin_lock(&adev->mman.bdev.lru_lock);
-		ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
+		ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
 		spin_unlock(&adev->mman.bdev.lru_lock);
 		return;
 	}
 
-	memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
+	ttm_lru_bulk_move_init(&vm->lru_bulk_move);
 
 	spin_lock(&adev->mman.bdev.lru_lock);
 	list_for_each_entry(bo_base, &vm->idle, vm_status) {
@@ -658,11 +658,9 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
 		if (!bo->parent)
 			continue;
 
-		ttm_bo_move_to_lru_tail(&bo->tbo, bo->tbo.resource,
-					&vm->lru_bulk_move);
+		ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
 		if (shadow)
 			ttm_bo_move_to_lru_tail(&shadow->tbo,
-						shadow->tbo.resource,
 						&vm->lru_bulk_move);
 	}
 	spin_unlock(&adev->mman.bdev.lru_lock);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index bf33724bed5c..b38eef37f1c8 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -472,7 +472,7 @@ static void i915_ttm_adjust_lru(struct drm_i915_gem_object *obj)
 		bo->priority = I915_TTM_PRIO_NO_PAGES;
 	}
 
-	ttm_bo_move_to_lru_tail(bo, bo->resource, NULL);
+	ttm_bo_move_to_lru_tail(bo, NULL);
 	spin_unlock(&bo->bdev->lru_lock);
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 5a2dc712c632..09a62ad06b9d 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -69,95 +69,15 @@ static void ttm_bo_mem_space_debug(struct ttm_buffer_object *bo,
 	}
 }
 
-static void ttm_bo_del_from_lru(struct ttm_buffer_object *bo)
-{
-	struct ttm_device *bdev = bo->bdev;
-
-	list_del_init(&bo->lru);
-
-	if (bdev->funcs->del_from_lru_notify)
-		bdev->funcs->del_from_lru_notify(bo);
-}
-
-static void ttm_bo_bulk_move_set_pos(struct ttm_lru_bulk_move_pos *pos,
-				     struct ttm_buffer_object *bo)
-{
-	if (!pos->first)
-		pos->first = bo;
-	pos->last = bo;
-}
-
 void ttm_bo_move_to_lru_tail(struct ttm_buffer_object *bo,
-			     struct ttm_resource *mem,
 			     struct ttm_lru_bulk_move *bulk)
 {
-	struct ttm_device *bdev = bo->bdev;
-	struct ttm_resource_manager *man;
-
-	if (!bo->deleted)
-		dma_resv_assert_held(bo->base.resv);
-
-	if (bo->pin_count) {
-		ttm_bo_del_from_lru(bo);
-		return;
-	}
-
-	man = ttm_manager_type(bdev, mem->mem_type);
-	list_move_tail(&bo->lru, &man->lru[bo->priority]);
-
-	if (bdev->funcs->del_from_lru_notify)
-		bdev->funcs->del_from_lru_notify(bo);
-
-	if (bulk && !bo->pin_count) {
-		switch (bo->resource->mem_type) {
-		case TTM_PL_TT:
-			ttm_bo_bulk_move_set_pos(&bulk->tt[bo->priority], bo);
-			break;
+	dma_resv_assert_held(bo->base.resv);
 
-		case TTM_PL_VRAM:
-			ttm_bo_bulk_move_set_pos(&bulk->vram[bo->priority], bo);
-			break;
-		}
-	}
+	ttm_resource_move_to_lru_tail(bo->resource, bulk);
 }
 EXPORT_SYMBOL(ttm_bo_move_to_lru_tail);
 
-void ttm_bo_bulk_move_lru_tail(struct ttm_lru_bulk_move *bulk)
-{
-	unsigned i;
-
-	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
-		struct ttm_lru_bulk_move_pos *pos = &bulk->tt[i];
-		struct ttm_resource_manager *man;
-
-		if (!pos->first)
-			continue;
-
-		dma_resv_assert_held(pos->first->base.resv);
-		dma_resv_assert_held(pos->last->base.resv);
-
-		man = ttm_manager_type(pos->first->bdev, TTM_PL_TT);
-		list_bulk_move_tail(&man->lru[i], &pos->first->lru,
-				    &pos->last->lru);
-	}
-
-	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
-		struct ttm_lru_bulk_move_pos *pos = &bulk->vram[i];
-		struct ttm_resource_manager *man;
-
-		if (!pos->first)
-			continue;
-
-		dma_resv_assert_held(pos->first->base.resv);
-		dma_resv_assert_held(pos->last->base.resv);
-
-		man = ttm_manager_type(pos->first->bdev, TTM_PL_VRAM);
-		list_bulk_move_tail(&man->lru[i], &pos->first->lru,
-				    &pos->last->lru);
-	}
-}
-EXPORT_SYMBOL(ttm_bo_bulk_move_lru_tail);
-
 static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
 				  struct ttm_resource *mem, bool evict,
 				  struct ttm_operation_ctx *ctx,
@@ -339,7 +259,6 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo,
 		return ret;
 	}
 
-	ttm_bo_del_from_lru(bo);
 	list_del_init(&bo->ddestroy);
 	spin_unlock(&bo->bdev->lru_lock);
 	ttm_bo_cleanup_memtype_use(bo);
@@ -440,7 +359,7 @@ static void ttm_bo_release(struct kref *kref)
 		 */
 		if (bo->pin_count) {
 			bo->pin_count = 0;
-			ttm_bo_move_to_lru_tail(bo, bo->resource, NULL);
+			ttm_resource_move_to_lru_tail(bo->resource, NULL);
 		}
 
 		kref_init(&bo->kref);
@@ -453,7 +372,6 @@ static void ttm_bo_release(struct kref *kref)
 	}
 
 	spin_lock(&bo->bdev->lru_lock);
-	ttm_bo_del_from_lru(bo);
 	list_del(&bo->ddestroy);
 	spin_unlock(&bo->bdev->lru_lock);
 
@@ -667,15 +585,17 @@ int ttm_mem_evict_first(struct ttm_device *bdev,
 			struct ww_acquire_ctx *ticket)
 {
 	struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
+	struct ttm_resource *res;
 	bool locked = false;
 	unsigned i;
 	int ret;
 
 	spin_lock(&bdev->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
-		list_for_each_entry(bo, &man->lru[i], lru) {
+		list_for_each_entry(res, &man->lru[i], lru) {
 			bool busy;
 
+			bo = res->bo;
Follow-up to the previous review: what happens here if someone now reassigns @res->bo and then kills @bo? At least it's not immediately clear what protects against that. Isn't a kref_get_unless_zero() on the bo needed here, with res->bo being assigned (and properly cleared on bo destruction) under the lru_lock where needed?
Admittedly, as you pointed out earlier, we can't kref_put() the bo under the lru lock, but (if all else fails) one could perhaps defer the put to a worker, or move the bo to the LRU tail and drop the lru lock iff kref_put() may hit a zero refcount.
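
Very roughly, and purely as an untested sketch of what I mean (it just reuses the names from the eviction loop in the hunk above and leaves out the busy/trylock handling, so don't read it as a finished proposal):

	spin_lock(&bdev->lru_lock);
	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
		list_for_each_entry(res, &man->lru[i], lru) {
			bool busy;

			/*
			 * res->bo could be about to be reassigned or
			 * destroyed, so only proceed if we can still grab a
			 * reference. This assumes res->bo is assigned and
			 * cleared under the lru_lock.
			 */
			bo = res->bo;
			if (!bo || !kref_get_unless_zero(&bo->kref))
				continue;

			/* ... evaluate and try-lock the bo as before ... */

			/*
			 * The awkward part: kref_put() can't be called with
			 * the lru_lock held, so the reference would either
			 * have to be dropped after unlocking or handed off
			 * to a worker when it might be the last one.
			 */
		}
	}
	spin_unlock(&bdev->lru_lock);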
/Thomas