From: Christian König christian.koenig@amd.com
It isn't used, and not waiting for the GPU after scheduling a move is actually quite dangerous.
Signed-off-by: Christian König christian.koenig@amd.com
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +--
 drivers/gpu/drm/nouveau/nouveau_bo.c    | 1 -
 drivers/gpu/drm/radeon/radeon_ttm.c     | 3 +--
 drivers/gpu/drm/ttm/ttm_bo_util.c       | 1 -
 include/drm/ttm/ttm_bo_driver.h         | 4 +---
 5 files changed, 3 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 232f123..b2b9df6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -287,8 +287,7 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo, new_mem->num_pages * PAGE_SIZE, /* bytes */ bo->resv, &fence); /* FIXME: handle copy error */ - r = ttm_bo_move_accel_cleanup(bo, fence, - evict, no_wait_gpu, new_mem); + r = ttm_bo_move_accel_cleanup(bo, fence, evict, new_mem); fence_put(fence); return r; } diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index b61660b..49ae191 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1082,7 +1082,6 @@ nouveau_bo_move_m2mf(struct ttm_buffer_object *bo, int evict, bool intr, ret = ttm_bo_move_accel_cleanup(bo, &fence->base, evict, - no_wait_gpu, new_mem); nouveau_fence_unref(&fence); } diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 20ca22d..ffdad81 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -300,8 +300,7 @@ static int radeon_move_blit(struct ttm_buffer_object *bo, if (IS_ERR(fence)) return PTR_ERR(fence);
- r = ttm_bo_move_accel_cleanup(bo, &fence->base, - evict, no_wait_gpu, new_mem); + r = ttm_bo_move_accel_cleanup(bo, &fence->base, evict, new_mem); radeon_fence_unref(&fence); return r; } diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 434f239..c8fe554 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -637,7 +637,6 @@ EXPORT_SYMBOL(ttm_bo_kunmap); int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, struct fence *fence, bool evict, - bool no_wait_gpu, struct ttm_mem_reg *new_mem) { struct ttm_bo_device *bdev = bo->bdev; diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 0d1d9d7..697e5f9 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -1004,7 +1004,6 @@ extern void ttm_bo_free_old_node(struct ttm_buffer_object *bo); * @bo: A pointer to a struct ttm_buffer_object. * @fence: A fence object that signals when moving is complete. * @evict: This is an evict move. Don't return until the buffer is idle. - * @no_wait_gpu: Return immediately if the GPU is busy. * @new_mem: struct ttm_mem_reg indicating where to move. * * Accelerated move function to be called when an accelerated move @@ -1016,8 +1015,7 @@ extern void ttm_bo_free_old_node(struct ttm_buffer_object *bo); */
extern int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, - struct fence *fence, - bool evict, bool no_wait_gpu, + struct fence *fence, bool evict, struct ttm_mem_reg *new_mem); /** * ttm_io_prot
From: Christian König christian.koenig@amd.com
Instead of using the flag just remember the fence of the last move operation.
This avoids waiting for command submissions that were pipelined after the move but before the BO is accessed with the CPU again.
Signed-off-by: Christian König christian.koenig@amd.com
---
 drivers/gpu/drm/ttm/ttm_bo.c      |  4 ++--
 drivers/gpu/drm/ttm/ttm_bo_util.c |  4 +++-
 drivers/gpu/drm/ttm/ttm_bo_vm.c   | 19 ++++++++++++-------
 include/drm/ttm/ttm_bo_api.h      |  4 ++--
 include/drm/ttm/ttm_bo_driver.h   |  3 ---
 5 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index c3c615c..caa657d 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -149,6 +149,7 @@ static void ttm_bo_release_list(struct kref *list_kref)
ttm_tt_destroy(bo->ttm); atomic_dec(&bo->glob->bo_count); + fence_put(bo->moving); if (bo->resv == &bo->ttm_resv) reservation_object_fini(&bo->ttm_resv); mutex_destroy(&bo->wu_mutex); @@ -1138,7 +1139,7 @@ int ttm_bo_init(struct ttm_bo_device *bdev, bo->mem.page_alignment = page_alignment; bo->mem.bus.io_reserved_vm = false; bo->mem.bus.io_reserved_count = 0; - bo->priv_flags = 0; + bo->moving = NULL; bo->mem.placement = (TTM_PL_FLAG_SYSTEM | TTM_PL_FLAG_CACHED); bo->persistent_swap_storage = persistent_swap_storage; bo->acc_size = acc_size; @@ -1585,7 +1586,6 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, return -EBUSY;
reservation_object_add_excl_fence(resv, NULL); - clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); return 0; } EXPORT_SYMBOL(ttm_bo_wait); diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index c8fe554..9ea8d02 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -465,6 +465,7 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo, INIT_LIST_HEAD(&fbo->lru); INIT_LIST_HEAD(&fbo->swap); INIT_LIST_HEAD(&fbo->io_reserve_lru); + fbo->moving = NULL; drm_vma_node_reset(&fbo->vma_node); atomic_set(&fbo->cpu_writers, 0);
@@ -665,7 +666,8 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, * operation has completed. */
- set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); + fence_put(bo->moving); + bo->moving = fence_get(fence);
ret = ttm_buffer_object_transfer(bo, &ghost_obj); if (ret) diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 3216878..a6ed9d5 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -48,15 +48,14 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, { int ret = 0;
- if (likely(!test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags))) + if (likely(!bo->moving)) goto out_unlock;
/* * Quick non-stalling check for idle. */ - ret = ttm_bo_wait(bo, false, true); - if (likely(ret == 0)) - goto out_unlock; + if (fence_is_signaled(bo->moving)) + goto out_clear;
/* * If possible, avoid waiting for GPU with mmap_sem @@ -68,17 +67,23 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, goto out_unlock;
up_read(&vma->vm_mm->mmap_sem); - (void) ttm_bo_wait(bo, true, false); + (void) fence_wait(bo->moving, true); goto out_unlock; }
/* * Ordinary wait. */ - ret = ttm_bo_wait(bo, true, false); - if (unlikely(ret != 0)) + ret = fence_wait(bo->moving, true); + if (unlikely(ret != 0)) { ret = (ret != -ERESTARTSYS) ? VM_FAULT_SIGBUS : VM_FAULT_NOPAGE; + goto out_unlock; + } + +out_clear: + fence_put(bo->moving); + bo->moving = NULL;
out_unlock: return ret; diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index c801d90..97aaf5c 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -173,7 +173,7 @@ struct ttm_tt; * @lru: List head for the lru list. * @ddestroy: List head for the delayed destroy list. * @swap: List head for swap LRU list. - * @priv_flags: Flags describing buffer object internal state. + * @moving: Fence set when BO is moving * @vma_node: Address space manager node. * @offset: The current GPU offset, which can have different meanings * depending on the memory type. For SYSTEM type memory, it should be 0. @@ -239,7 +239,7 @@ struct ttm_buffer_object { * Members protected by a bo reservation. */
- unsigned long priv_flags; + struct fence *moving;
struct drm_vma_offset_node vma_node;
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 697e5f9..44dea22 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -503,9 +503,6 @@ struct ttm_bo_global {
#define TTM_NUM_MEM_TYPES 8
-#define TTM_BO_PRIV_FLAG_MOVING 0 /* Buffer object is moving and needs - idling before CPU mapping */ -#define TTM_BO_PRIV_FLAG_MAX 1 /** * struct ttm_bo_device - Buffer object driver device-specific data. *
From: Christian König christian.koenig@amd.com
As far as I can see there is no need for a custom implementation any more.
Signed-off-by: Christian König christian.koenig@amd.com
---
 drivers/gpu/drm/ttm/ttm_bo.c | 37 ++++++---------------------------------
 1 file changed, 4 insertions(+), 33 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index caa657d..28cd535 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1546,46 +1546,17 @@ EXPORT_SYMBOL(ttm_bo_unmap_virtual); int ttm_bo_wait(struct ttm_buffer_object *bo, bool interruptible, bool no_wait) { - struct reservation_object_list *fobj; - struct reservation_object *resv; - struct fence *excl; - long timeout = 15 * HZ; - int i; - - resv = bo->resv; - fobj = reservation_object_get_list(resv); - excl = reservation_object_get_excl(resv); - if (excl) { - if (!fence_is_signaled(excl)) { - if (no_wait) - return -EBUSY; - - timeout = fence_wait_timeout(excl, - interruptible, timeout); - } - } - - for (i = 0; fobj && timeout > 0 && i < fobj->shared_count; ++i) { - struct fence *fence; - fence = rcu_dereference_protected(fobj->shared[i], - reservation_object_held(resv)); - - if (!fence_is_signaled(fence)) { - if (no_wait) - return -EBUSY; - - timeout = fence_wait_timeout(fence, - interruptible, timeout); - } - } + long timeout = no_wait ? 0 : 15 * HZ;
+ timeout = reservation_object_wait_timeout_rcu(bo->resv, true, + interruptible, timeout); if (timeout < 0) return timeout;
if (timeout == 0) return -EBUSY;
- reservation_object_add_excl_fence(resv, NULL); + reservation_object_add_excl_fence(bo->resv, NULL); return 0; } EXPORT_SYMBOL(ttm_bo_wait);
From: Christian König christian.koenig@amd.com
Free up the memory immediately, remember the last eviction for each domain and make new allocations depend on the last eviction to be completed.
Signed-off-by: Christian König christian.koenig@amd.com
---
 drivers/gpu/drm/ttm/ttm_bo.c      | 49 ++++++++++++++++++---
 drivers/gpu/drm/ttm/ttm_bo_util.c | 92 +++++++++++++++++++++++++++++++++++++++
 include/drm/ttm/ttm_bo_driver.h   | 24 ++++++++++
 3 files changed, 160 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 28cd535..5d93169 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -788,6 +788,34 @@ void ttm_bo_mem_put(struct ttm_buffer_object *bo, struct ttm_mem_reg *mem) EXPORT_SYMBOL(ttm_bo_mem_put);
/** + * Add the last move fence to the BO and reserve a new shared slot. + */ +static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo, + struct ttm_mem_type_manager *man, + struct ttm_mem_reg *mem) +{ + struct fence *fence; + int ret; + + spin_lock(&man->move_lock); + fence = fence_get(man->move); + spin_unlock(&man->move_lock); + + if (fence) { + reservation_object_add_shared_fence(bo->resv, fence); + + ret = reservation_object_reserve_shared(bo->resv); + if (unlikely(ret)) + return ret; + + fence_put(bo->moving); + bo->moving = fence; + } + + return 0; +} + +/** * Repeatedly evict memory from the LRU for @mem_type until we create enough * space, or we've evicted everything and there isn't enough space. */ @@ -813,10 +841,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo, if (unlikely(ret != 0)) return ret; } while (1); - if (mem->mm_node == NULL) - return -ENOMEM; mem->mem_type = mem_type; - return 0; + return ttm_bo_add_move_fence(bo, man, mem); }
static uint32_t ttm_bo_select_caching(struct ttm_mem_type_manager *man, @@ -886,6 +912,10 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo, bool has_erestartsys = false; int i, ret;
+ ret = reservation_object_reserve_shared(bo->resv); + if (unlikely(ret)) + return ret; + mem->mm_node = NULL; for (i = 0; i < placement->num_placement; ++i) { const struct ttm_place *place = &placement->placement[i]; @@ -919,9 +949,15 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo, ret = (*man->func->get_node)(man, bo, place, mem); if (unlikely(ret)) return ret; - - if (mem->mm_node) + + if (mem->mm_node) { + ret = ttm_bo_add_move_fence(bo, man, mem); + if (unlikely(ret)) { + (*man->func->put_node)(man, mem); + return ret; + } break; + } }
if ((type_ok && (mem_type == TTM_PL_SYSTEM)) || mem->mm_node) { @@ -1290,6 +1326,7 @@ int ttm_bo_clean_mm(struct ttm_bo_device *bdev, unsigned mem_type) mem_type); return ret; } + fence_put(man->move);
man->use_type = false; man->has_type = false; @@ -1335,6 +1372,7 @@ int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type, man->io_reserve_fastpath = true; man->use_io_reserve_lru = false; mutex_init(&man->io_reserve_mutex); + spin_lock_init(&man->move_lock); INIT_LIST_HEAD(&man->io_reserve_lru);
ret = bdev->driver->init_mem_type(bdev, type, man); @@ -1353,6 +1391,7 @@ int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type, man->size = p_size;
INIT_LIST_HEAD(&man->lru); + man->move = NULL;
return 0; } diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 9ea8d02..0c389a5 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -696,3 +696,95 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, return 0; } EXPORT_SYMBOL(ttm_bo_move_accel_cleanup); + +int ttm_bo_pipeline_move(struct ttm_buffer_object *bo, + struct fence *fence, bool evict, + struct ttm_mem_reg *new_mem) +{ + struct ttm_bo_device *bdev = bo->bdev; + struct ttm_mem_reg *old_mem = &bo->mem; + + struct ttm_mem_type_manager *from = &bdev->man[old_mem->mem_type]; + struct ttm_mem_type_manager *to = &bdev->man[new_mem->mem_type]; + + int ret; + + reservation_object_add_excl_fence(bo->resv, fence); + + if (!evict) { + struct ttm_buffer_object *ghost_obj; + + /** + * This should help pipeline ordinary buffer moves. + * + * Hang old buffer memory on a new buffer object, + * and leave it to be released when the GPU + * operation has completed. + */ + + fence_put(bo->moving); + bo->moving = fence_get(fence); + + ret = ttm_buffer_object_transfer(bo, &ghost_obj); + if (ret) + return ret; + + reservation_object_add_excl_fence(ghost_obj->resv, fence); + + /** + * If we're not moving to fixed memory, the TTM object + * needs to stay alive. Otherwhise hang it on the ghost + * bo to be unbound and destroyed. + */ + + if (!(to->flags & TTM_MEMTYPE_FLAG_FIXED)) + ghost_obj->ttm = NULL; + else + bo->ttm = NULL; + + ttm_bo_unreserve(ghost_obj); + ttm_bo_unref(&ghost_obj); + + } else if (from->flags & TTM_MEMTYPE_FLAG_FIXED) { + + /** + * BO doesn't have a TTM we need to bind/unbind. Just remember + * this eviction and free up the allocation + */ + + spin_lock(&from->move_lock); + if (!from->move || fence_is_later(from->move, fence)) { + fence_put(from->move); + from->move = fence_get(fence); + } + spin_unlock(&from->move_lock); + + ttm_bo_free_old_node(bo); + + fence_put(bo->moving); + bo->moving = fence_get(fence); + + } else { + /** + * Last resort, wait for the move to be completed. + * + * Should never happen in pratice. + */ + + ret = ttm_bo_wait(bo, false, false); + if (ret) + return ret; + + if (to->flags & TTM_MEMTYPE_FLAG_FIXED) { + ttm_tt_destroy(bo->ttm); + bo->ttm = NULL; + } + ttm_bo_free_old_node(bo); + } + + *old_mem = *new_mem; + new_mem->mm_node = NULL; + + return 0; +} +EXPORT_SYMBOL(ttm_bo_pipeline_move); diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 44dea22..e2ebe66 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -258,8 +258,10 @@ struct ttm_mem_type_manager_func { * reserved by the TTM vm system. * @io_reserve_lru: Optional lru list for unreserving io mem regions. * @io_reserve_fastpath: Only use bdev::driver::io_mem_reserve to obtain + * @move_lock: lock for move fence * static information. bdev::driver::io_mem_free is never used. * @lru: The lru list for this memory type. + * @move: The fence of the last pipelined move operation. * * This structure is used to identify and manage memory types for a device. * It's set up by the ttm_bo_driver::init_mem_type method. @@ -286,6 +288,7 @@ struct ttm_mem_type_manager { struct mutex io_reserve_mutex; bool use_io_reserve_lru; bool io_reserve_fastpath; + spinlock_t move_lock;
/* * Protected by @io_reserve_mutex: @@ -298,6 +301,11 @@ struct ttm_mem_type_manager { */
struct list_head lru; + + /* + * Protected by @move_lock. + */ + struct fence *move; };
/** @@ -1014,6 +1022,22 @@ extern void ttm_bo_free_old_node(struct ttm_buffer_object *bo); extern int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, struct fence *fence, bool evict, struct ttm_mem_reg *new_mem); + +/** + * ttm_bo_pipeline_move. + * + * @bo: A pointer to a struct ttm_buffer_object. + * @fence: A fence object that signals when moving is complete. + * @evict: This is an evict move. Don't return until the buffer is idle. + * @new_mem: struct ttm_mem_reg indicating where to move. + * + * Function for pipelining accelerated moves. Either free the memory + * immediately or hang it on a temporary buffer object. + */ +int ttm_bo_pipeline_move(struct ttm_buffer_object *bo, + struct fence *fence, bool evict, + struct ttm_mem_reg *new_mem); + /** * ttm_io_prot *
On Wed, Jun 15, 2016 at 7:44 AM, Christian König deathsimple@vodafone.de wrote:
From: Christian König christian.koenig@amd.com
Free up the memory immediately, remember the last eviction for each domain and make new allocations depend on the last eviction to be completed.
Signed-off-by: Christian König christian.koenig@amd.com
Minor typo in the patch title: s/infrastructur/infrastructure/
Alex
Christian,
Thanks for doing this! A question below:
On 06/15/2016 01:44 PM, Christian König wrote:
From: Christian König christian.koenig@amd.com
Free up the memory immediately, remember the last eviction for each domain and make new allocations depend on the last eviction to be completed.
Signed-off-by: Christian König christian.koenig@amd.com
 drivers/gpu/drm/ttm/ttm_bo.c      | 49 ++++++++++++++++++---
 drivers/gpu/drm/ttm/ttm_bo_util.c | 92 +++++++++++++++++++++++++++++++++++++++
 include/drm/ttm/ttm_bo_driver.h   | 24 ++++++++++
 3 files changed, 160 insertions(+), 5 deletions(-)

 	/*
 	 * Protected by @io_reserve_mutex:
@@ -298,6 +301,11 @@ struct ttm_mem_type_manager {
 	 */

 	struct list_head lru;
+
+	/*
+	 * Protected by @move_lock.
+	 */
+	struct fence *move;
 };
Did you look at protecting the move fence with RCU? I figure where there actually is a fence it doesn't matter much but in the fastpath where move is NULL we'd be able to get rid of a number of locking cycles.
I guess though there might be both licensing implications and requirements to using kfree_rcu() to free the fence.
/Thomas
On 02.07.2016 at 10:39, Thomas Hellstrom wrote:
Christian,
Thanks for doing this! A question below:
On 06/15/2016 01:44 PM, Christian König wrote:
From: Christian König christian.koenig@amd.com
Free up the memory immediately, remember the last eviction for each domain and make new allocations depend on the last eviction to be completed.
Signed-off-by: Christian König christian.koenig@amd.com
 drivers/gpu/drm/ttm/ttm_bo.c      | 49 ++++++++++++++++++---
 drivers/gpu/drm/ttm/ttm_bo_util.c | 92 +++++++++++++++++++++++++++++++++++++++
 include/drm/ttm/ttm_bo_driver.h   | 24 ++++++++++
 3 files changed, 160 insertions(+), 5 deletions(-)

 	/*
 	 * Protected by @io_reserve_mutex:
@@ -298,6 +301,11 @@ struct ttm_mem_type_manager {
 	 */

 	struct list_head lru;
+
+	/*
+	 * Protected by @move_lock.
+	 */
+	struct fence *move;
 };
Did you look at protecting the move fence with RCU? I figure where there actually is a fence it doesn't matter much but in the fastpath where move is NULL we'd be able to get rid of a number of locking cycles.
Yeah, thought about that as well.
I guess though there might be both licensing implications and requirements to using kfree_rcu() to free the fence.
The problem wasn't licensing (though it's good that you point that out, I wasn't aware of that issue), but that the existing fence RCU functions wouldn't have worked here and I didn't have time to take a closer look.

Should be relatively easy to switch the read path over, because we already free the fences RCU-protected.
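A rough sketch of what the lock-free read side could look like (untested and only to illustrate the idea; it assumes man->move would be annotated __rcu and only updated under move_lock with rcu_assign_pointer(), and the function name is made up):

	static struct fence *ttm_man_move_fence_get(struct ttm_mem_type_manager *man)
	{
		struct fence *fence;

		rcu_read_lock();
		fence = rcu_dereference(man->move);
		/* Fast path: no pipelined move pending for this domain. */
		if (fence && !fence_get_rcu(fence))
			fence = NULL; /* raced with the final fence_put(), treat as signaled */
		rcu_read_unlock();

		return fence;
	}

The write side in ttm_bo_pipeline_move() would keep taking move_lock; only the read in ttm_bo_add_move_fence() would become lock-free.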
Regards, Christian.
/Thomas
From: Christian König christian.koenig@amd.com
When we pipeline evictions the page directory could already be moving somewhere else when grab_id is called.
Signed-off-by: Christian König christian.koenig@amd.com
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 ++----
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index a3d7d13..850c4dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -661,6 +661,8 @@ static int amdgpu_cs_ib_vm_chunk(struct amdgpu_device *adev, } }
+ p->job->vm_pd_addr = amdgpu_bo_gpu_offset(vm->page_directory); + r = amdgpu_bo_vm_update_pte(p, vm); if (!r) amdgpu_cs_sync_rings(p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index d3e0576..82efb40 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -177,7 +177,6 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring, struct amdgpu_sync *sync, struct fence *fence, unsigned *vm_id, uint64_t *vm_pd_addr) { - uint64_t pd_addr = amdgpu_bo_gpu_offset(vm->page_directory); struct amdgpu_device *adev = ring->adev; struct fence *updates = sync->last_vm_update; struct amdgpu_vm_id *id, *idle; @@ -250,7 +249,7 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring, if (atomic64_read(&id->owner) != vm->client_id) continue;
- if (pd_addr != id->pd_gpu_addr) + if (*vm_pd_addr != id->pd_gpu_addr) continue;
if (!same_ring && @@ -298,14 +297,13 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring, fence_put(id->flushed_updates); id->flushed_updates = fence_get(updates);
- id->pd_gpu_addr = pd_addr; + id->pd_gpu_addr = *vm_pd_addr;
list_move_tail(&id->list, &adev->vm_manager.ids_lru); atomic64_set(&id->owner, vm->client_id); vm->ids[ring->idx] = id;
*vm_id = id - adev->vm_manager.ids; - *vm_pd_addr = pd_addr; trace_amdgpu_vm_grab_id(vm, ring->idx, *vm_id, *vm_pd_addr);
error:
On 2016-06-15 19:44, Christian König wrote:
From: Christian König christian.koenig@amd.com
When we pipeline evictions the page directory could already be moving somewhere else when grab_id is called.
Isn't the PD BO protected by the job fence? I think that before the job fence is signalled the PD BO is safe; there shouldn't be a chance to evict it.
Regards, David Zhou
On 16.06.2016 at 10:33, zhoucm1 wrote:

On 2016-06-15 19:44, Christian König wrote:
From: Christian König christian.koenig@amd.com
When we pipeline evictions the page directory could already be moving somewhere else when grab_id is called.
Isn't the PD BO protected by the job fence? I think that before the job fence is signalled the PD BO is safe; there shouldn't be a chance to evict it.
The crux here is that we start to pipeline BO evictions (we plan them but don't execute them immediately).
E.g. the eviction won't happen before the protecting fence is signaled, but we have it planned and so the address returned by amdgpu_bo_gpu_offset() is already the new one.
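Roughly, the ordering that bites us is:

1. TTM plans the eviction: bo->mem and bo->offset already describe the new placement, but the actual copy is only queued up behind the eviction fence.
2. amdgpu_vm_grab_id() runs for the next submission and samples the page directory address, which is already the new one although the data hasn't moved yet.
3. Only when the eviction fence signals does the content actually land at the new location.

So the PD address has to be sampled at the point where the BO is validated for the submission, which is what the amdgpu_cs.c change in this patch does.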
Regards, Christian.
On 2016-06-16 17:52, Christian König wrote:

On 16.06.2016 at 10:33, zhoucm1 wrote:

On 2016-06-15 19:44, Christian König wrote:
From: Christian König christian.koenig@amd.com
When we pipeline evictions the page directory could already be moving somewhere else when grab_id is called.
Isn't the PD BO protected by the job fence? I think that before the job fence is signalled the PD BO is safe; there shouldn't be a chance to evict it.
The crux here is that we start to pipeline BO evictions (we plan them but don't execute them immediately).
E.g. the eviction won't happen before the protecting fence is signaled, but we have it planned and so the address returned by amdgpu_bo_gpu_offset() is already the new one.
Thanks for pointing that out. I see the code in ttm_bo_handle_move_mem:

	if (bo->mem.mm_node) {
		bo->offset = (bo->mem.start << PAGE_SHIFT) +
		    bdev->man[bo->mem.mem_type].gpu_offset;
		bo->cur_placement = bo->mem.placement;
	} else
		bo->offset = 0;
It seems better to update the offset only after the moving fence is signalled. Should we add a moving-fence callback for that?
Regards, David Zhou
On 16.06.2016 at 11:54, zhoucm1 wrote:

On 2016-06-16 17:52, Christian König wrote:

On 16.06.2016 at 10:33, zhoucm1 wrote:

On 2016-06-15 19:44, Christian König wrote:
From: Christian König christian.koenig@amd.com
When we pipeline evictions the page directory could already be moving somewhere else when grab_id is called.
Isn't the PD BO protected by the job fence? I think that before the job fence is signalled the PD BO is safe; there shouldn't be a chance to evict it.
The crux here is that we start to pipeline BO evictions (we plan them but don't execute them immediately).
E.g. the eviction won't happen before the protecting fence is signaled, but we have it planned and so the address returned by amdgpu_bo_gpu_offset() is already the new one.
Thanks for pointing that out. I see the code in ttm_bo_handle_move_mem:

	if (bo->mem.mm_node) {
		bo->offset = (bo->mem.start << PAGE_SHIFT) +
		    bdev->man[bo->mem.mem_type].gpu_offset;
		bo->cur_placement = bo->mem.placement;
	} else
		bo->offset = 0;
It seems better to update the offset only after the moving fence is signalled. Should we add a moving-fence callback for that?
No, when the next operation wants to move the BO back in, it needs the already updated offset.
All we need to do is to make sure that all offsets are determined when the BO is validated into the domain where the submission needs it.
This is actually a requirement for retrieving all buffer offsets anyway because an application could request to move the buffer directly after making a submission with it.
The only reason we haven't noticed that previously is that applications can't affect the PD directly and we didn't pipeline evictions (the only other reason for moving a buffer).

I should probably take a look at adding a couple of warnings to amdgpu_bo_gpu_offset() again.
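Something along these lines is what I mean, purely as a sketch (the exact checks are guesses and would need double checking, this is not a tested patch):

	u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo)
	{
		/* A GPU address only makes sense while the BO is in a GPU domain. */
		WARN_ON_ONCE(bo->tbo.mem.mem_type == TTM_PL_SYSTEM);

		/*
		 * Without holding the reservation or pinning the BO the
		 * returned offset can change underneath the caller.
		 */
		WARN_ON_ONCE(!ww_mutex_is_locked(&bo->tbo.resv->lock) &&
			     !bo->pin_count);

		return bo->tbo.offset;
	}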
Regards, Christian.
From: Christian König christian.koenig@amd.com
This boosts Xonotic from 38fps to 47fps when artificially limiting VRAM to 256MB for testing. It should improve all CPU bound rendering situations where we have a lot of swapping to/from VRAM.
Signed-off-by: Christian König christian.koenig@amd.com
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index b2b9df6..f85527f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -286,8 +286,10 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo,
 	r = amdgpu_copy_buffer(ring, old_start, new_start,
 			       new_mem->num_pages * PAGE_SIZE, /* bytes */
 			       bo->resv, &fence);
-	/* FIXME: handle copy error */
-	r = ttm_bo_move_accel_cleanup(bo, fence, evict, new_mem);
+	if (r)
+		return r;
+
+	r = ttm_bo_pipeline_move(bo, fence, evict, new_mem);
 	fence_put(fence);
 	return r;
 }
On Wed, Jun 15, 2016 at 7:44 AM, Christian König deathsimple@vodafone.de wrote:
From: Christian König christian.koenig@amd.com
This boosts Xonotic from 38fps to 47fps when artificially limiting VRAM to 256MB for testing. It should improve all CPU bound rendering situations where we have a lot of swapping to/from VRAM.
Signed-off-by: Christian König christian.koenig@amd.com
Nice work. Just a minor typo in the title of patch 4. For the series: Reviewed-by: Alex Deucher alexander.deucher@amd.com
Alex, do you want to pull that in through your branches, or should I send Dave a separate pull request for the TTM changes?
BTW: I'm trying to enable this on radeon as well. In theory it should work out of the box.
Regards, Christian.
On 15.06.2016 at 13:44, Christian König wrote:
From: Christian König christian.koenig@amd.com
It isn't used, and not waiting for the GPU after scheduling a move is actually quite dangerous.
Signed-off-by: Christian König christian.koenig@amd.com
On Wed, Jun 15, 2016 at 9:27 AM, Christian König christian.koenig@amd.com wrote:
Alex, do you want to pull that in through your branches, or should I send Dave a separate pull request for the TTM changes?
I'll pick it up today.
BTW: I'm trying to enable this on radeon as well. In theory it should work out of the box.
Excellent!
Alex