With the nouveau calls fixed, there were only two places left that used fence_lock without a reservation; those are fixed in this patch:
ttm_bo_cleanup_refs_or_queue is fixed by simply doing things the other way around: take the reservation first, then do the no-wait idle check.
ttm_bo_cleanup_refs is fixed by taking a reservation first, then a reference to the fence object, and backing off from the reservation if waiting has to be performed.
Signed-off-by: Maarten Lankhorst maarten.lankhorst@canonical.com
---
This patch should be applied only after all the previous patches I sent are applied, since it depends on sync_obj_arg removal (kinda, probably fails to apply otherwise), uses ttm_bo_is_reserved, and depends on nouveau wait behavior being fixed.
 drivers/gpu/drm/nouveau/nouveau_bo.c   |   4 -
 drivers/gpu/drm/nouveau/nouveau_gem.c  |  15 +---
 drivers/gpu/drm/radeon/radeon_object.c |   2 -
 drivers/gpu/drm/ttm/ttm_bo.c           | 124 ++++++++++++--------------------
 drivers/gpu/drm/ttm/ttm_bo_util.c      |   3 -
 drivers/gpu/drm/ttm/ttm_bo_vm.c        |   5 -
 drivers/gpu/drm/ttm/ttm_execbuf_util.c |   2 -
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c    |   3 -
 include/drm/ttm/ttm_bo_api.h           |  12 +--
 include/drm/ttm/ttm_bo_driver.h        |   3 -
 10 files changed, 52 insertions(+), 121 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index cee7448..9749c45 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1449,10 +1449,8 @@ nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence) if (likely(fence)) nouveau_fence_ref(fence);
- spin_lock(&nvbo->bo.bdev->fence_lock); old_fence = nvbo->bo.sync_obj; nvbo->bo.sync_obj = fence; - spin_unlock(&nvbo->bo.bdev->fence_lock);
nouveau_fence_unref(&old_fence); } @@ -1552,9 +1550,7 @@ nouveau_bo_vma_del(struct nouveau_bo *nvbo, struct nouveau_vma *vma) if (vma->node) { if (nvbo->bo.mem.mem_type != TTM_PL_SYSTEM) { ttm_bo_reserve(&nvbo->bo, false, false, false, 0); - spin_lock(&nvbo->bo.bdev->fence_lock); ttm_bo_wait(&nvbo->bo, false, false, false); - spin_unlock(&nvbo->bo.bdev->fence_lock); ttm_bo_unreserve(&nvbo->bo); nouveau_vm_unmap(vma); } diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 6d8391d..eaa700a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -391,18 +391,11 @@ retry: static int validate_sync(struct nouveau_channel *chan, struct nouveau_bo *nvbo) { - struct nouveau_fence *fence = NULL; + struct nouveau_fence *fence = nvbo->bo.sync_obj; int ret = 0;
- spin_lock(&nvbo->bo.bdev->fence_lock); - if (nvbo->bo.sync_obj) - fence = nouveau_fence_ref(nvbo->bo.sync_obj); - spin_unlock(&nvbo->bo.bdev->fence_lock); - - if (fence) { + if (fence) ret = nouveau_fence_sync(fence, chan); - nouveau_fence_unref(&fence); - }
return ret; } @@ -614,9 +607,7 @@ nouveau_gem_pushbuf_reloc_apply(struct drm_device *dev, data |= r->vor; }
- spin_lock(&nvbo->bo.bdev->fence_lock); ret = ttm_bo_wait(&nvbo->bo, false, false, false); - spin_unlock(&nvbo->bo.bdev->fence_lock); if (ret) { NV_ERROR(drm, "reloc wait_idle failed: %d\n", ret); break; @@ -848,11 +839,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0); if (!ret) { - spin_lock(&nvbo->bo.bdev->fence_lock); ret = ttm_bo_wait(&nvbo->bo, true, true, true); if (!no_wait && ret) fence = nouveau_fence_ref(nvbo->bo.sync_obj); - spin_unlock(&nvbo->bo.bdev->fence_lock);
ttm_bo_unreserve(&nvbo->bo); } diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 808c444..df430e4 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -613,12 +613,10 @@ int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type, bool no_wait) r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0); if (unlikely(r != 0)) return r; - spin_lock(&bo->tbo.bdev->fence_lock); if (mem_type) *mem_type = bo->tbo.mem.mem_type; if (bo->tbo.sync_obj) r = ttm_bo_wait(&bo->tbo, true, true, no_wait); - spin_unlock(&bo->tbo.bdev->fence_lock); ttm_bo_unreserve(&bo->tbo); return r; } diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index f6d7026..69fc29b 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -498,28 +498,23 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) { struct ttm_bo_device *bdev = bo->bdev; struct ttm_bo_global *glob = bo->glob; - struct ttm_bo_driver *driver; + struct ttm_bo_driver *driver = bdev->driver; void *sync_obj = NULL; int put_count; int ret;
- spin_lock(&bdev->fence_lock); - (void) ttm_bo_wait(bo, false, false, true); - if (!bo->sync_obj) { - - spin_lock(&glob->lru_lock); - - /** - * Lock inversion between bo:reserve and bdev::fence_lock here, - * but that's OK, since we're only trylocking. - */ - - ret = ttm_bo_reserve_locked(bo, false, true, false, 0); + spin_lock(&glob->lru_lock); + ret = ttm_bo_reserve_locked(bo, false, true, false, 0); + if (!ret) { + ret = ttm_bo_wait(bo, false, false, true);
- if (unlikely(ret == -EBUSY)) + if (unlikely(ret == -EBUSY)) { + sync_obj = driver->sync_obj_ref(bo->sync_obj); + atomic_set(&bo->reserved, 0); + wake_up_all(&bo->event_queue); goto queue; + }
- spin_unlock(&bdev->fence_lock); put_count = ttm_bo_del_from_lru(bo);
spin_unlock(&glob->lru_lock); @@ -528,18 +523,11 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) ttm_bo_list_ref_sub(bo, put_count, true);
return; - } else { - spin_lock(&glob->lru_lock); } queue: - driver = bdev->driver; - if (bo->sync_obj) - sync_obj = driver->sync_obj_ref(bo->sync_obj); - kref_get(&bo->list_kref); list_add_tail(&bo->ddestroy, &bdev->ddestroy); spin_unlock(&glob->lru_lock); - spin_unlock(&bdev->fence_lock);
if (sync_obj) { driver->sync_obj_flush(sync_obj); @@ -565,25 +553,15 @@ static int ttm_bo_cleanup_refs(struct ttm_buffer_object *bo, bool no_wait_gpu) { struct ttm_bo_device *bdev = bo->bdev; + struct ttm_bo_driver *driver = bdev->driver; struct ttm_bo_global *glob = bo->glob; int put_count; int ret = 0; + void *sync_obj;
retry: - spin_lock(&bdev->fence_lock); - ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); - spin_unlock(&bdev->fence_lock); - - if (unlikely(ret != 0)) - return ret; - spin_lock(&glob->lru_lock);
- if (unlikely(list_empty(&bo->ddestroy))) { - spin_unlock(&glob->lru_lock); - return 0; - } - ret = ttm_bo_reserve_locked(bo, interruptible, no_wait_reserve, false, 0);
@@ -592,18 +570,37 @@ retry: return ret; }
- /** - * We can re-check for sync object without taking - * the bo::lock since setting the sync object requires - * also bo::reserved. A busy object at this point may - * be caused by another thread recently starting an accelerated - * eviction. - */ + if (unlikely(list_empty(&bo->ddestroy))) { + atomic_set(&bo->reserved, 0); + wake_up_all(&bo->event_queue); + spin_unlock(&glob->lru_lock); + return 0; + } + ret = ttm_bo_wait(bo, false, false, true); + + if (ret) { + if (no_wait_gpu) { + atomic_set(&bo->reserved, 0); + wake_up_all(&bo->event_queue); + spin_unlock(&glob->lru_lock); + return ret; + }
- if (unlikely(bo->sync_obj)) { + /** + * Take a reference to the fence and unreserve, if the wait + * was succesful and no new sync_obj was attached, + * ttm_bo_wait in retry will return ret = 0, and end the loop. + */ + + sync_obj = driver->sync_obj_ref(&bo->sync_obj); atomic_set(&bo->reserved, 0); wake_up_all(&bo->event_queue); spin_unlock(&glob->lru_lock); + + ret = driver->sync_obj_wait(bo->sync_obj, false, interruptible); + driver->sync_obj_unref(&sync_obj); + if (ret) + return ret; goto retry; }
@@ -735,9 +732,7 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible, struct ttm_placement placement; int ret = 0;
- spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); - spin_unlock(&bdev->fence_lock);
if (unlikely(ret != 0)) { if (ret != -ERESTARTSYS) { @@ -1054,7 +1049,6 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo, { int ret = 0; struct ttm_mem_reg mem; - struct ttm_bo_device *bdev = bo->bdev;
BUG_ON(!ttm_bo_is_reserved(bo));
@@ -1063,9 +1057,7 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo, * Have the driver move function wait for idle when necessary, * instead of doing it here. */ - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); - spin_unlock(&bdev->fence_lock); if (ret) return ret; mem.num_pages = bo->num_pages; @@ -1554,7 +1546,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev, bdev->glob = glob; bdev->need_dma32 = need_dma32; bdev->val_seq = 0; - spin_lock_init(&bdev->fence_lock); mutex_lock(&glob->device_list_mutex); list_add_tail(&bdev->device_list, &glob->device_list); mutex_unlock(&glob->device_list_mutex); @@ -1690,52 +1681,33 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, bool lazy, bool interruptible, bool no_wait) { struct ttm_bo_driver *driver = bo->bdev->driver; - struct ttm_bo_device *bdev = bo->bdev; - void *sync_obj; int ret = 0;
+ WARN_ON_ONCE(!ttm_bo_is_reserved(bo)); + if (likely(bo->sync_obj == NULL)) return 0;
- while (bo->sync_obj) { - + if (bo->sync_obj) { if (driver->sync_obj_signaled(bo->sync_obj)) { void *tmp_obj = bo->sync_obj; bo->sync_obj = NULL; clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); - spin_unlock(&bdev->fence_lock); driver->sync_obj_unref(&tmp_obj); - spin_lock(&bdev->fence_lock); - continue; + return 0; }
if (no_wait) return -EBUSY;
- sync_obj = driver->sync_obj_ref(bo->sync_obj); - spin_unlock(&bdev->fence_lock); - ret = driver->sync_obj_wait(sync_obj, + ret = driver->sync_obj_wait(bo->sync_obj, lazy, interruptible); if (unlikely(ret != 0)) { - driver->sync_obj_unref(&sync_obj); - spin_lock(&bdev->fence_lock); return ret; } - spin_lock(&bdev->fence_lock); - if (likely(bo->sync_obj == sync_obj)) { - void *tmp_obj = bo->sync_obj; - bo->sync_obj = NULL; - clear_bit(TTM_BO_PRIV_FLAG_MOVING, - &bo->priv_flags); - spin_unlock(&bdev->fence_lock); - driver->sync_obj_unref(&sync_obj); - driver->sync_obj_unref(&tmp_obj); - spin_lock(&bdev->fence_lock); - } else { - spin_unlock(&bdev->fence_lock); - driver->sync_obj_unref(&sync_obj); - spin_lock(&bdev->fence_lock); - } + driver->sync_obj_unref(&bo->sync_obj); + clear_bit(TTM_BO_PRIV_FLAG_MOVING, + &bo->priv_flags); } return 0; } @@ -1799,9 +1771,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink) * Wait for GPU, then move to system cached. */
- spin_lock(&bo->bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, false); - spin_unlock(&bo->bdev->fence_lock);
if (unlikely(ret != 0)) goto out; diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index d80d1e8..84d6e01 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -622,7 +622,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, struct ttm_buffer_object *ghost_obj; void *tmp_obj = NULL;
- spin_lock(&bdev->fence_lock); if (bo->sync_obj) { tmp_obj = bo->sync_obj; bo->sync_obj = NULL; @@ -630,7 +629,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, bo->sync_obj = driver->sync_obj_ref(sync_obj); if (evict) { ret = ttm_bo_wait(bo, false, false, false); - spin_unlock(&bdev->fence_lock); if (tmp_obj) driver->sync_obj_unref(&tmp_obj); if (ret) @@ -653,7 +651,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, */
set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); - spin_unlock(&bdev->fence_lock); if (tmp_obj) driver->sync_obj_unref(&tmp_obj);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index a877813..55718c2 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -122,17 +122,14 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * move. */
- spin_lock(&bdev->fence_lock); if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) { ret = ttm_bo_wait(bo, false, true, false); - spin_unlock(&bdev->fence_lock); if (unlikely(ret != 0)) { retval = (ret != -ERESTARTSYS) ? VM_FAULT_SIGBUS : VM_FAULT_NOPAGE; goto out_unlock; } - } else - spin_unlock(&bdev->fence_lock); + }
ret = ttm_mem_io_lock(man, true); if (unlikely(ret != 0)) { diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c index 721886e..51b5e97 100644 --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c @@ -207,7 +207,6 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj) driver = bdev->driver; glob = bo->glob;
- spin_lock(&bdev->fence_lock); spin_lock(&glob->lru_lock);
list_for_each_entry(entry, list, head) { @@ -218,7 +217,6 @@ void ttm_eu_fence_buffer_objects(struct list_head *list, void *sync_obj) entry->reserved = false; } spin_unlock(&glob->lru_lock); - spin_unlock(&bdev->fence_lock);
list_for_each_entry(entry, list, head) { if (entry->old_sync_obj) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index ed3c1e7..e038c9a 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -251,13 +251,10 @@ static void vmw_dummy_query_bo_prepare(struct vmw_private *dev_priv) volatile SVGA3dQueryResult *result; bool dummy; int ret; - struct ttm_bo_device *bdev = &dev_priv->bdev; struct ttm_buffer_object *bo = dev_priv->dummy_query_bo;
ttm_bo_reserve(bo, false, false, false, 0); - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, false); - spin_unlock(&bdev->fence_lock); if (unlikely(ret != 0)) (void) vmw_fallback_wait(dev_priv, false, true, 0, false, 10*HZ); diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 816d9b1..6c69224 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -222,6 +222,8 @@ struct ttm_buffer_object { struct file *persistent_swap_storage; struct ttm_tt *ttm; bool evicted; + void *sync_obj; + unsigned long priv_flags;
/** * Members protected by the bdev::lru_lock. @@ -242,16 +244,6 @@ struct ttm_buffer_object { atomic_t reserved;
/** - * Members protected by struct buffer_object_device::fence_lock - * In addition, setting sync_obj to anything else - * than NULL requires bo::reserved to be held. This allows for - * checking NULL while reserved but not holding the mentioned lock. - */ - - void *sync_obj; - unsigned long priv_flags; - - /** * Members protected by the bdev::vm_lock */
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 0c8c3b5..aac101b 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -515,8 +515,6 @@ struct ttm_bo_global { * * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver. * @man: An array of mem_type_managers. - * @fence_lock: Protects the synchronizing members on *all* bos belonging - * to this device. * @addr_space_mm: Range manager for the device address space. * lru_lock: Spinlock that protects the buffer+device lru lists and * ddestroy lists. @@ -539,7 +537,6 @@ struct ttm_bo_device { struct ttm_bo_driver *driver; rwlock_t vm_lock; struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES]; - spinlock_t fence_lock; /* * Protected by the vm lock. */
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
In essence, to read the fence pointer, there is a large chance you will be waiting for idle to be able to access it. That's why it's protected by a separate spinlock in the first place.
So even if this change might not affect current drivers much, it's a change to the TTM API that leads to an IMHO very poor design.
/Thomas
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
In essence, to read the fence pointer, there is a large chance you will be waiting for idle to be able to access it. That's why it's protected by a separate spinlock in the first place.
So even if this change might not affect current drivers much, it's a change to the TTM API that leads to an IMHO very poor design.
/Thomas
One way we could perhaps improve on this, if you think this is necessary, is to build a bit on the initial RCU suggestion: still require reservation when the fence pointer is modified, but also use rcu_assign_pointer() for the fence pointer.
Anyone that wants quick access to the fence pointer would then use rcu_dereference_x(), but if the fence is indeed signaled, that caller needs to avoid setting the bo fence pointer to NULL.
A check for bo idle without taking any (bo) locks would then imply reading the fence pointer in this fashion, and if it's non-NULL, checking whether the fence is signaled.
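A minimal sketch of what that could look like, assuming the sync_obj field stays a plain void *, that writers still hold the reservation, and that fences are only freed after a grace period; the helper names below are made up for illustration:

#include <linux/rcupdate.h>
#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>

/* Writer side: only called with the bo reserved. */
static void ttm_bo_set_sync_obj(struct ttm_buffer_object *bo, void *sync_obj)
{
	BUG_ON(!ttm_bo_is_reserved(bo));
	rcu_assign_pointer(bo->sync_obj, sync_obj);
}

/* Reader side: lockless idle check, no reservation taken. */
static bool ttm_bo_idle_unreserved(struct ttm_buffer_object *bo)
{
	struct ttm_bo_driver *driver = bo->bdev->driver;
	void *sync_obj;
	bool idle;

	rcu_read_lock();
	sync_obj = rcu_dereference(bo->sync_obj);
	/* Never clear bo->sync_obj here; that still requires the reservation. */
	idle = !sync_obj || driver->sync_obj_signaled(sync_obj);
	rcu_read_unlock();

	return idle;
}

The reader never touches bo::reserved, so such a check would stay cheap even while another thread holds the reservation and waits for idle.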
/Thomas
Hey,
On 18-10-12 09:59, Thomas Hellstrom wrote:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense; the only time you want to read the fence is when you want to change the members protected by the reservation.
In essence, to read the fence pointer, there is a large chance you will be waiting for idle to be able to access it. That's why it's protected by a separate spinlock in the first place.
So even if this change might not affect current drivers much, it's a change to the TTM API that leads to an IMHO very poor design.
I would argue the opposite: no longer having a separate lock, with clear semantics for when fencing is allowed, gives you a way to clean up the core of ttm immensely.
There were only two functions affected by fence_lock removal, and both are on the buffer-destruction path: ttm_bo_cleanup_refs (and *_or_queue). But since *_or_queue can simply change the order around, you only have to worry about ttm_bo_cleanup_refs. This function is already ugly for other reasons, and the followup patch I was suggesting cleaned up the ugliness there too.
The only thing done differently is backing off from the reservation early. With the cleanup it won't even try to get the reservation again, since nobody should set a new fence on the bo when it's dead. Instead, all destruction is deferred until the list refcount drops to 0.
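For reference, the reworked retry loop condensed from the hunks above (put_count bookkeeping and the final teardown are left out, and the wait is done on the local fence reference), roughly:

/*
 * Condensed recap of the reworked ttm_bo_cleanup_refs() from the patch;
 * context is drivers/gpu/drm/ttm/ttm_bo.c.
 */
static int ttm_bo_cleanup_refs_sketch(struct ttm_buffer_object *bo,
				      bool interruptible,
				      bool no_wait_reserve,
				      bool no_wait_gpu)
{
	struct ttm_bo_driver *driver = bo->bdev->driver;
	struct ttm_bo_global *glob = bo->glob;
	void *sync_obj;
	int ret;

retry:
	spin_lock(&glob->lru_lock);
	ret = ttm_bo_reserve_locked(bo, interruptible, no_wait_reserve, false, 0);
	if (unlikely(ret != 0)) {
		spin_unlock(&glob->lru_lock);
		return ret;
	}

	/* Someone else finished the cleanup while we were unreserved. */
	if (unlikely(list_empty(&bo->ddestroy))) {
		atomic_set(&bo->reserved, 0);
		wake_up_all(&bo->event_queue);
		spin_unlock(&glob->lru_lock);
		return 0;
	}

	/* Non-blocking idle probe, legal now because the bo is reserved. */
	ret = ttm_bo_wait(bo, false, false, true);
	if (ret) {
		if (no_wait_gpu) {
			atomic_set(&bo->reserved, 0);
			wake_up_all(&bo->event_queue);
			spin_unlock(&glob->lru_lock);
			return ret;
		}

		/* Busy: keep a fence reference, back off, wait outside the locks. */
		sync_obj = driver->sync_obj_ref(bo->sync_obj);
		atomic_set(&bo->reserved, 0);
		wake_up_all(&bo->event_queue);
		spin_unlock(&glob->lru_lock);

		ret = driver->sync_obj_wait(sync_obj, false, interruptible);
		driver->sync_obj_unref(&sync_obj);
		if (ret)
			return ret;
		goto retry;
	}

	/*
	 * Idle and still reserved: the real function now removes the bo from
	 * the LRU and ddestroy lists, drops lru_lock and destroys the buffer.
	 */
	spin_unlock(&glob->lru_lock);
	return 0;
}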
One way we could perhaps improve on this, if you think this is necessary, is to build a bit on the initial RCU suggestion: still require reservation when the fence pointer is modified, but also use rcu_assign_pointer() for the fence pointer.
This is massive overkill when the only time you read the fence pointer without reservation is during buffer destruction. RCU is only good if there are ~10x more reads than writes, and for us it's simply 50% reads, 50% writes.
Anyone that wants quick access to the fence pointer would then use rcu_dereference_x(), but if the fence is indeed signaled, that caller needs to avoid setting the bo fence pointer to NULL.
A check for bo idle without taking any (bo) locks would then imply reading the fence pointer in this fashion, and if it's non-NULL, checking whether the fence is signaled.
Sure it may look easier, but you add more overhead and complexity. I thought you wanted to avoid overhead in the reservation path? RCU won't be the way to do that.
~Maarten
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
On 18-10-12 09:59, Thomas Hellstrom wrote:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense; the only time you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU and has the buffer reserved?
In essence, to read the fence pointer, there is a large chance you will be waiting for idle to be able to access it. That's why it's protected by a separate spinlock in the first place.
So even if this change might not affect current drivers much, it's a change to the TTM API that leads to an IMHO very poor design.
I would argue the opposite: no longer having a separate lock, with clear semantics for when fencing is allowed, gives you a way to clean up the core of ttm immensely.
There were only two functions affected by fence_lock removal, and both are on the buffer-destruction path: ttm_bo_cleanup_refs (and *_or_queue).
The actual code and the number of users are irrelevant here, since we're discussing the implications of changing the API.
One way we could perhaps improve on this, if you think this is necessary, is to build a bit on the initial RCU suggestion: still require reservation when the fence pointer is modified, but also use rcu_assign_pointer() for the fence pointer.
This is massive overkill when the only time you read the fence pointer without reservation is during buffer destruction. RCU is only good if there are ~10x more reads than writes, and for us it's simply 50% reads, 50% writes.
I think you misunderstand me. I'm not suggesting going down the full RCU path; I'm merely making sure that reads and writes to the bo's fence pointer are atomic, using the RCU functions for this. I'm not suggesting any RCU syncs. This means your patch can be kept largely as it is; just make sure you do atomic writes to the fence pointers.
/Thomas
On 18-10-12 13:02, Thomas Hellstrom wrote:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
On 18-10-12 09:59, Thomas Hellstrom wrote:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense; the only time you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU and has the buffer reserved?
Well, either block on reserve or return -EBUSY if reserved, presumably the former.
ttm_bo_vm_fault does the latter already, anyway.
You don't need to hold the reservation while performing the wait itself, though; you could check whether ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset the sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this, since radeon and nouveau both seem to do this.
The next time someone wants to do a wait, it will go through the fast path and unset the sync_obj member since it's already signaled, or the sync_obj is replaced when ttm_execbuf_util is used.
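A rough sketch of what such a helper could look like with the post-patch ttm_bo_wait() semantics; the name and exact error handling are illustrative only:

#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>

static int ttm_bo_wait_unreserved_sketch(struct ttm_buffer_object *bo,
					 bool interruptible)
{
	struct ttm_bo_driver *driver = bo->bdev->driver;
	void *sync_obj;
	int ret;

	ret = ttm_bo_reserve(bo, interruptible, false, false, 0);
	if (unlikely(ret != 0))
		return ret;

	/* Non-blocking probe; an idle bo gets its sync_obj cleared here. */
	ret = ttm_bo_wait(bo, false, false, true);
	if (ret != -EBUSY) {
		ttm_bo_unreserve(bo);
		return ret;
	}

	/*
	 * Busy: keep a reference to the fence and drop the reservation, so
	 * the actual wait does not block other users of the bo.  bo->sync_obj
	 * is left in place; the next reserved ttm_bo_wait() will see it
	 * signaled and clear it.
	 */
	sync_obj = driver->sync_obj_ref(bo->sync_obj);
	ttm_bo_unreserve(bo);

	ret = driver->sync_obj_wait(sync_obj, false, interruptible);
	driver->sync_obj_unref(&sync_obj);
	return ret;
}

Something like this could presumably back the cpu_prep and radeon_bo_wait style paths touched by the patch above.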
~Maarten
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
On 18-10-12 13:02, Thomas Hellstrom wrote:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
On 18-10-12 09:59, Thomas Hellstrom wrote:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense; the only time you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU and has the buffer reserved?
Well, either block on reserve or return -EBUSY if reserved, presumably the former.
ttm_bo_vm_fault does the latter already, anyway.
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem; it's really a waiting reserve, but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer's lifetime, and the buffer lifetime may be long, as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
You don't need to hold the reservation while performing the wait itself, though; you could check whether ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset the sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this, since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with the reservation held, for example to switch GPU engines or to delete a GPU binding, waiting for reserve will in many cases mean waiting for the GPU.
/Thomas
On 18-10-12 13:55, Thomas Hellstrom wrote:
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
On 18-10-12 13:02, Thomas Hellstrom wrote:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
On 18-10-12 09:59, Thomas Hellstrom wrote:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU, and have the buffer reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the former..
ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really a waiting reserve but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may be long as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
You don't need to hold the reservation while performing the wait itself though, you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with reservation held, for example to switch GPU engine or to delete a GPU bind, waiting for reserve will in many case mean wait for GPU.
This example sounds inefficient; I know nouveau can do this, but it essentially just stalls the gpu entirely. I guess you mean functions like nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau could be improved simply by fixing those cases up instead, since it stalls the application completely for other uses too.
If reservations are held, it also becomes very easy to find all outliers. I didn't hook this part of lockdep up yet, but I intend to. See Documentation/lockstat.txt.
Lockstat becomes slightly trickier since we do multi-object reservation, but we could follow the same path as lock_mutex_interruptible in the interrupted case. If waiting on a reservation becomes a problem, I intend to make it a very measurable problem. :-)
And the only reason I haven't fixed the nouveau function I mentioned is that I wanted to see if I could make this show up as an issue, and check how much it affects normal workloads.
All other places already did the reservation before wait, so it would be really valuable to quantify how much of a problem this really is, and fix up those pain points instead.
~Maarten
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:55, Thomas Hellstrom schreef:
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:02, Thomas Hellstrom schreef:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
Op 18-10-12 09:59, Thomas Hellstrom schreef:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU, and have the buffer reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the former..
ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really a waiting reserve but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may be long as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
You don't need to hold the reservation while performing the wait itself though, you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with reservation held, for example to switch GPU engine or to delete a GPU bind, waiting for reserve will in many case mean wait for GPU.
This example sounds inefficient, I know nouveau can do this, but this essentially just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau could be improved simply by fixing those cases up instead, since it stalls the application completely too for other uses.
Please see the Nouveau cpu_prep patch as well.
While there are a number of cases that can be fixed up, also in Radeon, there's no way around the fact that reservation is a heavyweight lock that, particularly on simpler hardware without support for fence ordering with barriers and/or "semaphores" and accelerated eviction, will be held while waiting for idle.
As such, it is unsuitable to protect read access to the fence pointer. If the intention is to keep a single fence pointer I think the best solution is to allow reading the fence pointer outside reservation, but make sure this can be done atomically. If the intention is to protect an array or list of fence pointers, I think a spinlock is the better solution.
/Thomas
On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:55, Thomas Hellstrom schreef:
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:02, Thomas Hellstrom schreef:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
Op 18-10-12 09:59, Thomas Hellstrom schreef:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU, and have the buffer reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the former..
ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really a waiting reserve but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may be long as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
You don't need to hold the reservation while performing the wait itself though, you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with reservation held, for example to switch GPU engine or to delete a GPU bind, waiting for reserve will in many case mean wait for GPU.
This example sounds inefficient, I know nouveau can do this, but this essentially just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau could be improved simply by fixing those cases up instead, since it stalls the application completely too for other uses.
Please see the Nouveau cpu_prep patch as well.
While there are a number of cases that can be fixed up, also in Radeon, there's no way around that reservation is a heavyweight lock that, particularly on simpler hardware without support for fence ordering with barriers and / or "semaphores" and accelerated eviction will be held while waiting for idle.
As such, it is unsuitable to protect read access to the fence pointer. If the intention is to keep a single fence pointer I think the best solution is to allow reading the fence pointer outside reservation, but make sure this can be done atomically. If the intention is to protect an array or list of fence pointers, I think a spinlock is the better solution.
/Thomas
Just wanted to point out that, like Thomas, I am concerned about having to have the object reserved when waiting on its associated fence. I fear it will hurt us somehow.
I will try to spend a couple of days stress testing your branch on radeon, trying to see if it hurts performance at all with current use cases.
Cheers, Jerome
On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse j.glisse@gmail.com wrote:
On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:55, Thomas Hellstrom schreef:
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:02, Thomas Hellstrom schreef:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
Op 18-10-12 09:59, Thomas Hellstrom schreef:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU, and have the buffer reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the former..
ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really a waiting reserve but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may be long as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
You don't need to hold the reservation while performing the wait itself though, you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with reservation held, for example to switch GPU engine or to delete a GPU bind, waiting for reserve will in many case mean wait for GPU.
This example sounds inefficient, I know nouveau can do this, but this essentially just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau could be improved simply by fixing those cases up instead, since it stalls the application completely too for other uses.
Please see the Nouveau cpu_prep patch as well.
While there are a number of cases that can be fixed up, also in Radeon, there's no way around that reservation is a heavyweight lock that, particularly on simpler hardware without support for fence ordering with barriers and / or "semaphores" and accelerated eviction will be held while waiting for idle.
As such, it is unsuitable to protect read access to the fence pointer. If the intention is to keep a single fence pointer I think the best solution is to allow reading the fence pointer outside reservation, but make sure this can be done atomically. If the intention is to protect an array or list of fence pointers, I think a spinlock is the better solution.
/Thomas
Just wanted to point out that like Thomas i am concern about having to have object reserved when waiting on its associated fence. I fear it will hurt us somehow.
I will try to spend couple days stress testing your branch on radeon trying to see if it hurts performance anyhow with current use case.
I've been trying to figure out what to do with Maarten's patches going forward since they are essential for all kinds of SoC people,
However I'm still not 100% sure I believe that the problem is anything more than a micro-optimisation, and possibly a premature one at that. This needs someone to start suggesting benchmarks we can run and a driver set to run them on, otherwise I'm starting to tend towards the view that we are talking about an optimisation we can fix later,
The other option is to somehow merge this stuff under an option that allows us to test it using a command line option, but I don't think that is sane either,
So Maarten, Jerome, Thomas, please start discussing actual real world tests you think merging this code will affect and then I can make a better consideration.
Dave.
On 03/21/2014 12:55 AM, Dave Airlie wrote:
On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse j.glisse@gmail.com wrote:
On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:55, Thomas Hellstrom schreef:
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:02, Thomas Hellstrom schreef:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
Op 18-10-12 09:59, Thomas Hellstrom schreef:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU, and have the buffer reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the former..
ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really a waiting reserve but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may be long as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
You don't need to hold the reservation while performing the wait itself though, you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with reservation held, for example to switch GPU engine or to delete a GPU bind, waiting for reserve will in many case mean wait for GPU.
This example sounds inefficient, I know nouveau can do this, but this essentially just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau could be improved simply by fixing those cases up instead, since it stalls the application completely too for other uses.
Please see the Nouveau cpu_prep patch as well.
While there are a number of cases that can be fixed up, also in Radeon, there's no way around that reservation is a heavyweight lock that, particularly on simpler hardware without support for fence ordering with barriers and / or "semaphores" and accelerated eviction will be held while waiting for idle.
As such, it is unsuitable to protect read access to the fence pointer. If the intention is to keep a single fence pointer I think the best solution is to allow reading the fence pointer outside reservation, but make sure this can be done atomically. If the intention is to protect an array or list of fence pointers, I think a spinlock is the better solution.
/Thomas
Just wanted to point out that like Thomas i am concern about having to have object reserved when waiting on its associated fence. I fear it will hurt us somehow.
I will try to spend couple days stress testing your branch on radeon trying to see if it hurts performance anyhow with current use case.
I've been trying to figure out what to do with Maarten's patches going forward since they are essential for all kinds of SoC people,
However I'm still not 100% sure I believe either the fact that the problem is anything more than a microoptimisation, and possibly a premature one at that, this needs someone to start suggesting benchmarks we can run and a driver set to run them on, otherwise I'm starting to tend towards we are taking about an optimisation we can fix later,
The other option is to somehow merge this stuff under an option that allows us to test it using a command line option, but I don't think that is sane either,
So Maarten, Jerome, Thomas, please start discussing actual real world tests you think merging this code will affect and then I can make a better consideration.
Dave.
Dave,
This is IMHO a fundamental design discussion, not a micro-optimization.
I'm pretty sure all sorts of horrendous things could be done to the DRM design without affecting real world application performance.
In TTM data protection is primarily done with spinlocks: This serves two purposes.
a) You can't unnecessarily hold a data protection lock while waiting for GPU, which is typically a very stupid thing to do (perhaps not so in this particular case) but when the sleep while atomic or locking inversion kernel warning turns up, that should at least ring a bell. Trying to implement dma-buf fencing using the TTM fencing callbacks will AFAICT cause a locking inversion.
b) Spinlocks essentially go away on UP systems. The whole reservation path was essentially lock-free on UP systems pre dma-buf integration, and with very few atomic locking operations even on SMP systems. It was all prompted by a test some years ago (by Eric Anholt IIRC) showing that locking operations turned up quite high on Intel driver profiling.
If we start protecting data with reservations, while we also export the reservation locks so that people find them a convenient "serialize work on this buffer" lock, all kinds of interesting things might happen, and we might end up in a situation similar to protecting everything with struct_mutex.
This is why I dislike this change. In particular when there is a very simple remedy.
But if I can get an ACK to convert the reservation object fence pointers and associated operations on them to be rcu-safe when I have some time left, I'd be prepared to accept this patch series in its current state.
/Thomas
Hey,
op 21-03-14 09:27, Thomas Hellstrom schreef:
On 03/21/2014 12:55 AM, Dave Airlie wrote:
On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse j.glisse@gmail.com wrote:
On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:55, Thomas Hellstrom schreef:
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:02, Thomas Hellstrom schreef:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
Op 18-10-12 09:59, Thomas Hellstrom schreef:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU, and have the buffer reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the former..
ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really a waiting reserve but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may be long as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
You don't need to hold the reservation while performing the wait itself though, you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with reservation held, for example to switch GPU engine or to delete a GPU bind, waiting for reserve will in many case mean wait for GPU.
This example sounds inefficient, I know nouveau can do this, but this essentially just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau could be improved simply by fixing those cases up instead, since it stalls the application completely too for other uses.
Please see the Nouveau cpu_prep patch as well.
While there are a number of cases that can be fixed up, also in Radeon, there's no way around that reservation is a heavyweight lock that, particularly on simpler hardware without support for fence ordering with barriers and / or "semaphores" and accelerated eviction will be held while waiting for idle.
As such, it is unsuitable to protect read access to the fence pointer. If the intention is to keep a single fence pointer I think the best solution is to allow reading the fence pointer outside reservation, but make sure this can be done atomically. If the intention is to protect an array or list of fence pointers, I think a spinlock is the better solution.
/Thomas
Just wanted to point out that like Thomas i am concern about having to have object reserved when waiting on its associated fence. I fear it will hurt us somehow.
I will try to spend couple days stress testing your branch on radeon trying to see if it hurts performance anyhow with current use case.
I've been trying to figure out what to do with Maarten's patches going forward since they are essential for all kinds of SoC people,
However I'm still not 100% sure I believe either the fact that the problem is anything more than a microoptimisation, and possibly a premature one at that, this needs someone to start suggesting benchmarks we can run and a driver set to run them on, otherwise I'm starting to tend towards we are taking about an optimisation we can fix later,
The other option is to somehow merge this stuff under an option that allows us to test it using a command line option, but I don't think that is sane either,
So Maarten, Jerome, Thomas, please start discussing actual real world tests you think merging this code will affect and then I can make a better consideration.
Dave.
Dave,
This is IMHO a fundamental design discussion, not a micro-optimization.
I'm pretty sure all sorts of horrendous things could be done to the DRM design without affecting real world application performance.
In TTM data protection is primarily done with spinlocks: This serves two purposes.
a) You can't unnecessarily hold a data protection lock while waiting for GPU, which is typically a very stupid thing to do (perhaps not so in this particular case) but when the sleep while atomic or locking inversion kernel warning turns up, that should at least ring a bell. Trying to implement dma-buf fencing using the TTM fencing callbacks will AFAICT cause a locking inversion.
b) Spinlocks essentially go away on UP systems. The whole reservation path was essentially lock-free on UP systems pre dma-buf integration, and with very few atomic locking operations even on SMP systems. It was all prompted by a test some years ago (by Eric Anholt IIRC) showing that locking operations turned up quite high on Intel driver profiling.
If we start protecting data with reservations, when we also export the reservation locks, so that people find them a convenient "serialize work on this buffer" lock, all kind of interesting things might happen, and we might end up in a situation similar to protecting everything with struct_mutex.
This is why I dislike this change. In particular when there is a very simple remedy.
But if I can get an ACK to convert the reservation object fence pointers and associated operations on them to be rcu-safe when I have some time left, I'd be prepared to accept this patch series in it's current state.
RCU could be a useful way to deal with this. But in my case I've shown there are very few places where it's needed; core ttm does not need it at all. This is why I wanted to leave it to individual drivers to implement it.
I think kfree_rcu for freeing in the driver itself should be enough, and obtaining a reference in the driver would require something like this, similar to what's in rcuref.txt:
rcu_read_lock();
f = rcu_dereference(bo->fence);
if (f && !kref_get_unless_zero(&f->kref))
	f = NULL;
rcu_read_unlock();

if (f) {
	// do stuff here
	...

	// free fence
	kref_put(&f->kref, fence_put_with_kfree_rcu);
}
Am I wrong or is something like this enough to make sure the fence is still alive? There might still be a small bug when bo->fence's refcount is decreased before bo->fence is set to NULL. I haven't checked core ttm, so that might need fixing.
I added some more people to CC. It might be worthwhile having this in the core fence code and delete all fences with rcu, but I'm not completely certain about that yet.
~Maarten
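One possible shape for the hypothetical fence_put_with_kfree_rcu callback used in the snippet above, assuming struct fence keeps its kref in the refcount member and gains an rcu_head (as the WIP diff later in this thread adds); purely illustrative:

#include <linux/kref.h>
#include <linux/rcupdate.h>
#include <linux/fence.h>

/* Hypothetical release callback: defer the actual kfree() until after an
 * RCU grace period, so rcu_read_lock() readers can still dereference the
 * fence pointer they loaded before the refcount hit zero. */
static void fence_put_with_kfree_rcu(struct kref *kref)
{
	struct fence *fence = container_of(kref, struct fence, refcount);

	kfree_rcu(fence, rcu);
}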
On 03/21/2014 01:12 PM, Maarten Lankhorst wrote:
Hey,
op 21-03-14 09:27, Thomas Hellstrom schreef:
On 03/21/2014 12:55 AM, Dave Airlie wrote:
On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse j.glisse@gmail.com wrote:
On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:55, Thomas Hellstrom schreef:
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:02, Thomas Hellstrom schreef:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
Op 18-10-12 09:59, Thomas Hellstrom schreef:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU, and have the buffer reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the former..
ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really a waiting reserve but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may be long as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
You don't need to hold the reservation while performing the wait itself though, you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with reservation held, for example to switch GPU engine or to delete a GPU bind, waiting for reserve will in many case mean wait for GPU.
This example sounds inefficient, I know nouveau can do this, but this essentially just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau could be improved simply by fixing those cases up instead, since it stalls the application completely too for other uses.
Please see the Nouveau cpu_prep patch as well.
While there are a number of cases that can be fixed up, also in Radeon, there's no way around that reservation is a heavyweight lock that, particularly on simpler hardware without support for fence ordering with barriers and / or "semaphores" and accelerated eviction will be held while waiting for idle.
As such, it is unsuitable to protect read access to the fence pointer. If the intention is to keep a single fence pointer I think the best solution is to allow reading the fence pointer outside reservation, but make sure this can be done atomically. If the intention is to protect an array or list of fence pointers, I think a spinlock is the better solution.
/Thomas
Just wanted to point out that like Thomas i am concern about having to have object reserved when waiting on its associated fence. I fear it will hurt us somehow.
I will try to spend couple days stress testing your branch on radeon trying to see if it hurts performance anyhow with current use case.
I've been trying to figure out what to do with Maarten's patches going forward since they are essential for all kinds of SoC people,
However I'm still not 100% sure I believe either the fact that the problem is anything more than a microoptimisation, and possibly a premature one at that, this needs someone to start suggesting benchmarks we can run and a driver set to run them on, otherwise I'm starting to tend towards we are taking about an optimisation we can fix later,
The other option is to somehow merge this stuff under an option that allows us to test it using a command line option, but I don't think that is sane either,
So Maarten, Jerome, Thomas, please start discussing actual real world tests you think merging this code will affect and then I can make a better consideration.
Dave.
Dave,
This is IMHO a fundamental design discussion, not a micro-optimization.
I'm pretty sure all sorts of horrendous things could be done to the DRM design without affecting real world application performance.
In TTM data protection is primarily done with spinlocks: This serves two purposes.
a) You can't unnecessarily hold a data protection lock while waiting for GPU, which is typically a very stupid thing to do (perhaps not so in this particular case) but when the sleep while atomic or locking inversion kernel warning turns up, that should at least ring a bell. Trying to implement dma-buf fencing using the TTM fencing callbacks will AFAICT cause a locking inversion.
b) Spinlocks essentially go away on UP systems. The whole reservation path was essentially lock-free on UP systems pre dma-buf integration, and with very few atomic locking operations even on SMP systems. It was all prompted by a test some years ago (by Eric Anholt IIRC) showing that locking operations turned up quite high on Intel driver profiling.
If we start protecting data with reservations, when we also export the reservation locks, so that people find them a convenient "serialize work on this buffer" lock, all kind of interesting things might happen, and we might end up in a situation similar to protecting everything with struct_mutex.
This is why I dislike this change. In particular when there is a very simple remedy.
But if I can get an ACK to convert the reservation object fence pointers and associated operations on them to be rcu-safe when I have some time left, I'd be prepared to accept this patch series in it's current state.
RCU could be a useful way to deal with this. But in my case I've shown there are very few places where it's needed, core ttm does not need it at all. This is why I wanted to leave it to individual drivers to implement it.
I think kfree_rcu for free in the driver itself should be enough, and obtaining in the driver would require something like this, similar to whats in rcuref.txt:
rcu_read_lock();
f = rcu_dereference(bo->fence);
if (f && !kref_get_unless_zero(&f->kref))
	f = NULL;
rcu_read_unlock();

if (f) {
	// do stuff here
	...

	// free fence
	kref_put(&f->kref, fence_put_with_kfree_rcu);
}
Am I wrong or is something like this enough to make sure fence is still alive?
No, you're correct.
And a bo idle check would amount to the following (since we don't care if the fence refcount is zero):
rcu_read_lock();
f = rcu_dereference(bo->fence);
signaled = !f || f->signaled;
rcu_read_unlock();
/Thomas
There might still be a small bug when bo->fence's refcount is decreased before bo->fence is set to null. I haven't checked core ttm so that might need fixing.
I added some more people to CC. It might be worthwhile having this in the core fence code and delete all fences with rcu, but I'm not completely certain about that yet.
~Maarten
Hey,
op 21-03-14 14:04, Thomas Hellstrom schreef:
On 03/21/2014 01:12 PM, Maarten Lankhorst wrote:
Hey,
op 21-03-14 09:27, Thomas Hellstrom schreef:
On 03/21/2014 12:55 AM, Dave Airlie wrote:
On Fri, Oct 19, 2012 at 3:04 AM, Jerome Glisse j.glisse@gmail.com wrote:
On Thu, Oct 18, 2012 at 06:43:40PM +0200, Thomas Hellstrom wrote:
On 10/18/2012 04:45 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:55, Thomas Hellstrom schreef:
On 10/18/2012 01:38 PM, Maarten Lankhorst wrote:
Op 18-10-12 13:02, Thomas Hellstrom schreef:
On 10/18/2012 10:37 AM, Maarten Lankhorst wrote:
Hey,
Op 18-10-12 09:59, Thomas Hellstrom schreef:
On 10/18/2012 09:28 AM, Thomas Hellstrom wrote:
Hi, Maarten,
As you know I have been having my doubts about this change. To me it seems insane to be forced to read the fence pointer under a reserved lock, simply because when you take the reserve lock, another process may have it and there is a substantial chance that that process will also be waiting for idle while holding the reserve lock.
I think it makes perfect sense, the only times you want to read the fence is when you want to change the members protected by the reservation.
No, that's not true. A typical case (or the only case) is where you want to do a map with no_wait semantics. You will want to be able to access a buffer's results even if the eviction code is about to schedule an unbind from the GPU, and have the buffer reserved?
Well either block on reserve or return -EBUSY if reserved, presumably the former..
ttm_bo_vm_fault does the latter already, anyway
ttm_bo_vm_fault only trylocks to avoid a deadlock with mmap_sem, it's really a waiting reserve but for different reasons. Typically a user-space app will keep asynchronous maps to TTM during a buffer lifetime, and the buffer lifetime may be long as user-space apps keep caches. That's not the same as accessing a buffer after the GPU is done with it.
Ah indeed.
You don't need to hold the reservation while performing the wait itself though, you could check if ttm_bo_wait(no_wait_gpu = true) returns -EBUSY, and if so take a ref to the sync_obj member and then wait after unreserving. You won't reset sync_obj member to NULL in that case, but that should be harmless. This will allow you to keep the reservations fast and short. Maybe a helper would be appropriate for this since radeon and nouveau both seem to do this.
The problem is that as long as other users are waiting for idle with reservation held, for example to switch GPU engine or to delete a GPU bind, waiting for reserve will in many case mean wait for GPU.
This example sounds inefficient, I know nouveau can do this, but this essentially just stalls the gpu entirely. I think guess you mean functions like nouveau_gem_object_close? It wouldn't surprise me if performance in nouveau could be improved simply by fixing those cases up instead, since it stalls the application completely too for other uses.
Please see the Nouveau cpu_prep patch as well.
While there are a number of cases that can be fixed up, also in Radeon, there's no way around that reservation is a heavyweight lock that, particularly on simpler hardware without support for fence ordering with barriers and / or "semaphores" and accelerated eviction will be held while waiting for idle.
As such, it is unsuitable to protect read access to the fence pointer. If the intention is to keep a single fence pointer I think the best solution is to allow reading the fence pointer outside reservation, but make sure this can be done atomically. If the intention is to protect an array or list of fence pointers, I think a spinlock is the better solution.
/Thomas
Just wanted to point out that like Thomas i am concern about having to have object reserved when waiting on its associated fence. I fear it will hurt us somehow.
I will try to spend couple days stress testing your branch on radeon trying to see if it hurts performance anyhow with current use case.
I've been trying to figure out what to do with Maarten's patches going forward since they are essential for all kinds of SoC people,
However I'm still not 100% sure I believe either the fact that the problem is anything more than a microoptimisation, and possibly a premature one at that, this needs someone to start suggesting benchmarks we can run and a driver set to run them on, otherwise I'm starting to tend towards we are taking about an optimisation we can fix later,
The other option is to somehow merge this stuff under an option that allows us to test it using a command line option, but I don't think that is sane either,
So Maarten, Jerome, Thomas, please start discussing actual real world tests you think merging this code will affect and then I can make a better consideration.
Dave.
Dave,
This is IMHO a fundamental design discussion, not a micro-optimization.
I'm pretty sure all sorts of horrendous things could be done to the DRM design without affecting real world application performance.
In TTM data protection is primarily done with spinlocks: This serves two purposes.
a) You can't unnecessarily hold a data protection lock while waiting for GPU, which is typically a very stupid thing to do (perhaps not so in this particular case) but when the sleep while atomic or locking inversion kernel warning turns up, that should at least ring a bell. Trying to implement dma-buf fencing using the TTM fencing callbacks will AFAICT cause a locking inversion.
b) Spinlocks essentially go away on UP systems. The whole reservation path was essentially lock-free on UP systems pre dma-buf integration, and with very few atomic locking operations even on SMP systems. It was all prompted by a test some years ago (by Eric Anholt IIRC) showing that locking operations turned up quite high on Intel driver profiling.
If we start protecting data with reservations, when we also export the reservation locks, so that people find them a convenient "serialize work on this buffer" lock, all kind of interesting things might happen, and we might end up in a situation similar to protecting everything with struct_mutex.
This is why I dislike this change. In particular when there is a very simple remedy.
But if I can get an ACK to convert the reservation object fence pointers and associated operations on them to be rcu-safe when I have some time left, I'd be prepared to accept this patch series in it's current state.
RCU could be a useful way to deal with this. But in my case I've shown there are very few places where it's needed, core ttm does not need it at all. This is why I wanted to leave it to individual drivers to implement it.
I think kfree_rcu for free in the driver itself should be enough, and obtaining in the driver would require something like this, similar to whats in rcuref.txt:
rcu_read_lock();
f = rcu_dereference(bo->fence);
if (f && !kref_get_unless_zero(&f->kref))
	f = NULL;
rcu_read_unlock();

if (f) {
	// do stuff here
	...

	// free fence
	kref_put(&f->kref, fence_put_with_kfree_rcu);
}
Am I wrong or is something like this enough to make sure fence is still alive?
No, you're correct.
And a bo check for idle would amount to (since we don't care if the fence refcount is zero).
rcu_read_lock();
f = rcu_dereference(bo->fence);
signaled = !f || f->signaled;
rcu_read_unlock();
/Thomas
This is very easy to implement when there is only 1 fence slot, bo->fence being equal to reservation_object->fence_excl. It appears to break down slightly when I implement it on top of shared fences.
My diff is against git://git.linaro.org/people/sumit.semwal/linux-3.x.git for-next-fences. shared_max_fence is held in reservation_object to prevent the reallocation in reservation_object_reserve_shared_fence every time the same reservation_object gets more than 4 shared fences; if it happens once, it's likely going to happen again on the same object.
Anyhow, does the below look sane to you? This has only been tested by my compiler; I haven't checked if this boots.
---
commit c73b87d33fd08f7c1b0a381b08b3626128f8f3b8
Author: Maarten Lankhorst maarten.lankhorst@canonical.com
Date: Tue Mar 25 15:10:21 2014 +0100
add rcu to fence HACK WIP DO NOT USE KILLS YOUR POT PLANTS PETS AND FAMILY
diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c index 96338bf7f457..0a88c10b3db9 100644 --- a/drivers/base/dma-buf.c +++ b/drivers/base/dma-buf.c @@ -134,7 +134,10 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll) { struct dma_buf *dmabuf; struct reservation_object *resv; + struct reservation_object_shared *shared; + struct fence *fence_excl; unsigned long events; + unsigned shared_count;
dmabuf = file->private_data; if (!dmabuf || !dmabuf->resv) @@ -148,14 +151,18 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll) if (!events) return 0;
- ww_mutex_lock(&resv->lock, NULL); + rcu_read_lock();
- if (resv->fence_excl && (!(events & POLLOUT) || - resv->fence_shared_count == 0)) { + shared = rcu_dereference(resv->shared); + fence_excl = rcu_dereference(resv->fence_excl); + shared_count = ACCESS_ONCE(shared->count); + + if (fence_excl && (!(events & POLLOUT) || + (!shared || shared_count == 0))) { struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_excl; unsigned long pevents = POLLIN;
- if (resv->fence_shared_count == 0) + if (!shared || shared_count == 0) pevents |= POLLOUT;
spin_lock_irq(&dmabuf->poll.lock); @@ -167,19 +174,26 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll) spin_unlock_irq(&dmabuf->poll.lock);
if (events & pevents) { - if (!fence_add_callback(resv->fence_excl, - &dcb->cb, dma_buf_poll_cb)) + if (!kref_get_unless_zero(&fence_excl->refcount)) { + /* force a recheck */ + events &= ~pevents; + dma_buf_poll_cb(NULL, &dcb->cb); + } else if (!fence_add_callback(fence_excl, &dcb->cb, + dma_buf_poll_cb)) { events &= ~pevents; - else + fence_put(fence_excl); + } else { /* * No callback queued, wake up any additional * waiters. */ + fence_put(fence_excl); dma_buf_poll_cb(NULL, &dcb->cb); + } } }
- if ((events & POLLOUT) && resv->fence_shared_count > 0) { + if ((events & POLLOUT) && shared && shared_count > 0) { struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_shared; int i;
@@ -194,20 +208,34 @@ static unsigned int dma_buf_poll(struct file *file, poll_table *poll) if (!(events & POLLOUT)) goto out;
- for (i = 0; i < resv->fence_shared_count; ++i) - if (!fence_add_callback(resv->fence_shared[i], - &dcb->cb, dma_buf_poll_cb)) { + for (i = 0; i < shared_count; ++i) { + struct fence *fence = ACCESS_ONCE(shared->fence[i]); + if (!kref_get_unless_zero(&fence->refcount)) { + /* + * fence refcount dropped to zero, this means + * that the shared object has been freed from under us. + * call dma_buf_poll_cb and force a recheck! + */ events &= ~POLLOUT; + dma_buf_poll_cb(NULL, &dcb->cb); break; } + if (!fence_add_callback(fence, &dcb->cb, + dma_buf_poll_cb)) { + fence_put(fence); + events &= ~POLLOUT; + break; + } + fence_put(fence); + }
/* No callback queued, wake up any additional waiters. */ - if (i == resv->fence_shared_count) + if (i == shared_count) dma_buf_poll_cb(NULL, &dcb->cb); }
out: - ww_mutex_unlock(&resv->lock); + rcu_read_unlock(); return events; }
diff --git a/drivers/base/fence.c b/drivers/base/fence.c index 8fff13fb86cf..be03a9e8ad8b 100644 --- a/drivers/base/fence.c +++ b/drivers/base/fence.c @@ -170,7 +170,7 @@ void release_fence(struct kref *kref) if (fence->ops->release) fence->ops->release(fence); else - kfree(fence); + kfree_rcu(fence, rcu); } EXPORT_SYMBOL(release_fence);
diff --git a/include/linux/fence.h b/include/linux/fence.h index 65f2a01ee7e4..d19620405c34 100644 --- a/include/linux/fence.h +++ b/include/linux/fence.h @@ -40,6 +40,7 @@ struct fence_cb; * struct fence - software synchronization primitive * @refcount: refcount for this fence * @ops: fence_ops associated with this fence + * @rcu: used for releasing fence with kfree_rcu * @cb_list: list of all callbacks to call * @lock: spin_lock_irqsave used for locking * @context: execution context this fence belongs to, returned by @@ -73,6 +74,7 @@ struct fence_cb; struct fence { struct kref refcount; const struct fence_ops *ops; + struct rcu_head rcu; struct list_head cb_list; spinlock_t *lock; unsigned context, seqno; diff --git a/include/linux/reservation.h b/include/linux/reservation.h index f3f57460a205..91576fabafdb 100644 --- a/include/linux/reservation.h +++ b/include/linux/reservation.h @@ -42,6 +42,7 @@ #include <linux/ww_mutex.h> #include <linux/fence.h> #include <linux/slab.h> +#include <linux/rcupdate.h>
extern struct ww_class reservation_ww_class;
@@ -49,8 +50,12 @@ struct reservation_object { struct ww_mutex lock;
struct fence *fence_excl; - struct fence **fence_shared; - u32 fence_shared_count, fence_shared_max; + u32 shared_max_fence; + struct reservation_object_shared { + struct rcu_head rcu; + u32 count; + struct fence *fence[]; + } *shared; };
static inline void @@ -58,8 +63,8 @@ reservation_object_init(struct reservation_object *obj) { ww_mutex_init(&obj->lock, &reservation_ww_class);
- obj->fence_shared_count = obj->fence_shared_max = 0; - obj->fence_shared = NULL; + obj->shared = NULL; + obj->shared_max_fence = 4; obj->fence_excl = NULL; }
@@ -70,11 +75,97 @@ reservation_object_fini(struct reservation_object *obj)
if (obj->fence_excl) fence_put(obj->fence_excl); - for (i = 0; i < obj->fence_shared_count; ++i) - fence_put(obj->fence_shared[i]); - kfree(obj->fence_shared); + if (obj->shared) { + for (i = 0; i < obj->shared->count; ++i) + fence_put(obj->shared->fence[i]); + + /* + * This object should be dead and all references must have + * been released to it, so no need to free with rcu. + */ + kfree(obj->shared); + }
ww_mutex_destroy(&obj->lock); }
+/* + * Reserve space to add a shared fence to a reservation_object, + * must be called with obj->lock held. + */ +static inline int +reservation_object_reserve_shared_fence(struct reservation_object *obj) +{ + struct reservation_object_shared *shared, *old; + u32 max = obj->shared_max_fence; + + if (obj->shared) { + if (obj->shared->count < max) + return 0; + max *= 2; + } + + shared = kmalloc(offsetof(typeof(*shared), fence[max]), GFP_KERNEL); + if (!shared) + return -ENOMEM; + old = obj->shared; + + if (old) { + shared->count = old->count; + memcpy(shared->fence, old->fence, old->count * sizeof(*old->fence)); + } else { + shared->count = 0; + } + rcu_assign_pointer(obj->shared, shared); + obj->shared_max_fence = max; + kfree_rcu(old, rcu); + return 0; +} + +/* + * Add a fence to a shared slot, obj->lock must be held, and + * reservation_object_reserve_shared_fence has been called. + */ + +static inline void +reservation_object_add_shared_fence(struct reservation_object *obj, + struct fence *fence) +{ + unsigned i; + + BUG_ON(obj->shared->count == obj->shared_max_fence); + fence_get(fence); + + for (i = 0; i < obj->shared->count; ++i) + if (obj->shared->fence[i]->context == fence->context) { + struct fence *old = obj->shared->fence[i]; + rcu_assign_pointer(obj->shared->fence[i], fence); + fence_put(old); + return; + } + + obj->shared->fence[obj->shared->count] = fence; + smp_wmb(); + obj->shared->count++; +} + +/* + * May be called after adding an exclusive to wipe all shared fences. + */ + +static inline void +reservation_object_clear_shared(struct reservation_object *obj) +{ + struct reservation_object_shared *old = obj->shared; + unsigned i; + + if (!old) + return; + + rcu_assign_pointer(obj->shared, NULL); + for (i = 0; i < old->count; ++i) + fence_put(old->fence[i]); + kfree_rcu(old, rcu); +} + #endif /* _LINUX_RESERVATION_H */
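For completeness, a rough reader-side counterpart to the writer-side changes above, reusing the reservation_object / reservation_object_shared layout and the kfree_rcu freeing from this WIP diff (the function name and exact memory-ordering details are illustrative only, not part of the patch):

#include <linux/fence.h>
#include <linux/rcupdate.h>
#include <linux/reservation.h>

/* Sketch: test whether a buffer is idle without taking the ww_mutex,
 * in the spirit of the single-pointer example earlier in the thread. */
static bool reservation_object_test_signaled_rcu(struct reservation_object *obj)
{
	struct reservation_object_shared *shared;
	struct fence *fence_excl;
	bool signaled = true;
	u32 i, count;

	rcu_read_lock();
	fence_excl = rcu_dereference(obj->fence_excl);
	if (fence_excl)
		signaled = fence_is_signaled(fence_excl);

	shared = rcu_dereference(obj->shared);
	if (shared) {
		count = ACCESS_ONCE(shared->count);
		for (i = 0; signaled && i < count; ++i) {
			struct fence *f = ACCESS_ONCE(shared->fence[i]);

			/* Only safe without taking a reference because fences
			 * are freed with kfree_rcu in the diff above. */
			signaled = fence_is_signaled(f);
		}
	}
	rcu_read_unlock();

	return signaled;
}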