Signed-off-by: Maarten Lankhorst maarten.lankhorst@canonical.com --- drivers/gpu/drm/ttm/ttm_execbuf_util.c | 141 +++++++++++--------------------- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 1 - include/drm/ttm/ttm_execbuf_util.h | 3 - 3 files changed, 50 insertions(+), 95 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c index 9198755df086..ec36206da95a 100644 --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c @@ -32,20 +32,12 @@ #include <linux/sched.h> #include <linux/module.h>
-static void ttm_eu_backoff_reservation_locked(struct list_head *list) +static void ttm_eu_backoff_reservation_reverse(struct list_head *list, + struct ttm_validate_buffer *entry) { - struct ttm_validate_buffer *entry; - - list_for_each_entry(entry, list, head) { + list_for_each_entry_continue_reverse(entry, list, head) { struct ttm_buffer_object *bo = entry->bo; - if (!entry->reserved) - continue;
- entry->reserved = false; - if (entry->removed) { - ttm_bo_add_to_lru(bo); - entry->removed = false; - } ww_mutex_unlock(&bo->resv->lock); } } @@ -56,27 +48,9 @@ static void ttm_eu_del_from_lru_locked(struct list_head *list)
list_for_each_entry(entry, list, head) { struct ttm_buffer_object *bo = entry->bo; - if (!entry->reserved) - continue; - - if (!entry->removed) { - entry->put_count = ttm_bo_del_from_lru(bo); - entry->removed = true; - } - } -} - -static void ttm_eu_list_ref_sub(struct list_head *list) -{ - struct ttm_validate_buffer *entry; - - list_for_each_entry(entry, list, head) { - struct ttm_buffer_object *bo = entry->bo; + unsigned put_count = ttm_bo_del_from_lru(bo);
- if (entry->put_count) { - ttm_bo_list_ref_sub(bo, entry->put_count, true); - entry->put_count = 0; - } + ttm_bo_list_ref_sub(bo, put_count, true); } }
@@ -91,11 +65,18 @@ void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
entry = list_first_entry(list, struct ttm_validate_buffer, head); glob = entry->bo->glob; + spin_lock(&glob->lru_lock); - ttm_eu_backoff_reservation_locked(list); + list_for_each_entry(entry, list, head) { + struct ttm_buffer_object *bo = entry->bo; + + ttm_bo_add_to_lru(bo); + ww_mutex_unlock(&bo->resv->lock); + } + spin_unlock(&glob->lru_lock); + if (ticket) ww_acquire_fini(ticket); - spin_unlock(&glob->lru_lock); } EXPORT_SYMBOL(ttm_eu_backoff_reservation);
@@ -121,64 +102,56 @@ int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket, if (list_empty(list)) return 0;
- list_for_each_entry(entry, list, head) { - entry->reserved = false; - entry->put_count = 0; - entry->removed = false; - } - entry = list_first_entry(list, struct ttm_validate_buffer, head); glob = entry->bo->glob;
if (ticket) ww_acquire_init(ticket, &reservation_ww_class); -retry: + list_for_each_entry(entry, list, head) { struct ttm_buffer_object *bo = entry->bo;
- /* already slowpath reserved? */ - if (entry->reserved) - continue; - ret = ttm_bo_reserve_nolru(bo, intr, (ticket == NULL), true, ticket);
- if (ret == -EDEADLK) { - /* uh oh, we lost out, drop every reservation and try - * to only reserve this buffer, then start over if - * this succeeds. - */ - BUG_ON(ticket == NULL); - spin_lock(&glob->lru_lock); - ttm_eu_backoff_reservation_locked(list); - spin_unlock(&glob->lru_lock); - ttm_eu_list_ref_sub(list); - - if (intr) { - ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock, - ticket); - if (unlikely(ret != 0)) { - if (ret == -EINTR) - ret = -ERESTARTSYS; - goto err_fini; - } - } else - ww_mutex_lock_slow(&bo->resv->lock, ticket); - - entry->reserved = true; - if (unlikely(atomic_read(&bo->cpu_writers) > 0)) { - ret = -EBUSY; - goto err; - } - goto retry; - } else if (ret) - goto err; + if (!ret && unlikely(atomic_read(&bo->cpu_writers) > 0)) { + ww_mutex_unlock(&bo->resv->lock);
- entry->reserved = true; - if (unlikely(atomic_read(&bo->cpu_writers) > 0)) { ret = -EBUSY; - goto err; } + + if (!ret) + continue; + + /* uh oh, we lost out, drop every reservation and try + * to only reserve this buffer, then start over if + * this succeeds. + */ + ttm_eu_backoff_reservation_reverse(list, entry); + + if (ret == -EDEADLK && intr) { + ret = ww_mutex_lock_slow_interruptible(&bo->resv->lock, + ticket); + } else if (ret == -EDEADLK) { + ww_mutex_lock_slow(&bo->resv->lock, ticket); + ret = 0; + } + + if (unlikely(ret != 0)) { + if (ret == -EINTR) + ret = -ERESTARTSYS; + if (ticket) { + ww_acquire_done(ticket); + ww_acquire_fini(ticket); + } + return ret; + } + + /* move this item to the front of the list, + * forces correct iteration of the loop without keeping track + */ + list_del(&entry->head); + list_add(&entry->head, list); }
if (ticket) @@ -186,20 +159,7 @@ retry: spin_lock(&glob->lru_lock); ttm_eu_del_from_lru_locked(list); spin_unlock(&glob->lru_lock); - ttm_eu_list_ref_sub(list); return 0; - -err: - spin_lock(&glob->lru_lock); - ttm_eu_backoff_reservation_locked(list); - spin_unlock(&glob->lru_lock); - ttm_eu_list_ref_sub(list); -err_fini: - if (ticket) { - ww_acquire_done(ticket); - ww_acquire_fini(ticket); - } - return ret; } EXPORT_SYMBOL(ttm_eu_reserve_buffers);
@@ -229,7 +189,6 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket, bo->sync_obj = driver->sync_obj_ref(sync_obj); ttm_bo_add_to_lru(bo); ww_mutex_unlock(&bo->resv->lock); - entry->reserved = false; } spin_unlock(&bdev->fence_lock); spin_unlock(&glob->lru_lock); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index 7ca48db74a09..8d87631b3eb8 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -264,7 +264,6 @@ static int vmw_bo_to_validate_list(struct vmw_sw_context *sw_context, ++sw_context->cur_val_buf; val_buf = &vval_buf->base; val_buf->bo = ttm_bo_reference(bo); - val_buf->reserved = false; list_add_tail(&val_buf->head, &sw_context->validate_nodes); }
diff --git a/include/drm/ttm/ttm_execbuf_util.h b/include/drm/ttm/ttm_execbuf_util.h index fd95fd569ca3..8490cb8ee0d8 100644 --- a/include/drm/ttm/ttm_execbuf_util.h +++ b/include/drm/ttm/ttm_execbuf_util.h @@ -48,9 +48,6 @@ struct ttm_validate_buffer { struct list_head head; struct ttm_buffer_object *bo; - bool reserved; - bool removed; - int put_count; void *old_sync_obj; };
Apart from some code inside ttm itself and nouveau_bo_vma_del, this is the only place where ttm_bo_wait is used without a reservation. Fix this so we can remove the fence_lock later on.
Signed-off-by: Maarten Lankhorst maarten.lankhorst@canonical.com --- drivers/gpu/drm/nouveau/nouveau_gem.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 78a27f8ad7d9..24e9c58da8aa 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -894,17 +894,31 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data, struct drm_gem_object *gem; struct nouveau_bo *nvbo; bool no_wait = !!(req->flags & NOUVEAU_GEM_CPU_PREP_NOWAIT); - int ret = -EINVAL; + int ret; + struct nouveau_fence *fence = NULL;
gem = drm_gem_object_lookup(dev, file_priv, req->handle); if (!gem) return -ENOENT; nvbo = nouveau_gem_object(gem);
- spin_lock(&nvbo->bo.bdev->fence_lock); - ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait); - spin_unlock(&nvbo->bo.bdev->fence_lock); + ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0); + if (!ret) { + spin_lock(&nvbo->bo.bdev->fence_lock); + ret = ttm_bo_wait(&nvbo->bo, true, true, true); + if (!no_wait && ret) + fence = nouveau_fence_ref(nvbo->bo.sync_obj); + spin_unlock(&nvbo->bo.bdev->fence_lock); + + ttm_bo_unreserve(&nvbo->bo); + } drm_gem_object_unreference_unlocked(gem); + + if (fence) { + ret = nouveau_fence_wait(fence, true, no_wait); + nouveau_fence_unref(&fence); + } + return ret; }
Maarten, for this and the other patches in this series,
I seem to recall we have this discussion before? IIRC I stated that reservation was a too heavy-weight lock to hold to determine whether a buffer was idle? It's a pretty nasty thing to build in.
/Thomas
On 01/21/2014 02:04 PM, Maarten Lankhorst wrote:
Apart from some code inside ttm itself and nouveau_bo_vma_del, this is the only place where ttm_bo_wait is used without a reservation. Fix this so we can remove the fence_lock later on.
Signed-off-by: Maarten Lankhorst maarten.lankhorst@canonical.com
drivers/gpu/drm/nouveau/nouveau_gem.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 78a27f8ad7d9..24e9c58da8aa 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -894,17 +894,31 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data, struct drm_gem_object *gem; struct nouveau_bo *nvbo; bool no_wait = !!(req->flags & NOUVEAU_GEM_CPU_PREP_NOWAIT);
- int ret = -EINVAL;
int ret;
struct nouveau_fence *fence = NULL;
gem = drm_gem_object_lookup(dev, file_priv, req->handle); if (!gem) return -ENOENT; nvbo = nouveau_gem_object(gem);
- spin_lock(&nvbo->bo.bdev->fence_lock);
- ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
- spin_unlock(&nvbo->bo.bdev->fence_lock);
- ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0);
- if (!ret) {
spin_lock(&nvbo->bo.bdev->fence_lock);
ret = ttm_bo_wait(&nvbo->bo, true, true, true);
if (!no_wait && ret)
fence = nouveau_fence_ref(nvbo->bo.sync_obj);
spin_unlock(&nvbo->bo.bdev->fence_lock);
ttm_bo_unreserve(&nvbo->bo);
- } drm_gem_object_unreference_unlocked(gem);
- if (fence) {
ret = nouveau_fence_wait(fence, true, no_wait);
nouveau_fence_unref(&fence);
- }
- return ret;
}
Hey,
op 21-01-14 16:17, Thomas Hellstrom schreef:
Maarten, for this and the other patches in this series,
I seem to recall we have this discussion before? IIRC I stated that reservation was a too heavy-weight lock to hold to determine whether a buffer was idle? It's a pretty nasty thing to build in.
I've sent this patch after determining that this already didn't end up being heavyweight. Most places were already using the fence_lock and reservation, I just fixed up the few places that didn't hold a reservation while waiting. Converting the few places that didn't ended up being trivial, so I thought I'd submit it.
If a tryreserve fails it's a good indication that the buffer is NOT idle, no need to check the fences too in that case.
I ended up converting this so I could use shared/exclusive fence slots internally in nouveau, allowing multiple readers to access the buffers in parallel. See commit "drm/nouveau: first stab at using shared fences for readable objects" at http://cgit.freedesktop.org/~mlankhorst/linux/log/
But doing this required killing fence_lock.
~Maarten
On 01/21/2014 04:29 PM, Maarten Lankhorst wrote:
Hey,
op 21-01-14 16:17, Thomas Hellstrom schreef:
Maarten, for this and the other patches in this series,
I seem to recall we have this discussion before? IIRC I stated that reservation was a too heavy-weight lock to hold to determine whether a buffer was idle? It's a pretty nasty thing to build in.
I've sent this patch after determining that this already didn't end up being heavyweight. Most places were already using the fence_lock and reservation, I just fixed up the few places that didn't hold a reservation while waiting. Converting the few places that didn't ended up being trivial, so I thought I'd submit it.
Actually the only *valid* reason for holding a reservation when waiting for idle is 1) You want to block further command submission on the buffer. 2) You want to switch GPU engine and don't have access to gpu semaphores / barriers.
Reservation has the nasty side effect that it blocks command submission and pins the buffer (in addition now makes the evict list traversals skip the buffer) which in general is *not* necessary for most wait cases, so we should instead actually convert the wait cases that don't fulfill 1) and 2) above in the other direction if we have performance and latency-reduction in mind. I can't see how a spinlock protecting a fence pointer or fence list is stopping you from using RW fences as long as the spinlock is held while manipulating the fence list?
/Thomas
op 21-01-14 18:44, Thomas Hellstrom schreef:
On 01/21/2014 04:29 PM, Maarten Lankhorst wrote:
Hey,
op 21-01-14 16:17, Thomas Hellstrom schreef:
Maarten, for this and the other patches in this series,
I seem to recall we have this discussion before? IIRC I stated that reservation was a too heavy-weight lock to hold to determine whether a buffer was idle? It's a pretty nasty thing to build in.
I've sent this patch after determining that this already didn't end up being heavyweight. Most places were already using the fence_lock and reservation, I just fixed up the few places that didn't hold a reservation while waiting. Converting the few places that didn't ended up being trivial, so I thought I'd submit it.
Actually the only *valid* reason for holding a reservation when waiting for idle is
- You want to block further command submission on the buffer.
- You want to switch GPU engine and don't have access to gpu semaphores
/ barriers.
Reservation has the nasty side effect that it blocks command submission and pins the buffer (in addition now makes the evict list traversals skip the buffer) which in general is *not* necessary for most wait cases, so we should instead actually convert the wait cases that don't fulfill 1) and 2) above in the other direction if we have performance and latency-reduction in mind. I can't see how a spinlock protecting a fence pointer or fence list is stopping you from using RW fences as long as the spinlock is held while manipulating the fence list?
You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the patchset, though) and enumerate if they can be changed to work without reservation or not.
ttm/ttm_bo.c ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to finish for the direct destroy fastpath, if either fails it needs to be queued. Cannot work without reservation. ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, doesn't need to re-acquire. Simply reordering ttm_bo_wait until after re-reserve is enough. ttm_bo_evict: already has the reservation, cannot be dropped since only trylock is allowed. Dropping reservation would cause badness, cannot be converted. ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop reservation for same reason as ttm_bo_evict. It might be part of a ticketed reservation so really don't drop lock here. ttm_bo_synccpu_write_grab: the wait could be converted to be done afterwards, without fence_lock. But in this case reservation could take the role of fence_lock too, so no separate fence_lock would be needed. ttm_bo_swapout: see ttm_bo_evict.
ttm/ttm_bo_util.c: ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see ttm_bo_move_buffer, can be called from that function.
ttm/ttm_bo_vm.c ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, but you already had the reservation, so a similar optimization to ttm_bo_synccpu_write_grab could be done without requiring fence_lock. If you would write it like that, you would end up with a patch similar to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I think we should do this, an
Ok, so the core does NOT need fence_lock because we can never drop reservations except in synccpu_write_grab and maybe ttm_bo_vm_fault_idle, but even in those cases reservation is done. So that could be used instead of fence_lock.
nouveau_gem_ioctl_cpu_prep: Either block on a global spinlock or a local reservation lock. Doesn't matter much which, I don't need the need to keep a global lock for this function... 2 cases can happen in the trylock reservation failure case: buffer is not reserved, so it's not in the process of being evicted. buffer is reserved, which means it's being used in command submission right now, or in one of the functions described above (eg not idle).
nouveau_gem_pushbuf_reloc_apply: has to call ttm_bo_wait with reservation, cannot be dropped.
So for core ttm and nouveau the fence_lock is never needed, radeon has only 1 function that calls ttm_bo_wait which uses a reservation too. It doesn't need the fence_lock either.
~Maarten
On 01/22/2014 09:19 AM, Maarten Lankhorst wrote:
op 21-01-14 18:44, Thomas Hellstrom schreef:
On 01/21/2014 04:29 PM, Maarten Lankhorst wrote:
Hey,
op 21-01-14 16:17, Thomas Hellstrom schreef:
Maarten, for this and the other patches in this series,
I seem to recall we have this discussion before? IIRC I stated that reservation was a too heavy-weight lock to hold to determine whether a buffer was idle? It's a pretty nasty thing to build in.
I've sent this patch after determining that this already didn't end up being heavyweight. Most places were already using the fence_lock and reservation, I just fixed up the few places that didn't hold a reservation while waiting. Converting the few places that didn't ended up being trivial, so I thought I'd submit it.
Actually the only *valid* reason for holding a reservation when waiting for idle is
- You want to block further command submission on the buffer.
- You want to switch GPU engine and don't have access to gpu semaphores
/ barriers.
Reservation has the nasty side effect that it blocks command submission and pins the buffer (in addition now makes the evict list traversals skip the buffer) which in general is *not* necessary for most wait cases, so we should instead actually convert the wait cases that don't fulfill 1) and 2) above in the other direction if we have performance and latency-reduction in mind. I can't see how a spinlock protecting a fence pointer or fence list is stopping you from using RW fences as long as the spinlock is held while manipulating the fence list?
You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the patchset, though) and enumerate if they can be changed to work without reservation or not.
ttm/ttm_bo.c ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to finish for the direct destroy fastpath, if either fails it needs to be queued. Cannot work without reservation.
Doesn't block and no significant reservation contention expected.
ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, doesn't need to re-acquire. Simply reordering ttm_bo_wait until after re-reserve is enough.
Currently follows the above rules.
ttm_bo_evict: already has the reservation, cannot be dropped since only trylock is allowed. Dropping reservation would cause badness, cannot be converted.
Follows rule 2 above. We're about to move the buffer and if that can't be pipelined using the GPU (which TTM currently doesn't allow), we need to wait. Although eviction should be low priority compared to new command submission, so I can't really see why we couldn't wait before trying to reserve here?
ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop reservation for same reason as ttm_bo_evict. It might be part of a ticketed reservation so really don't drop lock here.
Part of command submission and as such follows rule 2 above. If we can pipeline the move with the GPU, no need to wait (but needs to be implemented, of course).
ttm_bo_synccpu_write_grab: the wait could be converted to be done afterwards, without fence_lock. But in this case reservation could take the role of fence_lock too,
so no separate fence_lock would be needed.
With the exception that reservation is more likely to be contended.
ttm_bo_swapout: see ttm_bo_evict.
ttm/ttm_bo_util.c: ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see ttm_bo_move_buffer, can be called from that function.
Rule 2.
ttm/ttm_bo_vm.c ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, but you already had the reservation, so a similar optimization to ttm_bo_synccpu_write_grab could be done without requiring fence_lock. If you would write it like that, you would end up with a patch similar to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I think we should do this, an
Ok, so the core does NOT need fence_lock because we can never drop reservations except in synccpu_write_grab and maybe ttm_bo_vm_fault_idle, but even in those cases reservation is done. So that could be used instead of fence_lock.
nouveau_gem_ioctl_cpu_prep: Either block on a global spinlock or a local reservation lock. Doesn't matter much which, I don't need the need to keep a global lock for this function... 2 cases can happen in the trylock reservation failure case: buffer is not reserved, so it's not in the process of being evicted. buffer is reserved, which means it's being used in command submission right now, or in one of the functions described above (eg not idle).
nouveau_gem_pushbuf_reloc_apply: has to call ttm_bo_wait with reservation, cannot be dropped.
So for core ttm and nouveau the fence_lock is never needed, radeon has only 1 function that calls ttm_bo_wait which uses a reservation too. It doesn't need the fence_lock either.
And vmwgfx now also has a syccpu IOCTL (see drm-next).
So assuming that we converted the functions that can be converted to wait outside of reservation, the same way you have done with Nouveau, leaving the ones that fall under 1) and 2) above, I would still argue that a spinlock should be used because taking a reservation may implicitly mean wait for gpu, and could give bad performance- and latency charateristics. You shouldn't need to wait for gpu to check for buffer idle.
/Thomas
~Maarten
op 22-01-14 10:40, Thomas Hellstrom schreef:
On 01/22/2014 09:19 AM, Maarten Lankhorst wrote:
op 21-01-14 18:44, Thomas Hellstrom schreef:
On 01/21/2014 04:29 PM, Maarten Lankhorst wrote:
Hey,
op 21-01-14 16:17, Thomas Hellstrom schreef:
Maarten, for this and the other patches in this series,
I seem to recall we have this discussion before? IIRC I stated that reservation was a too heavy-weight lock to hold to determine whether a buffer was idle? It's a pretty nasty thing to build in.
I've sent this patch after determining that this already didn't end up being heavyweight. Most places were already using the fence_lock and reservation, I just fixed up the few places that didn't hold a reservation while waiting. Converting the few places that didn't ended up being trivial, so I thought I'd submit it.
Actually the only *valid* reason for holding a reservation when waiting for idle is
- You want to block further command submission on the buffer.
- You want to switch GPU engine and don't have access to gpu semaphores
/ barriers.
Reservation has the nasty side effect that it blocks command submission and pins the buffer (in addition now makes the evict list traversals skip the buffer) which in general is *not* necessary for most wait cases, so we should instead actually convert the wait cases that don't fulfill 1) and 2) above in the other direction if we have performance and latency-reduction in mind. I can't see how a spinlock protecting a fence pointer or fence list is stopping you from using RW fences as long as the spinlock is held while manipulating the fence list?
You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the patchset, though) and enumerate if they can be changed to work without reservation or not.
ttm/ttm_bo.c ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to finish for the direct destroy fastpath, if either fails it needs to be queued. Cannot work without reservation.
Doesn't block and no significant reservation contention expected.
ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, doesn't need to re-acquire. Simply reordering ttm_bo_wait until after re-reserve is enough.
Currently follows the above rules.
ttm_bo_evict: already has the reservation, cannot be dropped since only trylock is allowed. Dropping reservation would cause badness, cannot be converted.
Follows rule 2 above. We're about to move the buffer and if that can't be pipelined using the GPU (which TTM currently doesn't allow), we need to wait. Although eviction should be low priority compared to new command submission, so I can't really see why we couldn't wait before trying to reserve here?
ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop reservation for same reason as ttm_bo_evict. It might be part of a ticketed reservation so really don't drop lock here.
Part of command submission and as such follows rule 2 above. If we can pipeline the move with the GPU, no need to wait (but needs to be implemented, of course).
ttm_bo_synccpu_write_grab: the wait could be converted to be done afterwards, without fence_lock. But in this case reservation could take the role of fence_lock too,
so no separate fence_lock would be needed.
With the exception that reservation is more likely to be contended.
True but rule 1.
ttm_bo_swapout: see ttm_bo_evict.
ttm/ttm_bo_util.c: ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see ttm_bo_move_buffer, can be called from that function.
Rule 2.
ttm/ttm_bo_vm.c ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, but you already had the reservation, so a similar optimization to ttm_bo_synccpu_write_grab could be done without requiring fence_lock. If you would write it like that, you would end up with a patch similar to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I think we should do this, an
Ok, so the core does NOT need fence_lock because we can never drop reservations except in synccpu_write_grab and maybe ttm_bo_vm_fault_idle, but even in those cases reservation is done. So that could be used instead of fence_lock.
nouveau_gem_ioctl_cpu_prep: Either block on a global spinlock or a local reservation lock. Doesn't matter much which, I don't need the need to keep a global lock for this function... 2 cases can happen in the trylock reservation failure case: buffer is not reserved, so it's not in the process of being evicted. buffer is reserved, which means it's being used in command submission right now, or in one of the functions described above (eg not idle).
nouveau_gem_pushbuf_reloc_apply: has to call ttm_bo_wait with reservation, cannot be dropped.
So for core ttm and nouveau the fence_lock is never needed, radeon has only 1 function that calls ttm_bo_wait which uses a reservation too. It doesn't need the fence_lock either.
And vmwgfx now also has a syccpu IOCTL (see drm-next).
So assuming that we converted the functions that can be converted to wait outside of reservation, the same way you have done with Nouveau, leaving the ones that fall under 1) and 2) above, I would still argue that a spinlock should be used because taking a reservation may implicitly mean wait for gpu, and could give bad performance- and latency charateristics. You shouldn't need to wait for gpu to check for buffer idle.
Except that without reservation you can't tell if the buffer is really idle, or is currently being used as part of some command submission/eviction before the fence pointer is set.
~Maarten
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote:
op 22-01-14 10:40, Thomas Hellstrom schreef:
On 01/22/2014 09:19 AM, Maarten Lankhorst wrote:
op 21-01-14 18:44, Thomas Hellstrom schreef:
On 01/21/2014 04:29 PM, Maarten Lankhorst wrote:
Hey,
op 21-01-14 16:17, Thomas Hellstrom schreef:
Maarten, for this and the other patches in this series,
I seem to recall we have this discussion before? IIRC I stated that reservation was a too heavy-weight lock to hold to determine whether a buffer was idle? It's a pretty nasty thing to build in.
I've sent this patch after determining that this already didn't end up being heavyweight. Most places were already using the fence_lock and reservation, I just fixed up the few places that didn't hold a reservation while waiting. Converting the few places that didn't ended up being trivial, so I thought I'd submit it.
Actually the only *valid* reason for holding a reservation when waiting for idle is
- You want to block further command submission on the buffer.
- You want to switch GPU engine and don't have access to gpu
semaphores / barriers.
Reservation has the nasty side effect that it blocks command submission and pins the buffer (in addition now makes the evict list traversals skip the buffer) which in general is *not* necessary for most wait cases, so we should instead actually convert the wait cases that don't fulfill 1) and 2) above in the other direction if we have performance and latency-reduction in mind. I can't see how a spinlock protecting a fence pointer or fence list is stopping you from using RW fences as long as the spinlock is held while manipulating the fence list?
You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the patchset, though) and enumerate if they can be changed to work without reservation or not.
ttm/ttm_bo.c ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to finish for the direct destroy fastpath, if either fails it needs to be queued. Cannot work without reservation.
Doesn't block and no significant reservation contention expected.
ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, doesn't need to re-acquire. Simply reordering ttm_bo_wait until after re-reserve is enough.
Currently follows the above rules.
ttm_bo_evict: already has the reservation, cannot be dropped since only trylock is allowed. Dropping reservation would cause badness, cannot be converted.
Follows rule 2 above. We're about to move the buffer and if that can't be pipelined using the GPU (which TTM currently doesn't allow), we need to wait. Although eviction should be low priority compared to new command submission, so I can't really see why we couldn't wait before trying to reserve here?
ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop reservation for same reason as ttm_bo_evict. It might be part of a ticketed reservation so really don't drop lock here.
Part of command submission and as such follows rule 2 above. If we can pipeline the move with the GPU, no need to wait (but needs to be implemented, of course).
ttm_bo_synccpu_write_grab: the wait could be converted to be done afterwards, without fence_lock. But in this case reservation could take the role of fence_lock too,
so no separate fence_lock would be needed.
With the exception that reservation is more likely to be contended.
True but rule 1.
ttm_bo_swapout: see ttm_bo_evict.
ttm/ttm_bo_util.c: ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see ttm_bo_move_buffer, can be called from that function.
Rule 2.
ttm/ttm_bo_vm.c ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, but you already had the reservation, so a similar optimization to ttm_bo_synccpu_write_grab could be done without requiring fence_lock. If you would write it like that, you would end up with a patch similar to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I think we should do this, an
Ok, so the core does NOT need fence_lock because we can never drop reservations except in synccpu_write_grab and maybe ttm_bo_vm_fault_idle, but even in those cases reservation is done. So that could be used instead of fence_lock.
nouveau_gem_ioctl_cpu_prep: Either block on a global spinlock or a local reservation lock. Doesn't matter much which, I don't need the need to keep a global lock for this function... 2 cases can happen in the trylock reservation failure case: buffer is not reserved, so it's not in the process of being evicted. buffer is reserved, which means it's being used in command submission right now, or in one of the functions described above (eg not idle).
nouveau_gem_pushbuf_reloc_apply: has to call ttm_bo_wait with reservation, cannot be dropped.
So for core ttm and nouveau the fence_lock is never needed, radeon has only 1 function that calls ttm_bo_wait which uses a reservation too. It doesn't need the fence_lock either.
And vmwgfx now also has a syccpu IOCTL (see drm-next).
So assuming that we converted the functions that can be converted to wait outside of reservation, the same way you have done with Nouveau, leaving the ones that fall under 1) and 2) above, I would still argue that a spinlock should be used because taking a reservation may implicitly mean wait for gpu, and could give bad performance- and latency charateristics. You shouldn't need to wait for gpu to check for buffer idle.
Except that without reservation you can't tell if the buffer is really idle, or is currently being used as part of some command submission/eviction before the fence pointer is set.
Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
/Thomas
~Maarten
op 22-01-14 11:27, Thomas Hellstrom schreef:
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote:
op 22-01-14 10:40, Thomas Hellstrom schreef:
On 01/22/2014 09:19 AM, Maarten Lankhorst wrote:
op 21-01-14 18:44, Thomas Hellstrom schreef:
On 01/21/2014 04:29 PM, Maarten Lankhorst wrote:
Hey,
op 21-01-14 16:17, Thomas Hellstrom schreef: > Maarten, for this and the other patches in this series, > > I seem to recall we have this discussion before? > IIRC I stated that reservation was a too heavy-weight lock to > hold to > determine whether a buffer was idle? It's a pretty nasty thing to > build in. > I've sent this patch after determining that this already didn't end up being heavyweight. Most places were already using the fence_lock and reservation, I just fixed up the few places that didn't hold a reservation while waiting. Converting the few places that didn't ended up being trivial, so I thought I'd submit it.
Actually the only *valid* reason for holding a reservation when waiting for idle is
- You want to block further command submission on the buffer.
- You want to switch GPU engine and don't have access to gpu
semaphores / barriers.
Reservation has the nasty side effect that it blocks command submission and pins the buffer (in addition now makes the evict list traversals skip the buffer) which in general is *not* necessary for most wait cases, so we should instead actually convert the wait cases that don't fulfill 1) and 2) above in the other direction if we have performance and latency-reduction in mind. I can't see how a spinlock protecting a fence pointer or fence list is stopping you from using RW fences as long as the spinlock is held while manipulating the fence list?
You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the patchset, though) and enumerate if they can be changed to work without reservation or not.
ttm/ttm_bo.c ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to finish for the direct destroy fastpath, if either fails it needs to be queued. Cannot work without reservation.
Doesn't block and no significant reservation contention expected.
ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, doesn't need to re-acquire. Simply reordering ttm_bo_wait until after re-reserve is enough.
Currently follows the above rules.
ttm_bo_evict: already has the reservation, cannot be dropped since only trylock is allowed. Dropping reservation would cause badness, cannot be converted.
Follows rule 2 above. We're about to move the buffer and if that can't be pipelined using the GPU (which TTM currently doesn't allow), we need to wait. Although eviction should be low priority compared to new command submission, so I can't really see why we couldn't wait before trying to reserve here?
ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop reservation for same reason as ttm_bo_evict. It might be part of a ticketed reservation so really don't drop lock here.
Part of command submission and as such follows rule 2 above. If we can pipeline the move with the GPU, no need to wait (but needs to be implemented, of course).
ttm_bo_synccpu_write_grab: the wait could be converted to be done afterwards, without fence_lock. But in this case reservation could take the role of fence_lock too,
so no separate fence_lock would be needed.
With the exception that reservation is more likely to be contended.
True but rule 1.
ttm_bo_swapout: see ttm_bo_evict.
ttm/ttm_bo_util.c: ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see ttm_bo_move_buffer, can be called from that function.
Rule 2.
ttm/ttm_bo_vm.c ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, but you already had the reservation, so a similar optimization to ttm_bo_synccpu_write_grab could be done without requiring fence_lock. If you would write it like that, you would end up with a patch similar to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I think we should do this, an
Ok, so the core does NOT need fence_lock because we can never drop reservations except in synccpu_write_grab and maybe ttm_bo_vm_fault_idle, but even in those cases reservation is done. So that could be used instead of fence_lock.
nouveau_gem_ioctl_cpu_prep: Either block on a global spinlock or a local reservation lock. Doesn't matter much which, I don't need the need to keep a global lock for this function... 2 cases can happen in the trylock reservation failure case: buffer is not reserved, so it's not in the process of being evicted. buffer is reserved, which means it's being used in command submission right now, or in one of the functions described above (eg not idle).
nouveau_gem_pushbuf_reloc_apply: has to call ttm_bo_wait with reservation, cannot be dropped.
So for core ttm and nouveau the fence_lock is never needed, radeon has only 1 function that calls ttm_bo_wait which uses a reservation too. It doesn't need the fence_lock either.
And vmwgfx now also has a syccpu IOCTL (see drm-next).
So assuming that we converted the functions that can be converted to wait outside of reservation, the same way you have done with Nouveau, leaving the ones that fall under 1) and 2) above, I would still argue that a spinlock should be used because taking a reservation may implicitly mean wait for gpu, and could give bad performance- and latency charateristics. You shouldn't need to wait for gpu to check for buffer idle.
Except that without reservation you can't tell if the buffer is really idle, or is currently being used as part of some command submission/eviction before the fence pointer is set.
Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
Yeah you could, but what exactly are you waiting on then? If it's some specific existing rendering, I would argue that you should create an android userspace fence during command submission, or provide your own api to block on a specfic fence in userspace.
If you don't then I think taking a reservation is not unreasonable. In the most common case the buffer is idle and not reserved, so it isn't contested. The actual waiting itself can be done without reservation held, by taking a reference on the fence.
~Maarten
On 01/22/2014 11:58 AM, Maarten Lankhorst wrote:
op 22-01-14 11:27, Thomas Hellstrom schreef:
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote:
op 22-01-14 10:40, Thomas Hellstrom schreef:
On 01/22/2014 09:19 AM, Maarten Lankhorst wrote:
op 21-01-14 18:44, Thomas Hellstrom schreef:
On 01/21/2014 04:29 PM, Maarten Lankhorst wrote: > Hey, > > op 21-01-14 16:17, Thomas Hellstrom schreef: >> Maarten, for this and the other patches in this series, >> >> I seem to recall we have this discussion before? >> IIRC I stated that reservation was a too heavy-weight lock to >> hold to >> determine whether a buffer was idle? It's a pretty nasty thing to >> build in. >> > I've sent this patch after determining that this already didn't > end up > being heavyweight. > Most places were already using the fence_lock and reservation, I > just > fixed up the few > places that didn't hold a reservation while waiting. Converting the > few places that didn't > ended up being trivial, so I thought I'd submit it. Actually the only *valid* reason for holding a reservation when waiting for idle is
- You want to block further command submission on the buffer.
- You want to switch GPU engine and don't have access to gpu
semaphores / barriers.
Reservation has the nasty side effect that it blocks command submission and pins the buffer (in addition now makes the evict list traversals skip the buffer) which in general is *not* necessary for most wait cases, so we should instead actually convert the wait cases that don't fulfill 1) and 2) above in the other direction if we have performance and latency-reduction in mind. I can't see how a spinlock protecting a fence pointer or fence list is stopping you from using RW fences as long as the spinlock is held while manipulating the fence list?
You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the patchset, though) and enumerate if they can be changed to work without reservation or not.
ttm/ttm_bo.c ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to finish for the direct destroy fastpath, if either fails it needs to be queued. Cannot work without reservation.
Doesn't block and no significant reservation contention expected.
ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, doesn't need to re-acquire. Simply reordering ttm_bo_wait until after re-reserve is enough.
Currently follows the above rules.
ttm_bo_evict: already has the reservation, cannot be dropped since only trylock is allowed. Dropping reservation would cause badness, cannot be converted.
Follows rule 2 above. We're about to move the buffer and if that can't be pipelined using the GPU (which TTM currently doesn't allow), we need to wait. Although eviction should be low priority compared to new command submission, so I can't really see why we couldn't wait before trying to reserve here?
ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop reservation for same reason as ttm_bo_evict. It might be part of a ticketed reservation so really don't drop lock here.
Part of command submission and as such follows rule 2 above. If we can pipeline the move with the GPU, no need to wait (but needs to be implemented, of course).
ttm_bo_synccpu_write_grab: the wait could be converted to be done afterwards, without fence_lock. But in this case reservation could take the role of fence_lock too,
so no separate fence_lock would be needed.
With the exception that reservation is more likely to be contended.
True but rule 1.
ttm_bo_swapout: see ttm_bo_evict.
ttm/ttm_bo_util.c: ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see ttm_bo_move_buffer, can be called from that function.
Rule 2.
ttm/ttm_bo_vm.c ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, but you already had the reservation, so a similar optimization to ttm_bo_synccpu_write_grab could be done without requiring fence_lock. If you would write it like that, you would end up with a patch similar to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I think we should do this, an
Ok, so the core does NOT need fence_lock because we can never drop reservations except in synccpu_write_grab and maybe ttm_bo_vm_fault_idle, but even in those cases reservation is done. So that could be used instead of fence_lock.
nouveau_gem_ioctl_cpu_prep: Either block on a global spinlock or a local reservation lock. Doesn't matter much which, I don't need the need to keep a global lock for this function... 2 cases can happen in the trylock reservation failure case: buffer is not reserved, so it's not in the process of being evicted. buffer is reserved, which means it's being used in command submission right now, or in one of the functions described above (eg not idle).
nouveau_gem_pushbuf_reloc_apply: has to call ttm_bo_wait with reservation, cannot be dropped.
So for core ttm and nouveau the fence_lock is never needed, radeon has only 1 function that calls ttm_bo_wait which uses a reservation too. It doesn't need the fence_lock either.
And vmwgfx now also has a syccpu IOCTL (see drm-next).
So assuming that we converted the functions that can be converted to wait outside of reservation, the same way you have done with Nouveau, leaving the ones that fall under 1) and 2) above, I would still argue that a spinlock should be used because taking a reservation may implicitly mean wait for gpu, and could give bad performance- and latency charateristics. You shouldn't need to wait for gpu to check for buffer idle.
Except that without reservation you can't tell if the buffer is really idle, or is currently being used as part of some command submission/eviction before the fence pointer is set.
Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
Yeah you could, but what exactly are you waiting on then? If it's some specific existing rendering, I would argue that you should create an android userspace fence during command submission, or provide your own api to block on a specfic fence in userspace.
If you don't then I think taking a reservation is not unreasonable. In the most common case the buffer is idle and not reserved, so it isn't contested. The actual waiting itself can be done without reservation held, by taking a reference on the fence.
Yeah, here is where we disagree. I'm afraid people will start getting sloppy with reservations and use them to protect more stuff, and after a while they start wondering why the GPU command queue drains...
Perhaps we could agree on a solution (building on one of your original ideas) where we require reservation to modify the fence pointers, and the buffer object moving flag, but the structure holding the fence pointer(s) is RCU safe, so that the pointers can be safely read under an rcu lock.
People who don't care about potential reservation contention could just use the reservation lock, and clear pointers of fences that have signaled. People who do care could do a reservation trylock, and if it fails read the fence pointers under RCU. This of course means those fence objects need to be freed after an RCU grace period.
/Thomas
~Maarten
op 22-01-14 13:11, Thomas Hellstrom schreef:
On 01/22/2014 11:58 AM, Maarten Lankhorst wrote:
op 22-01-14 11:27, Thomas Hellstrom schreef:
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote:
op 22-01-14 10:40, Thomas Hellstrom schreef:
On 01/22/2014 09:19 AM, Maarten Lankhorst wrote:
op 21-01-14 18:44, Thomas Hellstrom schreef: > On 01/21/2014 04:29 PM, Maarten Lankhorst wrote: >> Hey, >> >> op 21-01-14 16:17, Thomas Hellstrom schreef: >>> Maarten, for this and the other patches in this series, >>> >>> I seem to recall we have this discussion before? >>> IIRC I stated that reservation was a too heavy-weight lock to >>> hold to >>> determine whether a buffer was idle? It's a pretty nasty thing to >>> build in. >>> >> I've sent this patch after determining that this already didn't >> end up >> being heavyweight. >> Most places were already using the fence_lock and reservation, I >> just >> fixed up the few >> places that didn't hold a reservation while waiting. Converting the >> few places that didn't >> ended up being trivial, so I thought I'd submit it. > Actually the only *valid* reason for holding a reservation when > waiting > for idle is > 1) You want to block further command submission on the buffer. > 2) You want to switch GPU engine and don't have access to gpu > semaphores > / barriers. > > Reservation has the nasty side effect that it blocks command > submission > and pins the buffer (in addition now makes the evict list traversals > skip the buffer) which in general is *not* necessary for most wait > cases, so we should instead actually convert the wait cases that > don't > fulfill 1) and 2) above in the other direction if we have > performance > and latency-reduction in mind. I can't see how a spinlock > protecting a > fence pointer or fence list is stopping you from using RW fences as > long > as the spinlock is held while manipulating the fence list? > You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the patchset, though) and enumerate if they can be changed to work without reservation or not.
ttm/ttm_bo.c ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to finish for the direct destroy fastpath, if either fails it needs to be queued. Cannot work without reservation.
Doesn't block and no significant reservation contention expected.
ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, doesn't need to re-acquire. Simply reordering ttm_bo_wait until after re-reserve is enough.
Currently follows the above rules.
ttm_bo_evict: already has the reservation, cannot be dropped since only trylock is allowed. Dropping reservation would cause badness, cannot be converted.
Follows rule 2 above. We're about to move the buffer and if that can't be pipelined using the GPU (which TTM currently doesn't allow), we need to wait. Although eviction should be low priority compared to new command submission, so I can't really see why we couldn't wait before trying to reserve here?
ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop reservation for same reason as ttm_bo_evict. It might be part of a ticketed reservation so really don't drop lock here.
Part of command submission and as such follows rule 2 above. If we can pipeline the move with the GPU, no need to wait (but needs to be implemented, of course).
ttm_bo_synccpu_write_grab: the wait could be converted to be done afterwards, without fence_lock. But in this case reservation could take the role of fence_lock too,
so no separate fence_lock would be needed.
With the exception that reservation is more likely to be contended.
True but rule 1.
ttm_bo_swapout: see ttm_bo_evict.
ttm/ttm_bo_util.c: ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see ttm_bo_move_buffer, can be called from that function.
Rule 2.
ttm/ttm_bo_vm.c ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, but you already had the reservation, so a similar optimization to ttm_bo_synccpu_write_grab could be done without requiring fence_lock. If you would write it like that, you would end up with a patch similar to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I think we should do this, an
Ok, so the core does NOT need fence_lock because we can never drop reservations except in synccpu_write_grab and maybe ttm_bo_vm_fault_idle, but even in those cases reservation is done. So that could be used instead of fence_lock.
nouveau_gem_ioctl_cpu_prep: Either block on a global spinlock or a local reservation lock. Doesn't matter much which, I don't need the need to keep a global lock for this function... 2 cases can happen in the trylock reservation failure case: buffer is not reserved, so it's not in the process of being evicted. buffer is reserved, which means it's being used in command submission right now, or in one of the functions described above (eg not idle).
nouveau_gem_pushbuf_reloc_apply: has to call ttm_bo_wait with reservation, cannot be dropped.
So for core ttm and nouveau the fence_lock is never needed, radeon has only 1 function that calls ttm_bo_wait which uses a reservation too. It doesn't need the fence_lock either.
And vmwgfx now also has a syccpu IOCTL (see drm-next).
So assuming that we converted the functions that can be converted to wait outside of reservation, the same way you have done with Nouveau, leaving the ones that fall under 1) and 2) above, I would still argue that a spinlock should be used because taking a reservation may implicitly mean wait for gpu, and could give bad performance- and latency charateristics. You shouldn't need to wait for gpu to check for buffer idle.
Except that without reservation you can't tell if the buffer is really idle, or is currently being used as part of some command submission/eviction before the fence pointer is set.
Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
Yeah you could, but what exactly are you waiting on then? If it's some specific existing rendering, I would argue that you should create an android userspace fence during command submission, or provide your own api to block on a specfic fence in userspace.
If you don't then I think taking a reservation is not unreasonable. In the most common case the buffer is idle and not reserved, so it isn't contested. The actual waiting itself can be done without reservation held, by taking a reference on the fence.
Yeah, here is where we disagree. I'm afraid people will start getting sloppy with reservations and use them to protect more stuff, and after a while they start wondering why the GPU command queue drains...
Perhaps we could agree on a solution (building on one of your original ideas) where we require reservation to modify the fence pointers, and the buffer object moving flag, but the structure holding the fence pointer(s) is RCU safe, so that the pointers can be safely read under an rcu lock.
I think not modifying the fence pointer without reservation would be safest. I also don't think readers need the capability to clear sync_obj, this might simplify the implementation some.
But my preferred option is getting rid of sync_obj completely, and move to using reservation_object->fence_shared/exclusive, like the incomplete proof of concept conversion done in nouveau. But then I do need to grab the reservation lock to touch things, because fences may be set by the i915 driver I share the reservation_object with.
Alternatively could vmwgfx hold a spinlock when decrementing fence refcount instead? Then we wouldn't need this in the core, and vmwgfx could use:
spin_lock(&vmw_fence_lock); fence = ACCESS_ONCE(bo->sync_obj); if (fence && !kref_get_unless_zero(&fence->ref)) fence = NULL; spin_unlock(&vmw_fence_lock);
internally in that function, preserving old semantics but without unsetting sync_obj if no reservation is held. Full rcu might be slightly overkill.
~Maarten
On 01/22/2014 01:38 PM, Maarten Lankhorst wrote:
op 22-01-14 13:11, Thomas Hellstrom schreef:
On 01/22/2014 11:58 AM, Maarten Lankhorst wrote:
op 22-01-14 11:27, Thomas Hellstrom schreef:
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote:
op 22-01-14 10:40, Thomas Hellstrom schreef:
On 01/22/2014 09:19 AM, Maarten Lankhorst wrote: > op 21-01-14 18:44, Thomas Hellstrom schreef: >> On 01/21/2014 04:29 PM, Maarten Lankhorst wrote: >>> Hey, >>> >>> op 21-01-14 16:17, Thomas Hellstrom schreef: >>>> Maarten, for this and the other patches in this series, >>>> >>>> I seem to recall we have this discussion before? >>>> IIRC I stated that reservation was a too heavy-weight lock to >>>> hold to >>>> determine whether a buffer was idle? It's a pretty nasty >>>> thing to >>>> build in. >>>> >>> I've sent this patch after determining that this already didn't >>> end up >>> being heavyweight. >>> Most places were already using the fence_lock and reservation, I >>> just >>> fixed up the few >>> places that didn't hold a reservation while waiting. >>> Converting the >>> few places that didn't >>> ended up being trivial, so I thought I'd submit it. >> Actually the only *valid* reason for holding a reservation when >> waiting >> for idle is >> 1) You want to block further command submission on the buffer. >> 2) You want to switch GPU engine and don't have access to gpu >> semaphores >> / barriers. >> >> Reservation has the nasty side effect that it blocks command >> submission >> and pins the buffer (in addition now makes the evict list >> traversals >> skip the buffer) which in general is *not* necessary for most wait >> cases, so we should instead actually convert the wait cases that >> don't >> fulfill 1) and 2) above in the other direction if we have >> performance >> and latency-reduction in mind. I can't see how a spinlock >> protecting a >> fence pointer or fence list is stopping you from using RW >> fences as >> long >> as the spinlock is held while manipulating the fence list? >> > You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the > patchset, though) and enumerate if they can be changed to work > without > reservation or not. > > ttm/ttm_bo.c > ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to > finish for the direct destroy fastpath, if either fails it needs > to be > queued. Cannot work without reservation. Doesn't block and no significant reservation contention expected.
> ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, > doesn't need to re-acquire. Simply reordering ttm_bo_wait until > after > re-reserve is enough. Currently follows the above rules.
> ttm_bo_evict: already has the reservation, cannot be dropped since > only trylock is allowed. Dropping reservation would cause badness, > cannot be converted. Follows rule 2 above. We're about to move the buffer and if that can't be pipelined using the GPU (which TTM currently doesn't allow), we need to wait. Although eviction should be low priority compared to new command submission, so I can't really see why we couldn't wait before trying to reserve here?
> ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop > reservation for same reason as ttm_bo_evict. It might be part of a > ticketed reservation so really don't drop lock here. Part of command submission and as such follows rule 2 above. If we can pipeline the move with the GPU, no need to wait (but needs to be implemented, of course).
> ttm_bo_synccpu_write_grab: the wait could be converted to be done > afterwards, without fence_lock. But in this case reservation could > take the role of fence_lock too, > > so no separate fence_lock would be needed. With the exception that reservation is more likely to be contended.
True but rule 1.
> ttm_bo_swapout: see ttm_bo_evict. > > ttm/ttm_bo_util.c: > ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see > ttm_bo_move_buffer, can be called from that function. Rule 2.
> ttm/ttm_bo_vm.c > ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, > but > you already had the reservation, so a similar optimization to > ttm_bo_synccpu_write_grab could be done without requiring > fence_lock. > If you would write it like that, you would end up with a patch > similar > to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I > think > we should do this, an > > Ok, so the core does NOT need fence_lock because we can never drop > reservations except in synccpu_write_grab and maybe > ttm_bo_vm_fault_idle, but even in those cases reservation is > done. So > that could be used instead of fence_lock. > > nouveau_gem_ioctl_cpu_prep: > Either block on a global spinlock or a local reservation lock. > Doesn't > matter much which, I don't need the need to keep a global lock for > this function... > 2 cases can happen in the trylock reservation failure case: > buffer is > not reserved, so it's not in the process of being evicted. > buffer is > reserved, which means it's being used in command submission right > now, > or in one of the functions described above (eg not idle). > > nouveau_gem_pushbuf_reloc_apply: > has to call ttm_bo_wait with reservation, cannot be dropped. > > So for core ttm and nouveau the fence_lock is never needed, radeon > has > only 1 function that calls ttm_bo_wait which uses a reservation > too. > It doesn't need the fence_lock either. And vmwgfx now also has a syccpu IOCTL (see drm-next).
So assuming that we converted the functions that can be converted to wait outside of reservation, the same way you have done with Nouveau, leaving the ones that fall under 1) and 2) above, I would still argue that a spinlock should be used because taking a reservation may implicitly mean wait for gpu, and could give bad performance- and latency charateristics. You shouldn't need to wait for gpu to check for buffer idle.
Except that without reservation you can't tell if the buffer is really idle, or is currently being used as part of some command submission/eviction before the fence pointer is set.
Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
Yeah you could, but what exactly are you waiting on then? If it's some specific existing rendering, I would argue that you should create an android userspace fence during command submission, or provide your own api to block on a specfic fence in userspace.
If you don't then I think taking a reservation is not unreasonable. In the most common case the buffer is idle and not reserved, so it isn't contested. The actual waiting itself can be done without reservation held, by taking a reference on the fence.
Yeah, here is where we disagree. I'm afraid people will start getting sloppy with reservations and use them to protect more stuff, and after a while they start wondering why the GPU command queue drains...
Perhaps we could agree on a solution (building on one of your original ideas) where we require reservation to modify the fence pointers, and the buffer object moving flag, but the structure holding the fence pointer(s) is RCU safe, so that the pointers can be safely read under an rcu lock.
I think not modifying the fence pointer without reservation would be safest. I also don't think readers need the capability to clear sync_obj, this might simplify the implementation some.
But my preferred option is getting rid of sync_obj completely, and move to using reservation_object->fence_shared/exclusive, like the incomplete proof of concept conversion done in nouveau. But then I do need to grab the reservation lock to touch things, because fences may be set by the i915 driver I share the reservation_object with.
Alternatively could vmwgfx hold a spinlock when decrementing fence refcount instead? Then we wouldn't need this in the core, and vmwgfx could use:
Maarten, requiring reservation to access the fence pointers really turns my gut! Being able to read them under rcu is a remedy, but something I figure would be the default and recommended thing to do. Not a vmware exception. This is about as far as I'm prepared to go.
/Thomas
On Wed, Jan 22, 2014 at 01:52:51PM +0100, Thomas Hellstrom wrote:
On 01/22/2014 01:38 PM, Maarten Lankhorst wrote:
op 22-01-14 13:11, Thomas Hellstrom schreef:
On 01/22/2014 11:58 AM, Maarten Lankhorst wrote:
op 22-01-14 11:27, Thomas Hellstrom schreef:
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote:
op 22-01-14 10:40, Thomas Hellstrom schreef: > On 01/22/2014 09:19 AM, Maarten Lankhorst wrote: >> op 21-01-14 18:44, Thomas Hellstrom schreef: >>> On 01/21/2014 04:29 PM, Maarten Lankhorst wrote: >>>> Hey, >>>> >>>> op 21-01-14 16:17, Thomas Hellstrom schreef: >>>>> Maarten, for this and the other patches in this series, >>>>> >>>>> I seem to recall we have this discussion before? >>>>> IIRC I stated that reservation was a too heavy-weight lock to >>>>> hold to >>>>> determine whether a buffer was idle? It's a pretty nasty >>>>> thing to >>>>> build in. >>>>> >>>> I've sent this patch after determining that this already didn't >>>> end up >>>> being heavyweight. >>>> Most places were already using the fence_lock and reservation, I >>>> just >>>> fixed up the few >>>> places that didn't hold a reservation while waiting. >>>> Converting the >>>> few places that didn't >>>> ended up being trivial, so I thought I'd submit it. >>> Actually the only *valid* reason for holding a reservation when >>> waiting >>> for idle is >>> 1) You want to block further command submission on the buffer. >>> 2) You want to switch GPU engine and don't have access to gpu >>> semaphores >>> / barriers. >>> >>> Reservation has the nasty side effect that it blocks command >>> submission >>> and pins the buffer (in addition now makes the evict list >>> traversals >>> skip the buffer) which in general is *not* necessary for most wait >>> cases, so we should instead actually convert the wait cases that >>> don't >>> fulfill 1) and 2) above in the other direction if we have >>> performance >>> and latency-reduction in mind. I can't see how a spinlock >>> protecting a >>> fence pointer or fence list is stopping you from using RW >>> fences as >>> long >>> as the spinlock is held while manipulating the fence list? >>> >> You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the >> patchset, though) and enumerate if they can be changed to work >> without >> reservation or not. >> >> ttm/ttm_bo.c >> ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to >> finish for the direct destroy fastpath, if either fails it needs >> to be >> queued. Cannot work without reservation. > Doesn't block and no significant reservation contention expected. > >> ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, >> doesn't need to re-acquire. Simply reordering ttm_bo_wait until >> after >> re-reserve is enough. > Currently follows the above rules. > >> ttm_bo_evict: already has the reservation, cannot be dropped since >> only trylock is allowed. Dropping reservation would cause badness, >> cannot be converted. > Follows rule 2 above. We're about to move the buffer and if that > can't > be pipelined using the GPU (which TTM currently doesn't allow), we > need > to wait. Although eviction should be low priority compared to new > command submission, so I can't really see why we couldn't wait > before > trying to reserve here? > >> ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop >> reservation for same reason as ttm_bo_evict. It might be part of a >> ticketed reservation so really don't drop lock here. > Part of command submission and as such follows rule 2 above. If > we can > pipeline the move with the GPU, no need to wait (but needs to be > implemented, of course). > >> ttm_bo_synccpu_write_grab: the wait could be converted to be done >> afterwards, without fence_lock. But in this case reservation could >> take the role of fence_lock too, >> >> so no separate fence_lock would be needed. > With the exception that reservation is more likely to be contended. True but rule 1. >> ttm_bo_swapout: see ttm_bo_evict. >> >> ttm/ttm_bo_util.c: >> ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see >> ttm_bo_move_buffer, can be called from that function. > Rule 2. > >> ttm/ttm_bo_vm.c >> ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, >> but >> you already had the reservation, so a similar optimization to >> ttm_bo_synccpu_write_grab could be done without requiring >> fence_lock. >> If you would write it like that, you would end up with a patch >> similar >> to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I >> think >> we should do this, an >> >> Ok, so the core does NOT need fence_lock because we can never drop >> reservations except in synccpu_write_grab and maybe >> ttm_bo_vm_fault_idle, but even in those cases reservation is >> done. So >> that could be used instead of fence_lock. >> >> nouveau_gem_ioctl_cpu_prep: >> Either block on a global spinlock or a local reservation lock. >> Doesn't >> matter much which, I don't need the need to keep a global lock for >> this function... >> 2 cases can happen in the trylock reservation failure case: >> buffer is >> not reserved, so it's not in the process of being evicted. >> buffer is >> reserved, which means it's being used in command submission right >> now, >> or in one of the functions described above (eg not idle). >> >> nouveau_gem_pushbuf_reloc_apply: >> has to call ttm_bo_wait with reservation, cannot be dropped. >> >> So for core ttm and nouveau the fence_lock is never needed, radeon >> has >> only 1 function that calls ttm_bo_wait which uses a reservation >> too. >> It doesn't need the fence_lock either. > And vmwgfx now also has a syccpu IOCTL (see drm-next). > > So assuming that we converted the functions that can be converted to > wait outside of reservation, the same way you have done with > Nouveau, > leaving the ones that fall under 1) and 2) above, I would still > argue > that a spinlock should be used because taking a reservation may > implicitly mean wait for gpu, and could give bad performance- and > latency charateristics. You shouldn't need to wait for gpu to check > for > buffer idle. Except that without reservation you can't tell if the buffer is really idle, or is currently being used as part of some command submission/eviction before the fence pointer is set.
Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
Yeah you could, but what exactly are you waiting on then? If it's some specific existing rendering, I would argue that you should create an android userspace fence during command submission, or provide your own api to block on a specfic fence in userspace.
If you don't then I think taking a reservation is not unreasonable. In the most common case the buffer is idle and not reserved, so it isn't contested. The actual waiting itself can be done without reservation held, by taking a reference on the fence.
Yeah, here is where we disagree. I'm afraid people will start getting sloppy with reservations and use them to protect more stuff, and after a while they start wondering why the GPU command queue drains...
Perhaps we could agree on a solution (building on one of your original ideas) where we require reservation to modify the fence pointers, and the buffer object moving flag, but the structure holding the fence pointer(s) is RCU safe, so that the pointers can be safely read under an rcu lock.
I think not modifying the fence pointer without reservation would be safest. I also don't think readers need the capability to clear sync_obj, this might simplify the implementation some.
But my preferred option is getting rid of sync_obj completely, and move to using reservation_object->fence_shared/exclusive, like the incomplete proof of concept conversion done in nouveau. But then I do need to grab the reservation lock to touch things, because fences may be set by the i915 driver I share the reservation_object with.
Alternatively could vmwgfx hold a spinlock when decrementing fence refcount instead? Then we wouldn't need this in the core, and vmwgfx could use:
Maarten, requiring reservation to access the fence pointers really turns my gut! Being able to read them under rcu is a remedy, but something I figure would be the default and recommended thing to do. Not a vmware exception. This is about as far as I'm prepared to go.
Let me jump into your discussion and have a bit of fun too ;-)
More seriously I think we should take a step back and look at the larger picture: The overall aim is to allow cross-device shared dma-bufs to get fenced/reserved/whatever. Which means the per-device fence_lock ttm is currently using won't work any more. So we need to change things a bit.
I see a few solutions. Note that I haven't checked the implications for existing drivers (especially ttm) in detail, so please correct me when some of these ideas are horrible to implement:
- Make fence_lock a global thing instead of per-device. Probably not what we want given that dma-buf (and also all the ttm state) has more fine-grained locking.
- Remove the fence_lock and protect fences with the reservation lock the dma-buf already has. Has the appeal of being the simplest solution, at least if we exclude the One Lock to Rule Them all approach ;-)
- Add a new per-buffer spinlock just to protect the fences. Could end up being rather costly for the non-contended common case where we just want to push tons of buffers through execbuf ioctls.
- Allow fences attached to dma-bufs to be accessed read-only (to grab references of them and wait on them) using rcu as protection. I think we need some trickery with kref_get_unless_zero to make sure the rcu-delayed freeing of fences doesn't race in bad ways with lockless kref_gets. Another approach would be to rcu-delay all kref_puts, but I don't think we want that.
Personally I prefer the 2nd approach since it's the simplest, while not being un-scalable like the first. In my experience with the single lock in i915 where any contention and especially any waiting while holding locks is massively exagerated is that locking dropping games around the common gpu wait points are sufficient. Actually in almost all case the fence_lock wouldn't be sufficient for us since we need to check buffer placement, mmaps and similar things anyway.
Now I see that there's valid cases where we want the lowest possible overhead for waiting on or just checking for outstanding rendering. OpenCL with it's fancy/explicit synchronization model seems to be the prime example that usually pops up. For such uses I think it's better to just directly expose fences to userspace and completely eshew any indirection through buffer objects.
That leaves the issue of stalling unrelated processes when they try to look up fences when another process is heavily thrashing vram and so holds tons of reservations on non-shared objects and blocks waiting for the gpu to complete more batches. But from my cursory understanding non-UMA platforms currently (at least with the drivers we have and the memory management logic) drop off a rather steep cliff. So I fear that micro-optimizing this case is investing complexity into the wrong place. At least in i915 we've implemented quite a pile of tricks to smooth off the gtt thrashing cliff before even considering improving lock contention and lock holding times.
So overall I'm heavily favouring the simple approach of just reusing the reservation ww mutex to protec fence state, but I'm definitely not rejecting more complex approaches out of hand. I just think that we should have solid data to justify the complexity.
Finally if we can't reach an agreement here I guess we could duct-tape something together where ttm objects only used by a single driver are protected by the fence_lock and other, shared buffers are protected by the reservation. It won't be pretty, but the impact should be fairly contained. Especially since many paths in ttm currently grab both locks anyway, so wouldn't need any special handling.
I hope this helps to move the discussion forward.
Cheers, Daniel
On 01/22/2014 04:09 PM, Daniel Vetter wrote:
On Wed, Jan 22, 2014 at 01:52:51PM +0100, Thomas Hellstrom wrote:
On 01/22/2014 01:38 PM, Maarten Lankhorst wrote:
op 22-01-14 13:11, Thomas Hellstrom schreef:
On 01/22/2014 11:58 AM, Maarten Lankhorst wrote:
op 22-01-14 11:27, Thomas Hellstrom schreef:
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote: > op 22-01-14 10:40, Thomas Hellstrom schreef: >> On 01/22/2014 09:19 AM, Maarten Lankhorst wrote: >>> op 21-01-14 18:44, Thomas Hellstrom schreef: >>>> On 01/21/2014 04:29 PM, Maarten Lankhorst wrote: >>>>> Hey, >>>>> >>>>> op 21-01-14 16:17, Thomas Hellstrom schreef: >>>>>> Maarten, for this and the other patches in this series, >>>>>> >>>>>> I seem to recall we have this discussion before? >>>>>> IIRC I stated that reservation was a too heavy-weight lock to >>>>>> hold to >>>>>> determine whether a buffer was idle? It's a pretty nasty >>>>>> thing to >>>>>> build in. >>>>>> >>>>> I've sent this patch after determining that this already didn't >>>>> end up >>>>> being heavyweight. >>>>> Most places were already using the fence_lock and reservation, I >>>>> just >>>>> fixed up the few >>>>> places that didn't hold a reservation while waiting. >>>>> Converting the >>>>> few places that didn't >>>>> ended up being trivial, so I thought I'd submit it. >>>> Actually the only *valid* reason for holding a reservation when >>>> waiting >>>> for idle is >>>> 1) You want to block further command submission on the buffer. >>>> 2) You want to switch GPU engine and don't have access to gpu >>>> semaphores >>>> / barriers. >>>> >>>> Reservation has the nasty side effect that it blocks command >>>> submission >>>> and pins the buffer (in addition now makes the evict list >>>> traversals >>>> skip the buffer) which in general is *not* necessary for most wait >>>> cases, so we should instead actually convert the wait cases that >>>> don't >>>> fulfill 1) and 2) above in the other direction if we have >>>> performance >>>> and latency-reduction in mind. I can't see how a spinlock >>>> protecting a >>>> fence pointer or fence list is stopping you from using RW >>>> fences as >>>> long >>>> as the spinlock is held while manipulating the fence list? >>>> >>> You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the >>> patchset, though) and enumerate if they can be changed to work >>> without >>> reservation or not. >>> >>> ttm/ttm_bo.c >>> ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to >>> finish for the direct destroy fastpath, if either fails it needs >>> to be >>> queued. Cannot work without reservation. >> Doesn't block and no significant reservation contention expected. >> >>> ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, >>> doesn't need to re-acquire. Simply reordering ttm_bo_wait until >>> after >>> re-reserve is enough. >> Currently follows the above rules. >> >>> ttm_bo_evict: already has the reservation, cannot be dropped since >>> only trylock is allowed. Dropping reservation would cause badness, >>> cannot be converted. >> Follows rule 2 above. We're about to move the buffer and if that >> can't >> be pipelined using the GPU (which TTM currently doesn't allow), we >> need >> to wait. Although eviction should be low priority compared to new >> command submission, so I can't really see why we couldn't wait >> before >> trying to reserve here? >> >>> ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop >>> reservation for same reason as ttm_bo_evict. It might be part of a >>> ticketed reservation so really don't drop lock here. >> Part of command submission and as such follows rule 2 above. If >> we can >> pipeline the move with the GPU, no need to wait (but needs to be >> implemented, of course). >> >>> ttm_bo_synccpu_write_grab: the wait could be converted to be done >>> afterwards, without fence_lock. But in this case reservation could >>> take the role of fence_lock too, >>> >>> so no separate fence_lock would be needed. >> With the exception that reservation is more likely to be contended. > True but rule 1. >>> ttm_bo_swapout: see ttm_bo_evict. >>> >>> ttm/ttm_bo_util.c: >>> ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see >>> ttm_bo_move_buffer, can be called from that function. >> Rule 2. >> >>> ttm/ttm_bo_vm.c >>> ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, >>> but >>> you already had the reservation, so a similar optimization to >>> ttm_bo_synccpu_write_grab could be done without requiring >>> fence_lock. >>> If you would write it like that, you would end up with a patch >>> similar >>> to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I >>> think >>> we should do this, an >>> >>> Ok, so the core does NOT need fence_lock because we can never drop >>> reservations except in synccpu_write_grab and maybe >>> ttm_bo_vm_fault_idle, but even in those cases reservation is >>> done. So >>> that could be used instead of fence_lock. >>> >>> nouveau_gem_ioctl_cpu_prep: >>> Either block on a global spinlock or a local reservation lock. >>> Doesn't >>> matter much which, I don't need the need to keep a global lock for >>> this function... >>> 2 cases can happen in the trylock reservation failure case: >>> buffer is >>> not reserved, so it's not in the process of being evicted. >>> buffer is >>> reserved, which means it's being used in command submission right >>> now, >>> or in one of the functions described above (eg not idle). >>> >>> nouveau_gem_pushbuf_reloc_apply: >>> has to call ttm_bo_wait with reservation, cannot be dropped. >>> >>> So for core ttm and nouveau the fence_lock is never needed, radeon >>> has >>> only 1 function that calls ttm_bo_wait which uses a reservation >>> too. >>> It doesn't need the fence_lock either. >> And vmwgfx now also has a syccpu IOCTL (see drm-next). >> >> So assuming that we converted the functions that can be converted to >> wait outside of reservation, the same way you have done with >> Nouveau, >> leaving the ones that fall under 1) and 2) above, I would still >> argue >> that a spinlock should be used because taking a reservation may >> implicitly mean wait for gpu, and could give bad performance- and >> latency charateristics. You shouldn't need to wait for gpu to check >> for >> buffer idle. > Except that without reservation you can't tell if the buffer is > really > idle, or is currently being > used as part of some command submission/eviction before the fence > pointer is set. > Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
Yeah you could, but what exactly are you waiting on then? If it's some specific existing rendering, I would argue that you should create an android userspace fence during command submission, or provide your own api to block on a specfic fence in userspace.
If you don't then I think taking a reservation is not unreasonable. In the most common case the buffer is idle and not reserved, so it isn't contested. The actual waiting itself can be done without reservation held, by taking a reference on the fence.
Yeah, here is where we disagree. I'm afraid people will start getting sloppy with reservations and use them to protect more stuff, and after a while they start wondering why the GPU command queue drains...
Perhaps we could agree on a solution (building on one of your original ideas) where we require reservation to modify the fence pointers, and the buffer object moving flag, but the structure holding the fence pointer(s) is RCU safe, so that the pointers can be safely read under an rcu lock.
I think not modifying the fence pointer without reservation would be safest. I also don't think readers need the capability to clear sync_obj, this might simplify the implementation some.
But my preferred option is getting rid of sync_obj completely, and move to using reservation_object->fence_shared/exclusive, like the incomplete proof of concept conversion done in nouveau. But then I do need to grab the reservation lock to touch things, because fences may be set by the i915 driver I share the reservation_object with.
Alternatively could vmwgfx hold a spinlock when decrementing fence refcount instead? Then we wouldn't need this in the core, and vmwgfx could use:
Maarten, requiring reservation to access the fence pointers really turns my gut! Being able to read them under rcu is a remedy, but something I figure would be the default and recommended thing to do. Not a vmware exception. This is about as far as I'm prepared to go.
Let me jump into your discussion and have a bit of fun too ;-)
More seriously I think we should take a step back and look at the larger picture: The overall aim is to allow cross-device shared dma-bufs to get fenced/reserved/whatever. Which means the per-device fence_lock ttm is currently using won't work any more. So we need to change things a bit.
I see a few solutions. Note that I haven't checked the implications for existing drivers (especially ttm) in detail, so please correct me when some of these ideas are horrible to implement:
Make fence_lock a global thing instead of per-device. Probably not what we want given that dma-buf (and also all the ttm state) has more fine-grained locking.
Remove the fence_lock and protect fences with the reservation lock the dma-buf already has. Has the appeal of being the simplest solution, at least if we exclude the One Lock to Rule Them all approach ;-)
Add a new per-buffer spinlock just to protect the fences. Could end up being rather costly for the non-contended common case where we just want to push tons of buffers through execbuf ioctls.
Allow fences attached to dma-bufs to be accessed read-only (to grab references of them and wait on them) using rcu as protection. I think we need some trickery with kref_get_unless_zero to make sure the rcu-delayed freeing of fences doesn't race in bad ways with lockless kref_gets. Another approach would be to rcu-delay all kref_puts, but I don't think we want that.
Personally I prefer the 2nd approach since it's the simplest, while not being un-scalable like the first. In my experience with the single lock in i915 where any contention and especially any waiting while holding locks is massively exagerated is that locking dropping games around the common gpu wait points are sufficient. Actually in almost all case the fence_lock wouldn't be sufficient for us since we need to check buffer placement, mmaps and similar things anyway.
Now I see that there's valid cases where we want the lowest possible overhead for waiting on or just checking for outstanding rendering. OpenCL with it's fancy/explicit synchronization model seems to be the prime example that usually pops up. For such uses I think it's better to just directly expose fences to userspace and completely eshew any indirection through buffer objects.
That leaves the issue of stalling unrelated processes when they try to look up fences when another process is heavily thrashing vram and so holds tons of reservations on non-shared objects and blocks waiting for the gpu to complete more batches. But from my cursory understanding non-UMA platforms currently (at least with the drivers we have and the memory management logic) drop off a rather steep cliff. So I fear that micro-optimizing this case is investing complexity into the wrong place. At least in i915 we've implemented quite a pile of tricks to smooth off the gtt thrashing cliff before even considering improving lock contention and lock holding times.
So overall I'm heavily favouring the simple approach of just reusing the reservation ww mutex to protec fence state, but I'm definitely not rejecting more complex approaches out of hand. I just think that we should have solid data to justify the complexity.
Finally if we can't reach an agreement here I guess we could duct-tape something together where ttm objects only used by a single driver are protected by the fence_lock and other, shared buffers are protected by the reservation. It won't be pretty, but the impact should be fairly contained. Especially since many paths in ttm currently grab both locks anyway, so wouldn't need any special handling.
I hope this helps to move the discussion forward.
First, I think in a situation like this with radically different opinions, one needs to be prepared to compromise to move things forward. And not just on the TTM side.
And I don't think making the dma-buf fence pointer structure rcu-safe is a big step that is in any way complex. If the sync_object- or fence ops don't support get_unless_zero(), we simply don't use the RCU path. Fence object exporters that do care implement those.
/Thomas
Cheers, Daniel
On 01/22/2014 04:09 PM, Daniel Vetter wrote:
On Wed, Jan 22, 2014 at 01:52:51PM +0100, Thomas Hellstrom wrote:
On 01/22/2014 01:38 PM, Maarten Lankhorst wrote:
op 22-01-14 13:11, Thomas Hellstrom schreef:
On 01/22/2014 11:58 AM, Maarten Lankhorst wrote:
op 22-01-14 11:27, Thomas Hellstrom schreef:
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote: > op 22-01-14 10:40, Thomas Hellstrom schreef: >> On 01/22/2014 09:19 AM, Maarten Lankhorst wrote: >>> op 21-01-14 18:44, Thomas Hellstrom schreef: >>>> On 01/21/2014 04:29 PM, Maarten Lankhorst wrote: >>>>> Hey, >>>>> >>>>> op 21-01-14 16:17, Thomas Hellstrom schreef: >>>>>> Maarten, for this and the other patches in this series, >>>>>> >>>>>> I seem to recall we have this discussion before? >>>>>> IIRC I stated that reservation was a too heavy-weight lock to >>>>>> hold to >>>>>> determine whether a buffer was idle? It's a pretty nasty >>>>>> thing to >>>>>> build in. >>>>>> >>>>> I've sent this patch after determining that this already didn't >>>>> end up >>>>> being heavyweight. >>>>> Most places were already using the fence_lock and reservation, I >>>>> just >>>>> fixed up the few >>>>> places that didn't hold a reservation while waiting. >>>>> Converting the >>>>> few places that didn't >>>>> ended up being trivial, so I thought I'd submit it. >>>> Actually the only *valid* reason for holding a reservation when >>>> waiting >>>> for idle is >>>> 1) You want to block further command submission on the buffer. >>>> 2) You want to switch GPU engine and don't have access to gpu >>>> semaphores >>>> / barriers. >>>> >>>> Reservation has the nasty side effect that it blocks command >>>> submission >>>> and pins the buffer (in addition now makes the evict list >>>> traversals >>>> skip the buffer) which in general is *not* necessary for most wait >>>> cases, so we should instead actually convert the wait cases that >>>> don't >>>> fulfill 1) and 2) above in the other direction if we have >>>> performance >>>> and latency-reduction in mind. I can't see how a spinlock >>>> protecting a >>>> fence pointer or fence list is stopping you from using RW >>>> fences as >>>> long >>>> as the spinlock is held while manipulating the fence list? >>>> >>> You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the >>> patchset, though) and enumerate if they can be changed to work >>> without >>> reservation or not. >>> >>> ttm/ttm_bo.c >>> ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to >>> finish for the direct destroy fastpath, if either fails it needs >>> to be >>> queued. Cannot work without reservation. >> Doesn't block and no significant reservation contention expected. >> >>> ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, >>> doesn't need to re-acquire. Simply reordering ttm_bo_wait until >>> after >>> re-reserve is enough. >> Currently follows the above rules. >> >>> ttm_bo_evict: already has the reservation, cannot be dropped since >>> only trylock is allowed. Dropping reservation would cause badness, >>> cannot be converted. >> Follows rule 2 above. We're about to move the buffer and if that >> can't >> be pipelined using the GPU (which TTM currently doesn't allow), we >> need >> to wait. Although eviction should be low priority compared to new >> command submission, so I can't really see why we couldn't wait >> before >> trying to reserve here? >> >>> ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop >>> reservation for same reason as ttm_bo_evict. It might be part of a >>> ticketed reservation so really don't drop lock here. >> Part of command submission and as such follows rule 2 above. If >> we can >> pipeline the move with the GPU, no need to wait (but needs to be >> implemented, of course). >> >>> ttm_bo_synccpu_write_grab: the wait could be converted to be done >>> afterwards, without fence_lock. But in this case reservation could >>> take the role of fence_lock too, >>> >>> so no separate fence_lock would be needed. >> With the exception that reservation is more likely to be contended. > True but rule 1. >>> ttm_bo_swapout: see ttm_bo_evict. >>> >>> ttm/ttm_bo_util.c: >>> ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see >>> ttm_bo_move_buffer, can be called from that function. >> Rule 2. >> >>> ttm/ttm_bo_vm.c >>> ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, >>> but >>> you already had the reservation, so a similar optimization to >>> ttm_bo_synccpu_write_grab could be done without requiring >>> fence_lock. >>> If you would write it like that, you would end up with a patch >>> similar >>> to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I >>> think >>> we should do this, an >>> >>> Ok, so the core does NOT need fence_lock because we can never drop >>> reservations except in synccpu_write_grab and maybe >>> ttm_bo_vm_fault_idle, but even in those cases reservation is >>> done. So >>> that could be used instead of fence_lock. >>> >>> nouveau_gem_ioctl_cpu_prep: >>> Either block on a global spinlock or a local reservation lock. >>> Doesn't >>> matter much which, I don't need the need to keep a global lock for >>> this function... >>> 2 cases can happen in the trylock reservation failure case: >>> buffer is >>> not reserved, so it's not in the process of being evicted. >>> buffer is >>> reserved, which means it's being used in command submission right >>> now, >>> or in one of the functions described above (eg not idle). >>> >>> nouveau_gem_pushbuf_reloc_apply: >>> has to call ttm_bo_wait with reservation, cannot be dropped. >>> >>> So for core ttm and nouveau the fence_lock is never needed, radeon >>> has >>> only 1 function that calls ttm_bo_wait which uses a reservation >>> too. >>> It doesn't need the fence_lock either. >> And vmwgfx now also has a syccpu IOCTL (see drm-next). >> >> So assuming that we converted the functions that can be converted to >> wait outside of reservation, the same way you have done with >> Nouveau, >> leaving the ones that fall under 1) and 2) above, I would still >> argue >> that a spinlock should be used because taking a reservation may >> implicitly mean wait for gpu, and could give bad performance- and >> latency charateristics. You shouldn't need to wait for gpu to check >> for >> buffer idle. > Except that without reservation you can't tell if the buffer is > really > idle, or is currently being > used as part of some command submission/eviction before the fence > pointer is set. > Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
Yeah you could, but what exactly are you waiting on then? If it's some specific existing rendering, I would argue that you should create an android userspace fence during command submission, or provide your own api to block on a specfic fence in userspace.
If you don't then I think taking a reservation is not unreasonable. In the most common case the buffer is idle and not reserved, so it isn't contested. The actual waiting itself can be done without reservation held, by taking a reference on the fence.
Yeah, here is where we disagree. I'm afraid people will start getting sloppy with reservations and use them to protect more stuff, and after a while they start wondering why the GPU command queue drains...
Perhaps we could agree on a solution (building on one of your original ideas) where we require reservation to modify the fence pointers, and the buffer object moving flag, but the structure holding the fence pointer(s) is RCU safe, so that the pointers can be safely read under an rcu lock.
I think not modifying the fence pointer without reservation would be safest. I also don't think readers need the capability to clear sync_obj, this might simplify the implementation some.
But my preferred option is getting rid of sync_obj completely, and move to using reservation_object->fence_shared/exclusive, like the incomplete proof of concept conversion done in nouveau. But then I do need to grab the reservation lock to touch things, because fences may be set by the i915 driver I share the reservation_object with.
Alternatively could vmwgfx hold a spinlock when decrementing fence refcount instead? Then we wouldn't need this in the core, and vmwgfx could use:
Maarten, requiring reservation to access the fence pointers really turns my gut! Being able to read them under rcu is a remedy, but something I figure would be the default and recommended thing to do. Not a vmware exception. This is about as far as I'm prepared to go.
Let me jump into your discussion and have a bit of fun too ;-)
More seriously I think we should take a step back and look at the larger picture: The overall aim is to allow cross-device shared dma-bufs to get fenced/reserved/whatever. Which means the per-device fence_lock ttm is currently using won't work any more. So we need to change things a bit.
I see a few solutions. Note that I haven't checked the implications for existing drivers (especially ttm) in detail, so please correct me when some of these ideas are horrible to implement:
- Make fence_lock a global thing instead of per-device. Probably not what we want given that dma-buf (and also all the ttm state) has more fine-grained locking.
And a short comment about this, as well. It's not necessarily so that a lock that protects a single structure with members that are used in unrelated situations will see less contention than a lock that protects single members in a huge number of structures that are used in related situations.
In particular, I think (but guessing :) ) that a global spinlock protecting just the fence state of all objects will (at least initially) be the simplest solution and the solution that sees less lock contention.
/Thomas
op 22-01-14 13:11, Thomas Hellstrom schreef:
On 01/22/2014 11:58 AM, Maarten Lankhorst wrote:
op 22-01-14 11:27, Thomas Hellstrom schreef:
On 01/22/2014 10:55 AM, Maarten Lankhorst wrote:
op 22-01-14 10:40, Thomas Hellstrom schreef:
On 01/22/2014 09:19 AM, Maarten Lankhorst wrote:
op 21-01-14 18:44, Thomas Hellstrom schreef: > On 01/21/2014 04:29 PM, Maarten Lankhorst wrote: >> Hey, >> >> op 21-01-14 16:17, Thomas Hellstrom schreef: >>> Maarten, for this and the other patches in this series, >>> >>> I seem to recall we have this discussion before? >>> IIRC I stated that reservation was a too heavy-weight lock to >>> hold to >>> determine whether a buffer was idle? It's a pretty nasty thing to >>> build in. >>> >> I've sent this patch after determining that this already didn't >> end up >> being heavyweight. >> Most places were already using the fence_lock and reservation, I >> just >> fixed up the few >> places that didn't hold a reservation while waiting. Converting the >> few places that didn't >> ended up being trivial, so I thought I'd submit it. > Actually the only *valid* reason for holding a reservation when > waiting > for idle is > 1) You want to block further command submission on the buffer. > 2) You want to switch GPU engine and don't have access to gpu > semaphores > / barriers. > > Reservation has the nasty side effect that it blocks command > submission > and pins the buffer (in addition now makes the evict list traversals > skip the buffer) which in general is *not* necessary for most wait > cases, so we should instead actually convert the wait cases that > don't > fulfill 1) and 2) above in the other direction if we have > performance > and latency-reduction in mind. I can't see how a spinlock > protecting a > fence pointer or fence list is stopping you from using RW fences as > long > as the spinlock is held while manipulating the fence list? > You wish. Fine I'll enumerate all cases of ttm_bo_wait (with the patchset, though) and enumerate if they can be changed to work without reservation or not.
ttm/ttm_bo.c ttm_bo_cleanup_refs_or_queue: needs reservation and ttm_bo_wait to finish for the direct destroy fastpath, if either fails it needs to be queued. Cannot work without reservation.
Doesn't block and no significant reservation contention expected.
ttm_bo_cleanup_refs_and_unlock: already drops reservation to wait, doesn't need to re-acquire. Simply reordering ttm_bo_wait until after re-reserve is enough.
Currently follows the above rules.
ttm_bo_evict: already has the reservation, cannot be dropped since only trylock is allowed. Dropping reservation would cause badness, cannot be converted.
Follows rule 2 above. We're about to move the buffer and if that can't be pipelined using the GPU (which TTM currently doesn't allow), we need to wait. Although eviction should be low priority compared to new command submission, so I can't really see why we couldn't wait before trying to reserve here?
ttm_bo_move_buffer: called from ttm_bo_validate, cannot drop reservation for same reason as ttm_bo_evict. It might be part of a ticketed reservation so really don't drop lock here.
Part of command submission and as such follows rule 2 above. If we can pipeline the move with the GPU, no need to wait (but needs to be implemented, of course).
ttm_bo_synccpu_write_grab: the wait could be converted to be done afterwards, without fence_lock. But in this case reservation could take the role of fence_lock too,
so no separate fence_lock would be needed.
With the exception that reservation is more likely to be contended.
True but rule 1.
ttm_bo_swapout: see ttm_bo_evict.
ttm/ttm_bo_util.c: ttm_bo_move_accel_cleanup: calls ttm_bo_wait, cannot drop lock, see ttm_bo_move_buffer, can be called from that function.
Rule 2.
ttm/ttm_bo_vm.c ttm_bo_vm_fault_idle: I guess you COULD drop the reservation here, but you already had the reservation, so a similar optimization to ttm_bo_synccpu_write_grab could be done without requiring fence_lock. If you would write it like that, you would end up with a patch similar to drm/nouveau: add reservation to nouveau_gem_ioctl_cpu_prep. I think we should do this, an
Ok, so the core does NOT need fence_lock because we can never drop reservations except in synccpu_write_grab and maybe ttm_bo_vm_fault_idle, but even in those cases reservation is done. So that could be used instead of fence_lock.
nouveau_gem_ioctl_cpu_prep: Either block on a global spinlock or a local reservation lock. Doesn't matter much which, I don't need the need to keep a global lock for this function... 2 cases can happen in the trylock reservation failure case: buffer is not reserved, so it's not in the process of being evicted. buffer is reserved, which means it's being used in command submission right now, or in one of the functions described above (eg not idle).
nouveau_gem_pushbuf_reloc_apply: has to call ttm_bo_wait with reservation, cannot be dropped.
So for core ttm and nouveau the fence_lock is never needed, radeon has only 1 function that calls ttm_bo_wait which uses a reservation too. It doesn't need the fence_lock either.
And vmwgfx now also has a syccpu IOCTL (see drm-next).
So assuming that we converted the functions that can be converted to wait outside of reservation, the same way you have done with Nouveau, leaving the ones that fall under 1) and 2) above, I would still argue that a spinlock should be used because taking a reservation may implicitly mean wait for gpu, and could give bad performance- and latency charateristics. You shouldn't need to wait for gpu to check for buffer idle.
Except that without reservation you can't tell if the buffer is really idle, or is currently being used as part of some command submission/eviction before the fence pointer is set.
Yes, but when that matters, you're either in case 1 or case 2 again. Otherwise, when you release the reservation, you still don't know. A typical example of this is the vmwgfx synccpu ioctl, where you can either choose to block command submission (not used currently) or not (user-space inter-process synchronization). The former is a case 1 wait and holds reservation while waiting for idle and then ups cpu_writers. The latter waits without reservation for previously submitted rendering to finish.
Yeah you could, but what exactly are you waiting on then? If it's some specific existing rendering, I would argue that you should create an android userspace fence during command submission, or provide your own api to block on a specfic fence in userspace.
If you don't then I think taking a reservation is not unreasonable. In the most common case the buffer is idle and not reserved, so it isn't contested. The actual waiting itself can be done without reservation held, by taking a reference on the fence.
Yeah, here is where we disagree. I'm afraid people will start getting sloppy with reservations and use them to protect more stuff, and after a while they start wondering why the GPU command queue drains...
Also the reservation_object's lock is a normal lock, so you should be able to pull info about it when enabling CONFIG_LOCKSTAT.
~Maarten
This will ensure we always hold the required lock when calling those functions.
Signed-off-by: Maarten Lankhorst maarten.lankhorst@canonical.com --- drivers/gpu/drm/nouveau/nouveau_bo.c | 2 ++ drivers/gpu/drm/nouveau/nouveau_display.c | 24 +++++++++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index c0fde6b9393c..38444ba22f0d 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1460,6 +1460,8 @@ nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence) struct nouveau_fence *new_fence = nouveau_fence_ref(fence); struct nouveau_fence *old_fence = NULL;
+ lockdep_assert_held(&nvbo->bo.resv->lock.base); + spin_lock(&nvbo->bo.bdev->fence_lock); old_fence = nvbo->bo.sync_obj; nvbo->bo.sync_obj = new_fence; diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index 817b7c5c06f0..9d3892a1af96 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -609,19 +609,30 @@ nouveau_crtc_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb, goto fail_free; }
+ mutex_lock(&chan->cli->mutex); + ret = ttm_bo_reserve(&old_bo->bo, true, false, false, NULL); + if (ret) + goto fail_unpin; + /* synchronise rendering channel with the kernel's channel */ spin_lock(&new_bo->bo.bdev->fence_lock); fence = nouveau_fence_ref(new_bo->bo.sync_obj); spin_unlock(&new_bo->bo.bdev->fence_lock); ret = nouveau_fence_sync(fence, chan); nouveau_fence_unref(&fence); - if (ret) + + if (ret) { + ttm_bo_unreserve(&new_bo->bo); goto fail_unpin; + }
- mutex_lock(&chan->cli->mutex); - ret = ttm_bo_reserve(&old_bo->bo, true, false, false, NULL); - if (ret) - goto fail_unlock; + if (new_bo != old_bo) { + ttm_bo_unreserve(&new_bo->bo); + + ret = ttm_bo_reserve(&old_bo->bo, true, false, false, NULL); + if (ret) + goto fail_unpin; + }
/* Initialize a page flip struct */ *s = (struct nouveau_page_flip_state) @@ -673,9 +684,8 @@ nouveau_crtc_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb,
fail_unreserve: ttm_bo_unreserve(&old_bo->bo); -fail_unlock: - mutex_unlock(&chan->cli->mutex); fail_unpin: + mutex_unlock(&chan->cli->mutex); if (old_bo != new_bo) nouveau_bo_unpin(new_bo); fail_free:
This is the last remaining function that doesn't use the reservation lock completely to fence off access to a buffer.
Signed-off-by: Maarten Lankhorst maarten.lankhorst@canonical.com --- drivers/gpu/drm/ttm/ttm_bo.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 07e02c4bf5a8..a8cffb6ee1f3 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -500,17 +500,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, if (ret) return ret;
- /* - * remove sync_obj with ttm_bo_wait, the wait should be - * finished, and no new wait object should have been added. - */ - spin_lock(&bdev->fence_lock); - ret = ttm_bo_wait(bo, false, false, true); - WARN_ON(ret); - spin_unlock(&bdev->fence_lock); - if (ret) - return ret; - spin_lock(&glob->lru_lock); ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);
@@ -526,8 +515,16 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, spin_unlock(&glob->lru_lock); return 0; } - } else - spin_unlock(&bdev->fence_lock); + + /* + * remove sync_obj with ttm_bo_wait, the wait should be + * finished, and no new wait object should have been added. + */ + spin_lock(&bdev->fence_lock); + ret = ttm_bo_wait(bo, false, false, true); + WARN_ON(ret); + } + spin_unlock(&bdev->fence_lock);
if (ret || unlikely(list_empty(&bo->ddestroy))) { ww_mutex_unlock(&bo->resv->lock); @@ -1536,6 +1533,8 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, void *sync_obj; int ret = 0;
+ lockdep_assert_held(&bo->resv->lock.base); + if (likely(bo->sync_obj == NULL)) return 0;
No users are left, kill it off! :D
Signed-off-by: Maarten Lankhorst maarten.lankhorst@canonical.com --- drivers/gpu/drm/nouveau/nouveau_bo.c | 23 ++++------ drivers/gpu/drm/nouveau/nouveau_display.c | 6 +-- drivers/gpu/drm/nouveau/nouveau_gem.c | 16 +------ drivers/gpu/drm/qxl/qxl_cmd.c | 2 - drivers/gpu/drm/qxl/qxl_fence.c | 4 -- drivers/gpu/drm/qxl/qxl_object.h | 2 - drivers/gpu/drm/qxl/qxl_release.c | 2 - drivers/gpu/drm/radeon/radeon_display.c | 2 - drivers/gpu/drm/radeon/radeon_object.c | 2 - drivers/gpu/drm/ttm/ttm_bo.c | 75 ++++++++----------------------- drivers/gpu/drm/ttm/ttm_bo_util.c | 5 --- drivers/gpu/drm/ttm/ttm_bo_vm.c | 3 -- drivers/gpu/drm/ttm/ttm_execbuf_util.c | 2 - drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 3 -- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 2 - include/drm/ttm/ttm_bo_api.h | 5 +-- include/drm/ttm/ttm_bo_driver.h | 3 -- 17 files changed, 30 insertions(+), 127 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 38444ba22f0d..8e760b74cf0b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1454,26 +1454,19 @@ nouveau_ttm_tt_unpopulate(struct ttm_tt *ttm) ttm_pool_unpopulate(ttm); }
+static void +nouveau_bo_fence_unref(void **sync_obj) +{ + nouveau_fence_unref((struct nouveau_fence **)sync_obj); +} + void nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence) { - struct nouveau_fence *new_fence = nouveau_fence_ref(fence); - struct nouveau_fence *old_fence = NULL; - lockdep_assert_held(&nvbo->bo.resv->lock.base);
- spin_lock(&nvbo->bo.bdev->fence_lock); - old_fence = nvbo->bo.sync_obj; - nvbo->bo.sync_obj = new_fence; - spin_unlock(&nvbo->bo.bdev->fence_lock); - - nouveau_fence_unref(&old_fence); -} - -static void -nouveau_bo_fence_unref(void **sync_obj) -{ - nouveau_fence_unref((struct nouveau_fence **)sync_obj); + nouveau_bo_fence_unref(&nvbo->bo.sync_obj); + nvbo->bo.sync_obj = nouveau_fence_ref(fence); }
static void * diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index 9d3892a1af96..e3117c78ea95 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -615,11 +615,7 @@ nouveau_crtc_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb, goto fail_unpin;
/* synchronise rendering channel with the kernel's channel */ - spin_lock(&new_bo->bo.bdev->fence_lock); - fence = nouveau_fence_ref(new_bo->bo.sync_obj); - spin_unlock(&new_bo->bo.bdev->fence_lock); - ret = nouveau_fence_sync(fence, chan); - nouveau_fence_unref(&fence); + ret = nouveau_fence_sync(new_bo->bo.sync_obj, chan);
if (ret) { ttm_bo_unreserve(&new_bo->bo); diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 24e9c58da8aa..fa9364fc0a66 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -105,9 +105,7 @@ nouveau_gem_object_unmap(struct nouveau_bo *nvbo, struct nouveau_vma *vma) list_del(&vma->head);
if (mapped) { - spin_lock(&nvbo->bo.bdev->fence_lock); fence = nouveau_fence_ref(nvbo->bo.sync_obj); - spin_unlock(&nvbo->bo.bdev->fence_lock); }
if (fence) { @@ -434,17 +432,11 @@ retry: static int validate_sync(struct nouveau_channel *chan, struct nouveau_bo *nvbo) { - struct nouveau_fence *fence = NULL; + struct nouveau_fence *fence = nvbo->bo.sync_obj; int ret = 0;
- spin_lock(&nvbo->bo.bdev->fence_lock); - fence = nouveau_fence_ref(nvbo->bo.sync_obj); - spin_unlock(&nvbo->bo.bdev->fence_lock); - - if (fence) { + if (fence) ret = nouveau_fence_sync(fence, chan); - nouveau_fence_unref(&fence); - }
return ret; } @@ -669,9 +661,7 @@ nouveau_gem_pushbuf_reloc_apply(struct nouveau_cli *cli, data |= r->vor; }
- spin_lock(&nvbo->bo.bdev->fence_lock); ret = ttm_bo_wait(&nvbo->bo, false, false, false); - spin_unlock(&nvbo->bo.bdev->fence_lock); if (ret) { NV_ERROR(cli, "reloc wait_idle failed: %d\n", ret); break; @@ -904,11 +894,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0); if (!ret) { - spin_lock(&nvbo->bo.bdev->fence_lock); ret = ttm_bo_wait(&nvbo->bo, true, true, true); if (!no_wait && ret) fence = nouveau_fence_ref(nvbo->bo.sync_obj); - spin_unlock(&nvbo->bo.bdev->fence_lock);
ttm_bo_unreserve(&nvbo->bo); } diff --git a/drivers/gpu/drm/qxl/qxl_cmd.c b/drivers/gpu/drm/qxl/qxl_cmd.c index eb89653a7a17..45fad7b45486 100644 --- a/drivers/gpu/drm/qxl/qxl_cmd.c +++ b/drivers/gpu/drm/qxl/qxl_cmd.c @@ -628,9 +628,7 @@ static int qxl_reap_surf(struct qxl_device *qdev, struct qxl_bo *surf, bool stal if (stall) mutex_unlock(&qdev->surf_evict_mutex);
- spin_lock(&surf->tbo.bdev->fence_lock); ret = ttm_bo_wait(&surf->tbo, true, true, !stall); - spin_unlock(&surf->tbo.bdev->fence_lock);
if (stall) mutex_lock(&qdev->surf_evict_mutex); diff --git a/drivers/gpu/drm/qxl/qxl_fence.c b/drivers/gpu/drm/qxl/qxl_fence.c index ae59e91cfb9a..c7248418117d 100644 --- a/drivers/gpu/drm/qxl/qxl_fence.c +++ b/drivers/gpu/drm/qxl/qxl_fence.c @@ -60,9 +60,6 @@ int qxl_fence_remove_release(struct qxl_fence *qfence, uint32_t rel_id) { void *ret; int retval = 0; - struct qxl_bo *bo = container_of(qfence, struct qxl_bo, fence); - - spin_lock(&bo->tbo.bdev->fence_lock);
ret = radix_tree_delete(&qfence->tree, rel_id); if (ret == qfence) @@ -71,7 +68,6 @@ int qxl_fence_remove_release(struct qxl_fence *qfence, uint32_t rel_id) DRM_DEBUG("didn't find fence in radix tree for %d\n", rel_id); retval = -ENOENT; } - spin_unlock(&bo->tbo.bdev->fence_lock); return retval; }
diff --git a/drivers/gpu/drm/qxl/qxl_object.h b/drivers/gpu/drm/qxl/qxl_object.h index d458a140c024..98395b223ad0 100644 --- a/drivers/gpu/drm/qxl/qxl_object.h +++ b/drivers/gpu/drm/qxl/qxl_object.h @@ -76,12 +76,10 @@ static inline int qxl_bo_wait(struct qxl_bo *bo, u32 *mem_type, } return r; } - spin_lock(&bo->tbo.bdev->fence_lock); if (mem_type) *mem_type = bo->tbo.mem.mem_type; if (bo->tbo.sync_obj) r = ttm_bo_wait(&bo->tbo, true, true, no_wait); - spin_unlock(&bo->tbo.bdev->fence_lock); ttm_bo_unreserve(&bo->tbo); return r; } diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 6f71cadc7c9b..44f43e6adc81 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -337,7 +337,6 @@ void qxl_release_fence_buffer_objects(struct qxl_release *release) glob = bo->glob;
spin_lock(&glob->lru_lock); - spin_lock(&bdev->fence_lock);
list_for_each_entry(entry, &release->bos, head) { bo = entry->bo; @@ -351,7 +350,6 @@ void qxl_release_fence_buffer_objects(struct qxl_release *release) ttm_bo_add_to_lru(bo); ww_mutex_unlock(&bo->resv->lock); } - spin_unlock(&bdev->fence_lock); spin_unlock(&glob->lru_lock); ww_acquire_fini(&release->ticket); } diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c index 7b253815a323..f2db8c6606e8 100644 --- a/drivers/gpu/drm/radeon/radeon_display.c +++ b/drivers/gpu/drm/radeon/radeon_display.c @@ -380,10 +380,8 @@ static int radeon_crtc_page_flip(struct drm_crtc *crtc, obj = new_radeon_fb->obj; rbo = gem_to_radeon_bo(obj);
- spin_lock(&rbo->tbo.bdev->fence_lock); if (rbo->tbo.sync_obj) work->fence = radeon_fence_ref(rbo->tbo.sync_obj); - spin_unlock(&rbo->tbo.bdev->fence_lock);
INIT_WORK(&work->work, radeon_unpin_work_func);
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index a3b92bfbe81b..f1389173f9ac 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -612,12 +612,10 @@ int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type, bool no_wait) r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0); if (unlikely(r != 0)) return r; - spin_lock(&bo->tbo.bdev->fence_lock); if (mem_type) *mem_type = bo->tbo.mem.mem_type; if (bo->tbo.sync_obj) r = ttm_bo_wait(&bo->tbo, true, true, no_wait); - spin_unlock(&bo->tbo.bdev->fence_lock); ttm_bo_unreserve(&bo->tbo); return r; } diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index a8cffb6ee1f3..df1725fa4b9d 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -412,24 +412,20 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) spin_lock(&glob->lru_lock); ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);
- spin_lock(&bdev->fence_lock); - (void) ttm_bo_wait(bo, false, false, true); - if (!ret && !bo->sync_obj) { - spin_unlock(&bdev->fence_lock); - put_count = ttm_bo_del_from_lru(bo); + if (!ret) { + (void) ttm_bo_wait(bo, false, false, true);
- spin_unlock(&glob->lru_lock); - ttm_bo_cleanup_memtype_use(bo); + if (!bo->sync_obj) { + put_count = ttm_bo_del_from_lru(bo);
- ttm_bo_list_ref_sub(bo, put_count, true); + spin_unlock(&glob->lru_lock); + ttm_bo_cleanup_memtype_use(bo);
- return; - } - if (bo->sync_obj) - sync_obj = driver->sync_obj_ref(bo->sync_obj); - spin_unlock(&bdev->fence_lock); + ttm_bo_list_ref_sub(bo, put_count, true);
- if (!ret) { + return; + } + sync_obj = driver->sync_obj_ref(bo->sync_obj);
/* * Make NO_EVICT bos immediately available to @@ -478,7 +474,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, int put_count; int ret;
- spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, true);
if (ret && !no_wait_gpu) { @@ -490,7 +485,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, * no new sync objects can be attached. */ sync_obj = driver->sync_obj_ref(bo->sync_obj); - spin_unlock(&bdev->fence_lock);
ww_mutex_unlock(&bo->resv->lock); spin_unlock(&glob->lru_lock); @@ -520,11 +514,9 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, * remove sync_obj with ttm_bo_wait, the wait should be * finished, and no new wait object should have been added. */ - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, true); WARN_ON(ret); } - spin_unlock(&bdev->fence_lock);
if (ret || unlikely(list_empty(&bo->ddestroy))) { ww_mutex_unlock(&bo->resv->lock); @@ -662,9 +654,7 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible, struct ttm_placement placement; int ret = 0;
- spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); - spin_unlock(&bdev->fence_lock);
if (unlikely(ret != 0)) { if (ret != -ERESTARTSYS) { @@ -961,7 +951,6 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo, { int ret = 0; struct ttm_mem_reg mem; - struct ttm_bo_device *bdev = bo->bdev;
lockdep_assert_held(&bo->resv->lock.base);
@@ -970,9 +959,7 @@ int ttm_bo_move_buffer(struct ttm_buffer_object *bo, * Have the driver move function wait for idle when necessary, * instead of doing it here. */ - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); - spin_unlock(&bdev->fence_lock); if (ret) return ret; mem.num_pages = bo->num_pages; @@ -1471,7 +1458,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev, bdev->glob = glob; bdev->need_dma32 = need_dma32; bdev->val_seq = 0; - spin_lock_init(&bdev->fence_lock); mutex_lock(&glob->device_list_mutex); list_add_tail(&bdev->device_list, &glob->device_list); mutex_unlock(&glob->device_list_mutex); @@ -1529,7 +1515,6 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, bool lazy, bool interruptible, bool no_wait) { struct ttm_bo_driver *driver = bo->bdev->driver; - struct ttm_bo_device *bdev = bo->bdev; void *sync_obj; int ret = 0;
@@ -1538,53 +1523,33 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, if (likely(bo->sync_obj == NULL)) return 0;
- while (bo->sync_obj) { - + if (bo->sync_obj) { if (driver->sync_obj_signaled(bo->sync_obj)) { - void *tmp_obj = bo->sync_obj; - bo->sync_obj = NULL; + driver->sync_obj_unref(&bo->sync_obj); clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); - spin_unlock(&bdev->fence_lock); - driver->sync_obj_unref(&tmp_obj); - spin_lock(&bdev->fence_lock); - continue; + return 0; }
if (no_wait) return -EBUSY;
sync_obj = driver->sync_obj_ref(bo->sync_obj); - spin_unlock(&bdev->fence_lock); ret = driver->sync_obj_wait(sync_obj, lazy, interruptible); - if (unlikely(ret != 0)) { - driver->sync_obj_unref(&sync_obj); - spin_lock(&bdev->fence_lock); - return ret; - } - spin_lock(&bdev->fence_lock); - if (likely(bo->sync_obj == sync_obj)) { - void *tmp_obj = bo->sync_obj; - bo->sync_obj = NULL; + + if (likely(ret == 0)) { clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); - spin_unlock(&bdev->fence_lock); - driver->sync_obj_unref(&sync_obj); - driver->sync_obj_unref(&tmp_obj); - spin_lock(&bdev->fence_lock); - } else { - spin_unlock(&bdev->fence_lock); - driver->sync_obj_unref(&sync_obj); - spin_lock(&bdev->fence_lock); + driver->sync_obj_unref(&bo->sync_obj); } + driver->sync_obj_unref(&sync_obj); } - return 0; + return ret; } EXPORT_SYMBOL(ttm_bo_wait);
int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait) { - struct ttm_bo_device *bdev = bo->bdev; int ret = 0;
/* @@ -1594,9 +1559,7 @@ int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait) ret = ttm_bo_reserve(bo, true, no_wait, false, 0); if (unlikely(ret != 0)) return ret; - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, true, no_wait); - spin_unlock(&bdev->fence_lock); if (likely(ret == 0)) atomic_inc(&bo->cpu_writers); ttm_bo_unreserve(bo); @@ -1653,9 +1616,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink) * Wait for GPU, then move to system cached. */
- spin_lock(&bo->bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, false); - spin_unlock(&bo->bdev->fence_lock);
if (unlikely(ret != 0)) goto out; diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 406152152315..f3d12b3fd603 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -466,12 +466,10 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo, drm_vma_node_reset(&fbo->vma_node); atomic_set(&fbo->cpu_writers, 0);
- spin_lock(&bdev->fence_lock); if (bo->sync_obj) fbo->sync_obj = driver->sync_obj_ref(bo->sync_obj); else fbo->sync_obj = NULL; - spin_unlock(&bdev->fence_lock); kref_init(&fbo->list_kref); kref_init(&fbo->kref); fbo->destroy = &ttm_transfered_destroy; @@ -657,7 +655,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, struct ttm_buffer_object *ghost_obj; void *tmp_obj = NULL;
- spin_lock(&bdev->fence_lock); if (bo->sync_obj) { tmp_obj = bo->sync_obj; bo->sync_obj = NULL; @@ -665,7 +662,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, bo->sync_obj = driver->sync_obj_ref(sync_obj); if (evict) { ret = ttm_bo_wait(bo, false, false, false); - spin_unlock(&bdev->fence_lock); if (tmp_obj) driver->sync_obj_unref(&tmp_obj); if (ret) @@ -688,7 +684,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, */
set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); - spin_unlock(&bdev->fence_lock); if (tmp_obj) driver->sync_obj_unref(&tmp_obj);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 6440eeac22d2..15d63e4ca6f1 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -45,10 +45,8 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct ttm_bo_device *bdev = bo->bdev; int ret = 0;
- spin_lock(&bdev->fence_lock); if (likely(!test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags))) goto out_unlock;
@@ -82,7 +80,6 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, VM_FAULT_NOPAGE;
out_unlock: - spin_unlock(&bdev->fence_lock); return ret; }
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c index ec36206da95a..bedb3fba4493 100644 --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c @@ -181,7 +181,6 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket, glob = bo->glob;
spin_lock(&glob->lru_lock); - spin_lock(&bdev->fence_lock);
list_for_each_entry(entry, list, head) { bo = entry->bo; @@ -190,7 +189,6 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket, ttm_bo_add_to_lru(bo); ww_mutex_unlock(&bo->resv->lock); } - spin_unlock(&bdev->fence_lock); spin_unlock(&glob->lru_lock); if (ticket) ww_acquire_fini(ticket); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index c7a549694e59..6d7628b5cc22 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -262,13 +262,10 @@ static void vmw_dummy_query_bo_prepare(struct vmw_private *dev_priv) volatile SVGA3dQueryResult *result; bool dummy; int ret; - struct ttm_bo_device *bdev = &dev_priv->bdev; struct ttm_buffer_object *bo = dev_priv->dummy_query_bo;
ttm_bo_reserve(bo, false, false, false, 0); - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, false); - spin_unlock(&bdev->fence_lock); if (unlikely(ret != 0)) (void) vmw_fallback_wait(dev_priv, false, true, 0, false, 10*HZ); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c index e7a73f651a5d..6055deedd2a7 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c @@ -1273,12 +1273,10 @@ void vmw_fence_single_bo(struct ttm_buffer_object *bo, else driver->sync_obj_ref(fence);
- spin_lock(&bdev->fence_lock);
old_fence_obj = bo->sync_obj; bo->sync_obj = fence;
- spin_unlock(&bdev->fence_lock);
if (old_fence_obj) vmw_fence_obj_unreference(&old_fence_obj); diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index ee127ec33c60..f34d59b67218 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -227,10 +227,7 @@ struct ttm_buffer_object { struct list_head io_reserve_lru;
/** - * Members protected by struct buffer_object_device::fence_lock - * In addition, setting sync_obj to anything else - * than NULL requires bo::reserved to be held. This allows for - * checking NULL while reserved but not holding the mentioned lock. + * Members protected by a bo reservation. */
void *sync_obj; diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 8639c85d61c4..aed1a1f070ef 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -518,8 +518,6 @@ struct ttm_bo_global { * * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver. * @man: An array of mem_type_managers. - * @fence_lock: Protects the synchronizing members on *all* bos belonging - * to this device. * @vma_manager: Address space manager * lru_lock: Spinlock that protects the buffer+device lru lists and * ddestroy lists. @@ -539,7 +537,6 @@ struct ttm_bo_device { struct ttm_bo_global *glob; struct ttm_bo_driver *driver; struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES]; - spinlock_t fence_lock;
/* * Protected by internal locks.
No users are left, kill it off! :D
Signed-off-by: Maarten Lankhorst maarten.lankhorst@canonical.com --- Fixed to apply on top of drm-next.
Also, for nouveau "[PATCH 1/2] drm/nouveau: hold mutex while syncing to kernel channel" is required, else this patch will fail to apply.
drivers/gpu/drm/nouveau/nouveau_bo.c | 23 ++++------ drivers/gpu/drm/nouveau/nouveau_display.c | 6 +-- drivers/gpu/drm/nouveau/nouveau_gem.c | 16 +------ drivers/gpu/drm/qxl/qxl_cmd.c | 2 - drivers/gpu/drm/qxl/qxl_fence.c | 4 -- drivers/gpu/drm/qxl/qxl_object.h | 2 - drivers/gpu/drm/qxl/qxl_release.c | 2 - drivers/gpu/drm/radeon/radeon_display.c | 2 - drivers/gpu/drm/radeon/radeon_object.c | 2 - drivers/gpu/drm/ttm/ttm_bo.c | 75 ++++++++----------------------- drivers/gpu/drm/ttm/ttm_bo_util.c | 5 --- drivers/gpu/drm/ttm/ttm_bo_vm.c | 3 -- drivers/gpu/drm/ttm/ttm_execbuf_util.c | 2 - drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c | 4 -- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 16 +++---- include/drm/ttm/ttm_bo_api.h | 5 +-- include/drm/ttm/ttm_bo_driver.h | 3 -- 17 files changed, 36 insertions(+), 136 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 38444ba22f0d..8e760b74cf0b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1454,26 +1454,19 @@ nouveau_ttm_tt_unpopulate(struct ttm_tt *ttm) ttm_pool_unpopulate(ttm); }
+static void +nouveau_bo_fence_unref(void **sync_obj) +{ + nouveau_fence_unref((struct nouveau_fence **)sync_obj); +} + void nouveau_bo_fence(struct nouveau_bo *nvbo, struct nouveau_fence *fence) { - struct nouveau_fence *new_fence = nouveau_fence_ref(fence); - struct nouveau_fence *old_fence = NULL; - lockdep_assert_held(&nvbo->bo.resv->lock.base);
- spin_lock(&nvbo->bo.bdev->fence_lock); - old_fence = nvbo->bo.sync_obj; - nvbo->bo.sync_obj = new_fence; - spin_unlock(&nvbo->bo.bdev->fence_lock); - - nouveau_fence_unref(&old_fence); -} - -static void -nouveau_bo_fence_unref(void **sync_obj) -{ - nouveau_fence_unref((struct nouveau_fence **)sync_obj); + nouveau_bo_fence_unref(&nvbo->bo.sync_obj); + nvbo->bo.sync_obj = nouveau_fence_ref(fence); }
static void * diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index 9d3892a1af96..e3117c78ea95 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -615,11 +615,7 @@ nouveau_crtc_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb, goto fail_unpin;
/* synchronise rendering channel with the kernel's channel */ - spin_lock(&new_bo->bo.bdev->fence_lock); - fence = nouveau_fence_ref(new_bo->bo.sync_obj); - spin_unlock(&new_bo->bo.bdev->fence_lock); - ret = nouveau_fence_sync(fence, chan); - nouveau_fence_unref(&fence); + ret = nouveau_fence_sync(new_bo->bo.sync_obj, chan);
if (ret) { ttm_bo_unreserve(&new_bo->bo); diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 0e35aafc628e..e15178a5893b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -105,9 +105,7 @@ nouveau_gem_object_unmap(struct nouveau_bo *nvbo, struct nouveau_vma *vma) list_del(&vma->head);
if (mapped) { - spin_lock(&nvbo->bo.bdev->fence_lock); fence = nouveau_fence_ref(nvbo->bo.sync_obj); - spin_unlock(&nvbo->bo.bdev->fence_lock); }
if (fence) { @@ -434,17 +432,11 @@ retry: static int validate_sync(struct nouveau_channel *chan, struct nouveau_bo *nvbo) { - struct nouveau_fence *fence = NULL; + struct nouveau_fence *fence = nvbo->bo.sync_obj; int ret = 0;
- spin_lock(&nvbo->bo.bdev->fence_lock); - fence = nouveau_fence_ref(nvbo->bo.sync_obj); - spin_unlock(&nvbo->bo.bdev->fence_lock); - - if (fence) { + if (fence) ret = nouveau_fence_sync(fence, chan); - nouveau_fence_unref(&fence); - }
return ret; } @@ -669,9 +661,7 @@ nouveau_gem_pushbuf_reloc_apply(struct nouveau_cli *cli, data |= r->vor; }
- spin_lock(&nvbo->bo.bdev->fence_lock); ret = ttm_bo_wait(&nvbo->bo, false, false, false); - spin_unlock(&nvbo->bo.bdev->fence_lock); if (ret) { NV_ERROR(cli, "reloc wait_idle failed: %d\n", ret); break; @@ -904,11 +894,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void *data,
ret = ttm_bo_reserve(&nvbo->bo, true, false, false, 0); if (!ret) { - spin_lock(&nvbo->bo.bdev->fence_lock); ret = ttm_bo_wait(&nvbo->bo, true, true, true); if (!no_wait && ret) fence = nouveau_fence_ref(nvbo->bo.sync_obj); - spin_unlock(&nvbo->bo.bdev->fence_lock);
ttm_bo_unreserve(&nvbo->bo); } diff --git a/drivers/gpu/drm/qxl/qxl_cmd.c b/drivers/gpu/drm/qxl/qxl_cmd.c index eb89653a7a17..45fad7b45486 100644 --- a/drivers/gpu/drm/qxl/qxl_cmd.c +++ b/drivers/gpu/drm/qxl/qxl_cmd.c @@ -628,9 +628,7 @@ static int qxl_reap_surf(struct qxl_device *qdev, struct qxl_bo *surf, bool stal if (stall) mutex_unlock(&qdev->surf_evict_mutex);
- spin_lock(&surf->tbo.bdev->fence_lock); ret = ttm_bo_wait(&surf->tbo, true, true, !stall); - spin_unlock(&surf->tbo.bdev->fence_lock);
if (stall) mutex_lock(&qdev->surf_evict_mutex); diff --git a/drivers/gpu/drm/qxl/qxl_fence.c b/drivers/gpu/drm/qxl/qxl_fence.c index ae59e91cfb9a..c7248418117d 100644 --- a/drivers/gpu/drm/qxl/qxl_fence.c +++ b/drivers/gpu/drm/qxl/qxl_fence.c @@ -60,9 +60,6 @@ int qxl_fence_remove_release(struct qxl_fence *qfence, uint32_t rel_id) { void *ret; int retval = 0; - struct qxl_bo *bo = container_of(qfence, struct qxl_bo, fence); - - spin_lock(&bo->tbo.bdev->fence_lock);
ret = radix_tree_delete(&qfence->tree, rel_id); if (ret == qfence) @@ -71,7 +68,6 @@ int qxl_fence_remove_release(struct qxl_fence *qfence, uint32_t rel_id) DRM_DEBUG("didn't find fence in radix tree for %d\n", rel_id); retval = -ENOENT; } - spin_unlock(&bo->tbo.bdev->fence_lock); return retval; }
diff --git a/drivers/gpu/drm/qxl/qxl_object.h b/drivers/gpu/drm/qxl/qxl_object.h index d458a140c024..98395b223ad0 100644 --- a/drivers/gpu/drm/qxl/qxl_object.h +++ b/drivers/gpu/drm/qxl/qxl_object.h @@ -76,12 +76,10 @@ static inline int qxl_bo_wait(struct qxl_bo *bo, u32 *mem_type, } return r; } - spin_lock(&bo->tbo.bdev->fence_lock); if (mem_type) *mem_type = bo->tbo.mem.mem_type; if (bo->tbo.sync_obj) r = ttm_bo_wait(&bo->tbo, true, true, no_wait); - spin_unlock(&bo->tbo.bdev->fence_lock); ttm_bo_unreserve(&bo->tbo); return r; } diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 6f71cadc7c9b..44f43e6adc81 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -337,7 +337,6 @@ void qxl_release_fence_buffer_objects(struct qxl_release *release) glob = bo->glob;
spin_lock(&glob->lru_lock); - spin_lock(&bdev->fence_lock);
list_for_each_entry(entry, &release->bos, head) { bo = entry->bo; @@ -351,7 +350,6 @@ void qxl_release_fence_buffer_objects(struct qxl_release *release) ttm_bo_add_to_lru(bo); ww_mutex_unlock(&bo->resv->lock); } - spin_unlock(&bdev->fence_lock); spin_unlock(&glob->lru_lock); ww_acquire_fini(&release->ticket); } diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c index 7ea647b84733..b62ba6f6507c 100644 --- a/drivers/gpu/drm/radeon/radeon_display.c +++ b/drivers/gpu/drm/radeon/radeon_display.c @@ -380,10 +380,8 @@ static int radeon_crtc_page_flip(struct drm_crtc *crtc, obj = new_radeon_fb->obj; rbo = gem_to_radeon_bo(obj);
- spin_lock(&rbo->tbo.bdev->fence_lock); if (rbo->tbo.sync_obj) work->fence = radeon_fence_ref(rbo->tbo.sync_obj); - spin_unlock(&rbo->tbo.bdev->fence_lock);
INIT_WORK(&work->work, radeon_unpin_work_func);
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index ffc5496cea2d..a2be41a21803 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -612,12 +612,10 @@ int radeon_bo_wait(struct radeon_bo *bo, u32 *mem_type, bool no_wait) r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0); if (unlikely(r != 0)) return r; - spin_lock(&bo->tbo.bdev->fence_lock); if (mem_type) *mem_type = bo->tbo.mem.mem_type; if (bo->tbo.sync_obj) r = ttm_bo_wait(&bo->tbo, true, true, no_wait); - spin_unlock(&bo->tbo.bdev->fence_lock); ttm_bo_unreserve(&bo->tbo); return r; } diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index b2b38b52f449..2c0528aba01c 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -412,24 +412,20 @@ static void ttm_bo_cleanup_refs_or_queue(struct ttm_buffer_object *bo) spin_lock(&glob->lru_lock); ret = ttm_bo_reserve_nolru(bo, false, true, false, 0);
- spin_lock(&bdev->fence_lock); - (void) ttm_bo_wait(bo, false, false, true); - if (!ret && !bo->sync_obj) { - spin_unlock(&bdev->fence_lock); - put_count = ttm_bo_del_from_lru(bo); + if (!ret) { + (void) ttm_bo_wait(bo, false, false, true);
- spin_unlock(&glob->lru_lock); - ttm_bo_cleanup_memtype_use(bo); + if (!bo->sync_obj) { + put_count = ttm_bo_del_from_lru(bo);
- ttm_bo_list_ref_sub(bo, put_count, true); + spin_unlock(&glob->lru_lock); + ttm_bo_cleanup_memtype_use(bo);
- return; - } - if (bo->sync_obj) - sync_obj = driver->sync_obj_ref(bo->sync_obj); - spin_unlock(&bdev->fence_lock); + ttm_bo_list_ref_sub(bo, put_count, true);
- if (!ret) { + return; + } + sync_obj = driver->sync_obj_ref(bo->sync_obj);
/* * Make NO_EVICT bos immediately available to @@ -478,7 +474,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, int put_count; int ret;
- spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, true);
if (ret && !no_wait_gpu) { @@ -490,7 +485,6 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, * no new sync objects can be attached. */ sync_obj = driver->sync_obj_ref(bo->sync_obj); - spin_unlock(&bdev->fence_lock);
ww_mutex_unlock(&bo->resv->lock); spin_unlock(&glob->lru_lock); @@ -520,11 +514,9 @@ static int ttm_bo_cleanup_refs_and_unlock(struct ttm_buffer_object *bo, * remove sync_obj with ttm_bo_wait, the wait should be * finished, and no new wait object should have been added. */ - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, true); WARN_ON(ret); } - spin_unlock(&bdev->fence_lock);
if (ret || unlikely(list_empty(&bo->ddestroy))) { ww_mutex_unlock(&bo->resv->lock); @@ -662,9 +654,7 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, bool interruptible, struct ttm_placement placement; int ret = 0;
- spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); - spin_unlock(&bdev->fence_lock);
if (unlikely(ret != 0)) { if (ret != -ERESTARTSYS) { @@ -961,7 +951,6 @@ static int ttm_bo_move_buffer(struct ttm_buffer_object *bo, { int ret = 0; struct ttm_mem_reg mem; - struct ttm_bo_device *bdev = bo->bdev;
lockdep_assert_held(&bo->resv->lock.base);
@@ -970,9 +959,7 @@ static int ttm_bo_move_buffer(struct ttm_buffer_object *bo, * Have the driver move function wait for idle when necessary, * instead of doing it here. */ - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu); - spin_unlock(&bdev->fence_lock); if (ret) return ret; mem.num_pages = bo->num_pages; @@ -1471,7 +1458,6 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev, bdev->glob = glob; bdev->need_dma32 = need_dma32; bdev->val_seq = 0; - spin_lock_init(&bdev->fence_lock); mutex_lock(&glob->device_list_mutex); list_add_tail(&bdev->device_list, &glob->device_list); mutex_unlock(&glob->device_list_mutex); @@ -1529,7 +1515,6 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, bool lazy, bool interruptible, bool no_wait) { struct ttm_bo_driver *driver = bo->bdev->driver; - struct ttm_bo_device *bdev = bo->bdev; void *sync_obj; int ret = 0;
@@ -1538,53 +1523,33 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, if (likely(bo->sync_obj == NULL)) return 0;
- while (bo->sync_obj) { - + if (bo->sync_obj) { if (driver->sync_obj_signaled(bo->sync_obj)) { - void *tmp_obj = bo->sync_obj; - bo->sync_obj = NULL; + driver->sync_obj_unref(&bo->sync_obj); clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); - spin_unlock(&bdev->fence_lock); - driver->sync_obj_unref(&tmp_obj); - spin_lock(&bdev->fence_lock); - continue; + return 0; }
if (no_wait) return -EBUSY;
sync_obj = driver->sync_obj_ref(bo->sync_obj); - spin_unlock(&bdev->fence_lock); ret = driver->sync_obj_wait(sync_obj, lazy, interruptible); - if (unlikely(ret != 0)) { - driver->sync_obj_unref(&sync_obj); - spin_lock(&bdev->fence_lock); - return ret; - } - spin_lock(&bdev->fence_lock); - if (likely(bo->sync_obj == sync_obj)) { - void *tmp_obj = bo->sync_obj; - bo->sync_obj = NULL; + + if (likely(ret == 0)) { clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); - spin_unlock(&bdev->fence_lock); - driver->sync_obj_unref(&sync_obj); - driver->sync_obj_unref(&tmp_obj); - spin_lock(&bdev->fence_lock); - } else { - spin_unlock(&bdev->fence_lock); - driver->sync_obj_unref(&sync_obj); - spin_lock(&bdev->fence_lock); + driver->sync_obj_unref(&bo->sync_obj); } + driver->sync_obj_unref(&sync_obj); } - return 0; + return ret; } EXPORT_SYMBOL(ttm_bo_wait);
int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait) { - struct ttm_bo_device *bdev = bo->bdev; int ret = 0;
/* @@ -1594,9 +1559,7 @@ int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait) ret = ttm_bo_reserve(bo, true, no_wait, false, 0); if (unlikely(ret != 0)) return ret; - spin_lock(&bdev->fence_lock); ret = ttm_bo_wait(bo, false, true, no_wait); - spin_unlock(&bdev->fence_lock); if (likely(ret == 0)) atomic_inc(&bo->cpu_writers); ttm_bo_unreserve(bo); @@ -1653,9 +1616,7 @@ static int ttm_bo_swapout(struct ttm_mem_shrink *shrink) * Wait for GPU, then move to system cached. */
- spin_lock(&bo->bdev->fence_lock); ret = ttm_bo_wait(bo, false, false, false); - spin_unlock(&bo->bdev->fence_lock);
if (unlikely(ret != 0)) goto out; diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index 1df856f78568..23db594e55c0 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -466,12 +466,10 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo, drm_vma_node_reset(&fbo->vma_node); atomic_set(&fbo->cpu_writers, 0);
- spin_lock(&bdev->fence_lock); if (bo->sync_obj) fbo->sync_obj = driver->sync_obj_ref(bo->sync_obj); else fbo->sync_obj = NULL; - spin_unlock(&bdev->fence_lock); kref_init(&fbo->list_kref); kref_init(&fbo->kref); fbo->destroy = &ttm_transfered_destroy; @@ -657,7 +655,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, struct ttm_buffer_object *ghost_obj; void *tmp_obj = NULL;
- spin_lock(&bdev->fence_lock); if (bo->sync_obj) { tmp_obj = bo->sync_obj; bo->sync_obj = NULL; @@ -665,7 +662,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, bo->sync_obj = driver->sync_obj_ref(sync_obj); if (evict) { ret = ttm_bo_wait(bo, false, false, false); - spin_unlock(&bdev->fence_lock); if (tmp_obj) driver->sync_obj_unref(&tmp_obj); if (ret) @@ -688,7 +684,6 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo, */
set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags); - spin_unlock(&bdev->fence_lock); if (tmp_obj) driver->sync_obj_unref(&tmp_obj);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 801231c9ae48..6bf22391999d 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -45,10 +45,8 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct ttm_bo_device *bdev = bo->bdev; int ret = 0;
- spin_lock(&bdev->fence_lock); if (likely(!test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags))) goto out_unlock;
@@ -82,7 +80,6 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, VM_FAULT_NOPAGE;
out_unlock: - spin_unlock(&bdev->fence_lock); return ret; }
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c index ec36206da95a..bedb3fba4493 100644 --- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c +++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c @@ -181,7 +181,6 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket, glob = bo->glob;
spin_lock(&glob->lru_lock); - spin_lock(&bdev->fence_lock);
list_for_each_entry(entry, list, head) { bo = entry->bo; @@ -190,7 +189,6 @@ void ttm_eu_fence_buffer_objects(struct ww_acquire_ctx *ticket, ttm_bo_add_to_lru(bo); ww_mutex_unlock(&bo->resv->lock); } - spin_unlock(&bdev->fence_lock); spin_unlock(&glob->lru_lock); if (ticket) ww_acquire_fini(ticket); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c index 6327cfc36805..4a36bb1dc525 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_buffer.c @@ -829,11 +829,7 @@ static void vmw_move_notify(struct ttm_buffer_object *bo, */ static void vmw_swap_notify(struct ttm_buffer_object *bo) { - struct ttm_bo_device *bdev = bo->bdev; - - spin_lock(&bdev->fence_lock); ttm_bo_wait(bo, false, false, false); - spin_unlock(&bdev->fence_lock); }
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c index 9bfde71a2232..fdc76323c534 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c @@ -554,12 +554,13 @@ static int vmw_user_dmabuf_synccpu_grab(struct vmw_user_dma_buffer *user_bo, int ret;
if (flags & drm_vmw_synccpu_allow_cs) { - struct ttm_bo_device *bdev = bo->bdev; + int try = !!(flags & drm_vmw_synccpu_dontblock);
- spin_lock(&bdev->fence_lock); - ret = ttm_bo_wait(bo, false, true, - !!(flags & drm_vmw_synccpu_dontblock)); - spin_unlock(&bdev->fence_lock); + ret = ttm_bo_reserve(bo, false, try, false, NULL); + if (!ret) { + ret = ttm_bo_wait(bo, false, true, try); + ttm_bo_unreserve(bo); + } return ret; }
@@ -1419,12 +1420,10 @@ void vmw_fence_single_bo(struct ttm_buffer_object *bo, else driver->sync_obj_ref(fence);
- spin_lock(&bdev->fence_lock);
old_fence_obj = bo->sync_obj; bo->sync_obj = fence;
- spin_unlock(&bdev->fence_lock);
if (old_fence_obj) vmw_fence_obj_unreference(&old_fence_obj); @@ -1465,7 +1464,6 @@ void vmw_resource_move_notify(struct ttm_buffer_object *bo,
if (mem->mem_type != VMW_PL_MOB) { struct vmw_resource *res, *n; - struct ttm_bo_device *bdev = bo->bdev; struct ttm_validate_buffer val_buf;
val_buf.bo = bo; @@ -1481,9 +1479,7 @@ void vmw_resource_move_notify(struct ttm_buffer_object *bo, list_del_init(&res->mob_head); }
- spin_lock(&bdev->fence_lock); (void) ttm_bo_wait(bo, false, false, false); - spin_unlock(&bdev->fence_lock); } }
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index ee127ec33c60..f34d59b67218 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -227,10 +227,7 @@ struct ttm_buffer_object { struct list_head io_reserve_lru;
/** - * Members protected by struct buffer_object_device::fence_lock - * In addition, setting sync_obj to anything else - * than NULL requires bo::reserved to be held. This allows for - * checking NULL while reserved but not holding the mentioned lock. + * Members protected by a bo reservation. */
void *sync_obj; diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 32d34ebf0706..e7045bc12d3b 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -518,8 +518,6 @@ struct ttm_bo_global { * * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver. * @man: An array of mem_type_managers. - * @fence_lock: Protects the synchronizing members on *all* bos belonging - * to this device. * @vma_manager: Address space manager * lru_lock: Spinlock that protects the buffer+device lru lists and * ddestroy lists. @@ -539,7 +537,6 @@ struct ttm_bo_device { struct ttm_bo_global *glob; struct ttm_bo_driver *driver; struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES]; - spinlock_t fence_lock;
/* * Protected by internal locks.
dri-devel@lists.freedesktop.org