Hold the ring lock the whole time the reset is in progress; otherwise another process can submit new jobs.
Signed-off-by: Christian König <deathsimple@vodafone.de>
---
 drivers/gpu/drm/radeon/evergreen.c     |  8 ++++----
 drivers/gpu/drm/radeon/ni.c            |  8 ++++----
 drivers/gpu/drm/radeon/r100.c          |  8 ++++----
 drivers/gpu/drm/radeon/r300.c          |  4 ++--
 drivers/gpu/drm/radeon/r420.c          |  8 ++++----
 drivers/gpu/drm/radeon/r600.c          |  8 ++++----
 drivers/gpu/drm/radeon/radeon_device.c | 14 ++++++------
 drivers/gpu/drm/radeon/rv515.c         |  4 ++--
 drivers/gpu/drm/radeon/si.c            | 12 ++++++------
 9 files changed, 38 insertions(+), 36 deletions(-)
diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index f716e08..09e6da8 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -1418,7 +1418,7 @@ static int evergreen_cp_start(struct radeon_device *rdev)
         int r, i;
         uint32_t cp_me;
 
-        r = radeon_ring_lock(rdev, ring, 7);
+        r = radeon_ring_alloc(rdev, ring, 7);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
                 return r;
@@ -1430,12 +1430,12 @@ static int evergreen_cp_start(struct radeon_device *rdev)
         radeon_ring_write(ring, PACKET3_ME_INITIALIZE_DEVICE_ID(1));
         radeon_ring_write(ring, 0);
         radeon_ring_write(ring, 0);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 
         cp_me = 0xff;
         WREG32(CP_ME_CNTL, cp_me);
 
-        r = radeon_ring_lock(rdev, ring, evergreen_default_size + 19);
+        r = radeon_ring_alloc(rdev, ring, evergreen_default_size + 19);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
                 return r;
@@ -1473,7 +1473,7 @@ static int evergreen_cp_start(struct radeon_device *rdev)
         radeon_ring_write(ring, 0x0000000e); /* VGT_VERTEX_REUSE_BLOCK_CNTL */
         radeon_ring_write(ring, 0x00000010); /*  */
 
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 
         return 0;
 }
diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index 2366be3..7e516ce 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -918,7 +918,7 @@ static int cayman_cp_start(struct radeon_device *rdev)
         struct radeon_ring *ring = &rdev->ring[RADEON_RING_TYPE_GFX_INDEX];
         int r, i;
 
-        r = radeon_ring_lock(rdev, ring, 7);
+        r = radeon_ring_alloc(rdev, ring, 7);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
                 return r;
@@ -930,11 +930,11 @@ static int cayman_cp_start(struct radeon_device *rdev)
         radeon_ring_write(ring, PACKET3_ME_INITIALIZE_DEVICE_ID(1));
         radeon_ring_write(ring, 0);
         radeon_ring_write(ring, 0);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 
         cayman_cp_enable(rdev, true);
 
-        r = radeon_ring_lock(rdev, ring, cayman_default_size + 19);
+        r = radeon_ring_alloc(rdev, ring, cayman_default_size + 19);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
                 return r;
@@ -972,7 +972,7 @@ static int cayman_cp_start(struct radeon_device *rdev)
         radeon_ring_write(ring, 0x0000000e); /* VGT_VERTEX_REUSE_BLOCK_CNTL */
         radeon_ring_write(ring, 0x00000010); /*  */
 
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 
         /* XXX init other rings */
 
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 35825bf..3901a68 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -955,7 +955,7 @@ void r100_ring_start(struct radeon_device *rdev, struct radeon_ring *ring)
 {
         int r;
 
-        r = radeon_ring_lock(rdev, ring, 2);
+        r = radeon_ring_alloc(rdev, ring, 2);
         if (r) {
                 return;
         }
@@ -965,7 +965,7 @@ void r100_ring_start(struct radeon_device *rdev, struct radeon_ring *ring)
                           RADEON_ISYNC_ANY3D_IDLE2D |
                           RADEON_ISYNC_WAIT_IDLEGUI |
                           RADEON_ISYNC_CPSCRATCH_IDLEGUI);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 }
 
@@ -3631,7 +3631,7 @@ int r100_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
                 return r;
         }
         WREG32(scratch, 0xCAFEDEAD);
-        r = radeon_ring_lock(rdev, ring, 2);
+        r = radeon_ring_alloc(rdev, ring, 2);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
                 radeon_scratch_free(rdev, scratch);
@@ -3639,7 +3639,7 @@
         }
         radeon_ring_write(ring, PACKET0(scratch, 0));
         radeon_ring_write(ring, 0xDEADBEEF);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
         for (i = 0; i < rdev->usec_timeout; i++) {
                 tmp = RREG32(scratch);
                 if (tmp == 0xDEADBEEF) {
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 97722a3..32d3510 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -229,7 +229,7 @@ void r300_ring_start(struct radeon_device *rdev, struct radeon_ring *ring)
                 break;
         }
 
-        r = radeon_ring_lock(rdev, ring, 64);
+        r = radeon_ring_alloc(rdev, ring, 64);
         if (r) {
                 return;
         }
@@ -293,7 +293,7 @@ void r300_ring_start(struct radeon_device *rdev, struct radeon_ring *ring)
         radeon_ring_write(ring,
                           R300_GEOMETRY_ROUND_NEAREST |
                           R300_COLOR_ROUND_NEAREST);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 }
 
 void r300_errata(struct radeon_device *rdev)
diff --git a/drivers/gpu/drm/radeon/r420.c b/drivers/gpu/drm/radeon/r420.c
index 99137be..b9a4560 100644
--- a/drivers/gpu/drm/radeon/r420.c
+++ b/drivers/gpu/drm/radeon/r420.c
@@ -208,11 +208,11 @@ static void r420_cp_errata_init(struct radeon_device *rdev)
          * of the CP init, apparently.
          */
         radeon_scratch_get(rdev, &rdev->config.r300.resync_scratch);
-        radeon_ring_lock(rdev, ring, 8);
+        radeon_ring_alloc(rdev, ring, 8);
         radeon_ring_write(ring, PACKET0(R300_CP_RESYNC_ADDR, 1));
         radeon_ring_write(ring, rdev->config.r300.resync_scratch);
         radeon_ring_write(ring, 0xDEADBEEF);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 }
 
 static void r420_cp_errata_fini(struct radeon_device *rdev)
@@ -222,10 +222,10 @@ static void r420_cp_errata_fini(struct radeon_device *rdev)
         /* Catch the RESYNC we dispatched all the way back,
          * at the very beginning of the CP init.
          */
-        radeon_ring_lock(rdev, ring, 8);
+        radeon_ring_alloc(rdev, ring, 8);
         radeon_ring_write(ring, PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));
         radeon_ring_write(ring, R300_RB3D_DC_FINISH);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
         radeon_scratch_free(rdev, rdev->config.r300.resync_scratch);
 }
 
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index 43d0c41..fe60bf8 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2066,7 +2066,7 @@ int r600_cp_start(struct radeon_device *rdev)
         int r;
         uint32_t cp_me;
 
-        r = radeon_ring_lock(rdev, ring, 7);
+        r = radeon_ring_alloc(rdev, ring, 7);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
                 return r;
@@ -2083,7 +2083,7 @@ int r600_cp_start(struct radeon_device *rdev)
         radeon_ring_write(ring, PACKET3_ME_INITIALIZE_DEVICE_ID(1));
         radeon_ring_write(ring, 0);
         radeon_ring_write(ring, 0);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 
         cp_me = 0xff;
         WREG32(R_0086D8_CP_ME_CNTL, cp_me);
@@ -2198,7 +2198,7 @@ int r600_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
                 return r;
         }
         WREG32(scratch, 0xCAFEDEAD);
-        r = radeon_ring_lock(rdev, ring, 3);
+        r = radeon_ring_alloc(rdev, ring, 3);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring %d (%d).\n", ridx, r);
                 radeon_scratch_free(rdev, scratch);
@@ -2207,7 +2207,7 @@
         radeon_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
         radeon_ring_write(ring, ((scratch - PACKET3_SET_CONFIG_REG_OFFSET) >> 2));
         radeon_ring_write(ring, 0xDEADBEEF);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
         for (i = 0; i < rdev->usec_timeout; i++) {
                 tmp = RREG32(scratch);
                 if (tmp == 0xDEADBEEF)
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index f654ba8..9df4130 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -991,26 +991,28 @@ int radeon_gpu_reset(struct radeon_device *rdev)
         radeon_save_bios_scratch_regs(rdev);
         /* block TTM */
         resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
-        radeon_suspend(rdev);
 
+        mutex_lock(&rdev->ring_lock);
+        radeon_suspend(rdev);
         r = radeon_asic_reset(rdev);
         if (!r) {
                 dev_info(rdev->dev, "GPU reset succeed\n");
                 radeon_resume(rdev);
-                radeon_restore_bios_scratch_regs(rdev);
-                drm_helper_resume_force_mode(rdev->ddev);
-                ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
         }
+        mutex_unlock(&rdev->ring_lock);
+
+        /* no matter what restore video mode */
+        radeon_restore_bios_scratch_regs(rdev);
+        drm_helper_resume_force_mode(rdev->ddev);
+        ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
 
         if (r) {
                 /* bad news, how to tell it to userspace ? */
                 dev_info(rdev->dev, "GPU reset failed\n");
         }
-
         return r;
 }
 
-
 /*
  * Debugfs
  */
diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c
index 7f08ced..48441ce 100644
--- a/drivers/gpu/drm/radeon/rv515.c
+++ b/drivers/gpu/drm/radeon/rv515.c
@@ -57,7 +57,7 @@ void rv515_ring_start(struct radeon_device *rdev, struct radeon_ring *ring)
 {
         int r;
 
-        r = radeon_ring_lock(rdev, ring, 64);
+        r = radeon_ring_alloc(rdev, ring, 64);
         if (r) {
                 return;
         }
@@ -118,7 +118,7 @@ void rv515_ring_start(struct radeon_device *rdev, struct radeon_ring *ring)
         radeon_ring_write(ring, GEOMETRY_ROUND_NEAREST | COLOR_ROUND_NEAREST);
         radeon_ring_write(ring, PACKET0(0x20C8, 0));
         radeon_ring_write(ring, 0);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 }
 
 int rv515_mc_wait_for_idle(struct radeon_device *rdev)
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
index 34603b3c8..e38a360 100644
--- a/drivers/gpu/drm/radeon/si.c
+++ b/drivers/gpu/drm/radeon/si.c
@@ -1851,7 +1851,7 @@ static int si_cp_start(struct radeon_device *rdev)
         struct radeon_ring *ring = &rdev->ring[RADEON_RING_TYPE_GFX_INDEX];
         int r, i;
 
-        r = radeon_ring_lock(rdev, ring, 7 + 4);
+        r = radeon_ring_alloc(rdev, ring, 7 + 4);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
                 return r;
@@ -1870,11 +1870,11 @@ static int si_cp_start(struct radeon_device *rdev)
         radeon_ring_write(ring, PACKET3_BASE_INDEX(CE_PARTITION_BASE));
         radeon_ring_write(ring, 0xc000);
         radeon_ring_write(ring, 0xe000);
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 
         si_cp_enable(rdev, true);
 
-        r = radeon_ring_lock(rdev, ring, si_default_size + 10);
+        r = radeon_ring_alloc(rdev, ring, si_default_size + 10);
         if (r) {
                 DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
                 return r;
@@ -1899,17 +1899,17 @@ static int si_cp_start(struct radeon_device *rdev)
         radeon_ring_write(ring, 0x0000000e); /* VGT_VERTEX_REUSE_BLOCK_CNTL */
         radeon_ring_write(ring, 0x00000010); /* VGT_OUT_DEALLOC_CNTL */
 
-        radeon_ring_unlock_commit(rdev, ring);
+        radeon_ring_commit(rdev, ring);
 
         for (i = RADEON_RING_TYPE_GFX_INDEX; i <= CAYMAN_RING_TYPE_CP2_INDEX; ++i) {
                 ring = &rdev->ring[i];
-                r = radeon_ring_lock(rdev, ring, 2);
+                r = radeon_ring_alloc(rdev, ring, 2);
 
                 /* clear the compute context state */
                 radeon_ring_write(ring, PACKET3_COMPUTE(PACKET3_CLEAR_STATE, 0));
                 radeon_ring_write(ring, 0);
 
-                radeon_ring_unlock_commit(rdev, ring);
+                radeon_ring_commit(rdev, ring);
         }
 
         return 0;
Instead of returning the error, handle it directly, and while at it fix the comments about the ring lock.
Signed-off-by: Christian König <deathsimple@vodafone.de>
---
 drivers/gpu/drm/radeon/radeon.h       |  2 +-
 drivers/gpu/drm/radeon/radeon_fence.c | 33 +++++++++++++++++++++------------
 2 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 77b4519b..5861ec8 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -239,7 +239,7 @@ void radeon_fence_process(struct radeon_device *rdev, int ring);
 bool radeon_fence_signaled(struct radeon_fence *fence);
 int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
 int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring);
-int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring);
+void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring);
 int radeon_fence_wait_any(struct radeon_device *rdev,
                           struct radeon_fence **fences,
                           bool intr);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 7b55625..be4e4f3 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -440,14 +440,11 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
         return 0;
 }
 
+/* caller must hold ring lock */
 int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
 {
         uint64_t seq;
 
-        /* We are not protected by ring lock when reading current seq but
-         * it's ok as worst case is we return to early while we could have
-         * wait.
-         */
         seq = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
         if (seq >= rdev->fence_drv[ring].sync_seq[ring]) {
                 /* nothing to wait for, last_seq is
@@ -457,15 +454,27 @@ int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
         return radeon_fence_wait_seq(rdev, seq, ring, false, false);
 }
 
-int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
+/* caller must hold ring lock */
+void radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
 {
-        /* We are not protected by ring lock when reading current seq
-         * but it's ok as wait empty is call from place where no more
-         * activity can be scheduled so there won't be concurrent access
-         * to seq value.
-         */
-        return radeon_fence_wait_seq(rdev, rdev->fence_drv[ring].sync_seq[ring],
-                                     ring, false, false);
+        uint64_t seq = rdev->fence_drv[ring].sync_seq[ring];
+
+        while(1) {
+                int r;
+                r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
+                if (r == -EDEADLK) {
+                        mutex_unlock(&rdev->ring_lock);
+                        r = radeon_gpu_reset(rdev);
+                        mutex_lock(&rdev->ring_lock);
+                        if (!r)
+                                continue;
+                }
+                if (r) {
+                        dev_err(rdev->dev, "error waiting for ring to become"
+                                " idle (%d)\n", r);
+                }
+                return;
+        }
 }
 
 struct radeon_fence *radeon_fence_ref(struct radeon_fence *fence)
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Instead of returning the error handle it directly and while at it fix the comments about the ring lock.
Sounds like this should really be two separate changes?
Signed-off-by: Christian König <deathsimple@vodafone.de>
Either way,
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Waiting for a fence can fail for different reasons; the most common is a deadlock.
Signed-off-by: Christian König <deathsimple@vodafone.de>
---
 drivers/gpu/drm/radeon/radeon_gart.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
index 2b34c1a..ee11c50 100644
--- a/drivers/gpu/drm/radeon/radeon_gart.c
+++ b/drivers/gpu/drm/radeon/radeon_gart.c
@@ -316,10 +316,21 @@ static void radeon_vm_unbind_locked(struct radeon_device *rdev,
         }
 
         /* wait for vm use to end */
-        if (vm->fence) {
-                radeon_fence_wait(vm->fence, false);
-                radeon_fence_unref(&vm->fence);
+        while (vm->fence) {
+                int r;
+                r = radeon_fence_wait(vm->fence, false);
+                if (r)
+                        DRM_ERROR("error while waiting for fence: %d\n", r);
+                if (r == -EDEADLK) {
+                        mutex_unlock(&rdev->vm_manager.lock);
+                        r = radeon_gpu_reset(rdev);
+                        mutex_lock(&rdev->vm_manager.lock);
+                        if (!r)
+                                continue;
+                }
+                break;
         }
+        radeon_fence_unref(&vm->fence);
 
         /* hw unbind */
         rdev->vm_manager.funcs->unbind(rdev, vm);
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Waiting for a fence can fail for different reasons, the most common is a deadlock.
Signed-off-by: Christian König <deathsimple@vodafone.de>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Hold the ring lock the whole time the reset is in progress, otherwise another process can submit new jobs.
Sounds good, but doesn't this create other paths (e.g. initialization, resume) where the ring is being accessed without holding the lock? Isn't that a problem?
On 29.06.2012 17:09, Michel Dänzer wrote:
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Hold the ring lock the whole time the reset is in progress, otherwise another process can submit new jobs.
Sounds good, but doesn't this create other paths (e.g. initialization, resume) where the ring is being accessed without holding the lock? Isn't that a problem?
Thought about that also.
For init I'm pretty sure that no application can submit commands before we are done, otherwise we are doomed anyway.
For resume I'm not really sure, but I think that applications are resumed after the hardware driver had a chance of doing so.
Christian.
On Fre, 2012-06-29 at 17:18 +0200, Christian König wrote:
On 29.06.2012 17:09, Michel Dänzer wrote:
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Hold the ring lock the whole time the reset is in progress, otherwise another process can submit new jobs.
Sounds good, but doesn't this create other paths (e.g. initialization, resume) where the ring is being accessed without holding the lock? Isn't that a problem?
Thought about that also.
For init I'm pretty sure that no application can submit commands before we are done, otherwise we are doomed anyway.
For resume I'm not really sure, but I think that applications are resumed after the hardware driver had a chance of doing so.
I hope you're right... but if it's not too much trouble, it might be better to be safe than sorry and take the lock for those paths as well.
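For resume, that could be as little as wrapping the asic resume in the ring lock, e.g. (untested sketch only, the helper below is made up to show where the lock would go, it is not part of the patch):

static void radeon_resume_locked(struct radeon_device *rdev)
{
        /* keep CS submission away from the rings while they are re-initialized */
        mutex_lock(&rdev->ring_lock);
        radeon_resume(rdev);
        mutex_unlock(&rdev->ring_lock);
}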
On Fri, Jun 29, 2012 at 12:15 PM, Michel Dänzer michel@daenzer.net wrote:
On Fre, 2012-06-29 at 17:18 +0200, Christian König wrote:
On 29.06.2012 17:09, Michel Dänzer wrote:
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Hold the ring lock the whole time the reset is in progress, otherwise another process can submit new jobs.
Sounds good, but doesn't this create other paths (e.g. initialization, resume) where the ring is being accessed without holding the lock? Isn't that a problem?
Thought about that also.
For init I'm pretty sure that no application can submit commands before we are done, otherwise we are doomed anyway.
For resume I'm not really sure, but I think that applications are resumed after the hardware driver had a chance of doing so.
I hope you're right... but if it's not too much trouble, it might be better to be safe than sorry and take the lock for those paths as well.
NAK, this is the wrong way to solve the issue; we need a global lock on all paths that can trigger gpu activities. Previously it was the cs mutex, but i haven't thought about it too much when it got removed. So to fix the situation i am sending a patch with a rw semaphore.
Cheers, Jerome
On 02.07.2012 17:41, Jerome Glisse wrote:
On Fri, Jun 29, 2012 at 12:15 PM, Michel Dänzer michel@daenzer.net wrote:
On Fre, 2012-06-29 at 17:18 +0200, Christian König wrote:
On 29.06.2012 17:09, Michel Dänzer wrote:
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Hold the ring lock the whole time the reset is in progress, otherwise another process can submit new jobs.
Sounds good, but doesn't this create other paths (e.g. initialization, resume) where the ring is being accessed without holding the lock? Isn't that a problem?
Thought about that also.
For init I'm pretty sure that no application can submit commands before we are done, otherwise we are doomed anyway.
For resume I'm not really sure, but I think that applications are resumed after the hardware driver had a chance of doing so.
I hope you're right... but if it's not too much trouble, it might be better to be safe than sorry and take the lock for those paths as well.
NAK this is the wrong way to solve the issue, we need a global lock on all path that can trigger gpu activities. Previously it was the cs mutex, but i haven't thought about it too much when it got removed. So to fix the situation i am sending a patch with rw semaphore.
So what am I missing? What else can trigger GPU activity if not the rings?
I'm currently working on ring-partial resets and also resets where you only skip over a single faulty IB instead of flushing the whole ring. And my current idea for that to work is that we hold the ring lock while we do suspend, ring_save, asic_reset, resume and ring_restore.
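Roughly like this (just a sketch of the idea; radeon_ring_save()/radeon_ring_restore() don't exist yet, the names are placeholders for the helpers I'm still working on):

static int radeon_gpu_soft_reset(struct radeon_device *rdev, struct radeon_ring *ring)
{
        unsigned size;
        uint32_t *data;
        int r;

        /* nobody may submit new jobs while we fiddle with the ring */
        mutex_lock(&rdev->ring_lock);
        radeon_suspend(rdev);
        size = radeon_ring_save(rdev, ring, &data);

        r = radeon_asic_reset(rdev);

        radeon_resume(rdev);
        if (!r)
                radeon_ring_restore(rdev, ring, size, data);

        kfree(data);
        mutex_unlock(&rdev->ring_lock);
        return r;
}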
Christian.
Cheers, Jerome
On Mon, Jul 2, 2012 at 11:58 AM, Christian König deathsimple@vodafone.de wrote:
On 02.07.2012 17:41, Jerome Glisse wrote:
On Fri, Jun 29, 2012 at 12:15 PM, Michel Dänzer michel@daenzer.net wrote:
On Fre, 2012-06-29 at 17:18 +0200, Christian König wrote:
On 29.06.2012 17:09, Michel Dänzer wrote:
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Hold the ring lock the whole time the reset is in progress, otherwise another process can submit new jobs.
Sounds good, but doesn't this create other paths (e.g. initialization, resume) where the ring is being accessed without holding the lock? Isn't that a problem?
Thought about that also.
For init I'm pretty sure that no application can submit commands before we are done, otherwise we are doomed anyway.
For resume I'm not really sure, but I think that applications are resumed after the hardware driver had a chance of doing so.
I hope you're right... but if it's not too much trouble, it might be better to be safe than sorry and take the lock for those paths as well.
NAK this is the wrong way to solve the issue, we need a global lock on all path that can trigger gpu activities. Previously it was the cs mutex, but i haven't thought about it too much when it got removed. So to fix the situation i am sending a patch with rw semaphore.
So what I'm missing? What else can trigger GPU activity when not the rings?
I'm currently working on ring-partial resets and also resets where you only skip over a single faulty IB instead of flushing the whole ring. And my current idea for that to work is that we hold the ring lock while we do suspend, ring_save, asic_reset, resume and ring_restore.
Christian.
I just sent a patch; the idea is that you want gpu reset to be an exclusive operation, like gpu init or gpu resume. So by taking a rw semaphore you allow the gpu reset to be exclusive, and you know nobody can trigger gpu activities, while still allowing concurrency in case no gpu reset is ongoing.
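To make the pattern concrete (rough sketch only; field and function names are just examples, not necessarily what the actual patch uses; assume struct radeon_device grows a "struct rw_semaphore exclusive_lock;" initialized with init_rwsem()):

/* every path that can trigger gpu activity takes it shared, so normal
 * operation stays fully concurrent */
static int example_cs_submit(struct radeon_device *rdev)
{
        int r = 0;

        down_read(&rdev->exclusive_lock);
        /* parse and submit the command stream here */
        up_read(&rdev->exclusive_lock);
        return r;
}

/* gpu reset takes it exclusive, so it cannot race with cs submission,
 * init or resume */
static int example_gpu_reset(struct radeon_device *rdev)
{
        int r;

        down_write(&rdev->exclusive_lock);
        /* suspend, asic reset, resume, ring restart go here */
        r = radeon_asic_reset(rdev);
        up_write(&rdev->exclusive_lock);
        return r;
}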
Cheers, Jerome
On Mon, Jul 2, 2012 at 11:58 AM, Christian König deathsimple@vodafone.de wrote:
On 02.07.2012 17:41, Jerome Glisse wrote:
On Fri, Jun 29, 2012 at 12:15 PM, Michel Dänzer michel@daenzer.net wrote:
On Fre, 2012-06-29 at 17:18 +0200, Christian König wrote:
On 29.06.2012 17:09, Michel Dänzer wrote:
On Fre, 2012-06-29 at 16:45 +0200, Christian König wrote:
Hold the ring lock the whole time the reset is in progress, otherwise another process can submit new jobs.
Sounds good, but doesn't this create other paths (e.g. initialization, resume) where the ring is being accessed without holding the lock? Isn't that a problem?
Thought about that also.
For init I'm pretty sure that no application can submit commands before we are done, otherwise we are doomed anyway.
For resume I'm not really sure, but I think that applications are resumed after the hardware driver had a chance of doing so.
I hope you're right... but if it's not too much trouble, it might be better to be safe than sorry and take the lock for those paths as well.
NAK this is the wrong way to solve the issue, we need a global lock on all path that can trigger gpu activities. Previously it was the cs mutex, but i haven't thought about it too much when it got removed. So to fix the situation i am sending a patch with rw semaphore.
So what I'm missing? What else can trigger GPU activity when not the rings?
I'm currently working on ring-partial resets and also resets where you only skip over a single faulty IB instead of flushing the whole ring. And my current idea for that to work is that we hold the ring lock while we do suspend, ring_save, asic_reset, resume and ring_restore.
Christian.
I should add that gpu_reset should be a heavy reset, and if you want to only reset one ring and see if the gpu can continue without a heavy reset then you should do it as a special ring reset that doesn't reset mc and some other blocks but only the affected ring (and i am assuming that the hw behaves properly here). If that light weight ring reset doesn't work then let the heavy reset kick in. So yes, your light weight per ring reset would only need to take the ring lock, with no need to change the ring lock usage we have now.
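In other words something like this (sketch only, radeon_ring_soft_reset() is a made up name for the light weight per ring reset described above):

static int example_handle_lockup(struct radeon_device *rdev,
                                 struct radeon_ring *ring)
{
        int r;

        /* first try to reset only the stuck ring, without touching MC & friends */
        r = radeon_ring_soft_reset(rdev, ring);
        if (!r)
                return 0;

        /* the light weight reset didn't help, let the heavy reset kick in */
        return radeon_gpu_reset(rdev);
}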
Cheers, Jerome