Am 21.06.22 um 00:03 schrieb Andrey Grodzovsky:
Align refcount behaviour for amdgpu_job embedded HW fence with classic pointer style HW fences by increasing refcount each time emit is called so amdgpu code doesn't need to make workarounds using amdgpu_job.job_run_counter to keep the HW fence refcount balanced.
Could we now also remove job_run_counter?
Christian.
Also since in the previous patch we resumed setting s_fence->parent to NULL in drm_sched_stop switch to directly checking if job->hw_fence is signaled to short circuit reset if already signed.
Signed-off-by: Andrey Grodzovsky andrey.grodzovsky@amd.com Tested-by: Yiqing Yao yiqing.yao@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 ++++++++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 7 ++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 4 ---- 4 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 513c57f839d8..447bd92c4856 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -684,6 +684,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev, goto err_ib_sched; }
/* Drop the initial kref_init count (see drm_sched_main as example) */
dma_fence_put(f); ret = dma_fence_wait(f, false);
err_ib_sched:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c99541685804..f9718119834f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5009,16 +5009,28 @@ static void amdgpu_device_recheck_guilty_jobs(
/* clear job's guilty and depend the folowing step to decide the real one */ drm_sched_reset_karma(s_job);
/* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
* to make sure fence is balanced */
drm_sched_resubmit_jobs_ext(&ring->sched, 1);dma_fence_get(s_job->s_fence->parent);
if (!s_job->s_fence->parent) {
DRM_WARN("Failed to get a HW fence for job!");
continue;
}
ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); if (ret == 0) { /* timeout */ DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", ring->sched.name, s_job->id);
/* Clear this failed job from fence array */
amdgpu_fence_driver_clear_job_fences(ring);
/* Since the job won't signal and we go for
* another resubmit drop this parent pointer
*/
dma_fence_put(s_job->s_fence->parent);
s_job->s_fence->parent = NULL;
/* set guilty */ drm_sched_increase_karma(s_job);
retry:
@@ -5047,7 +5059,6 @@ static void amdgpu_device_recheck_guilty_jobs(
/* got the hw fence, signal finished fence */ atomic_dec(ring->sched.score);
dma_fence_get(&s_job->s_fence->finished); dma_fence_signal(&s_job->s_fence->finished); dma_fence_put(&s_job->s_fence->finished);dma_fence_put(s_job->s_fence->parent);
@@ -5220,8 +5231,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * * job->base holds a reference to parent fence */
- if (job && job->base.s_fence->parent &&
dma_fence_is_signaled(job->base.s_fence->parent)) {
- if (job && (job->hw_fence.ops != NULL) &&
job_signaled = true; dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); goto skip_hw_reset;dma_fence_is_signaled(&job->hw_fence)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index d6d54ba4c185..9bd4e18212fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -164,11 +164,16 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd if (job && job->job_run_counter) { /* reinit seq for resubmitted jobs */ fence->seqno = seq;
/* TO be inline with external fence creation and other drivers */
} else {dma_fence_get(fence);
if (job)
if (job) { dma_fence_init(fence, &amdgpu_job_fence_ops, &ring->fence_drv.lock, adev->fence_context + ring->idx, seq);
/* Against remove in amdgpu_job_{free, free_cb} */
dma_fence_get(fence);
else dma_fence_init(fence, &amdgpu_fence_ops, &ring->fence_drv.lock,}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 58568fdde2d0..638e1d600258 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -267,10 +267,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) DRM_ERROR("Error scheduling IBs (%d)\n", r); }
- if (!job->job_run_counter)
dma_fence_get(fence);
- else if (finished->error < 0)
job->job_run_counter++; amdgpu_job_free_resources(job);dma_fence_put(&job->hw_fence);