+ dri-devel
Since the scheduler is a shared component, please add dri-devel to all scheduler patches.
On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen Jingwen.Chen2@amd.com wrote:
[Why] For a bailing job, this commit deletes it from the pending list, so the bailing job never has a chance to be resubmitted, even in advanced TDR mode.
[How] Once hw_fence has been embedded into amdgpu_job, the race condition that this commit tries to work around is completely solved. So revert this commit. This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90. v2: add dma_fence_get/put() around timedout_job to avoid a concurrent delete while timedout_job is being processed.
Signed-off-by: Jingwen Chen Jingwen.Chen2@amd.com
drivers/gpu/drm/scheduler/sched_main.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index a2a953693b45..f9b9b3aefc4a 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work) { struct drm_gpu_scheduler *sched; struct drm_sched_job *job;
struct dma_fence *fence; enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL; sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
@@ -325,11 +326,10 @@ static void drm_sched_job_timedout(struct work_struct *work)
if (job) { /*
* Remove the bad job so it cannot be freed by concurrent
* drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
* is parked at which point it's safe.
* Get job->s_fence->parent here to avoid concurrent delete during
* processing timedout_job */
list_del_init(&job->list);
fence = dma_fence_get(job->s_fence->parent); spin_unlock(&sched->job_list_lock); status = job->sched->ops->timedout_job(job);
@@ -342,6 +342,7 @@ static void drm_sched_job_timedout(struct work_struct *work) job->sched->ops->free_job(job); sched->free_guilty = false; }
dma_fence_put(fence); } else { spin_unlock(&sched->job_list_lock); }
@@ -392,20 +393,6 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
kthread_park(sched->thread);
/*
* Reinsert back the bad job here - now it's safe as
* drm_sched_get_cleanup_job cannot race against us and release the
* bad job at this point - we parked (waited for) any in progress
* (earlier) cleanups and drm_sched_get_cleanup_job will not be called
* now until the scheduler thread is unparked.
*/
if (bad && bad->sched == sched)
/*
* Add at the head of the queue to reflect it was the earliest
* job extracted.
*/
list_add(&bad->list, &sched->pending_list);
/* * Iterate the job list from later to earlier one and either deactive * their HW callbacks or remove them from pending list if they already
-- 2.25.1