Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
having a delayed work item per job is redundant as we only need one per scheduler to track the time out the currently executing job.
Well that looks simpler than I thought it would be.
But it shows the next problem that the timeout and the completion could race.
As far as I can see that can be fixed by moving the dma_fence_remove_callback()/dma_fence_add_callback() dance from drm_sched_hw_job_reset() to drm_sched_job_timedout().
Anyway, I would say drop patch #1 and fix the one comment below and we can use this.
Signed-off-by: Nayan Deshmukh nayan26deshmukh@gmail.com Suggested-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++------- include/drm/gpu_scheduler.h | 6 +++--- 2 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 0e6ccc8243db..f213b5c7f718 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct *work) * manages to find this job as the next job in the list, the fence * signaled check below will prevent the timeout to be restarted. */
- cancel_delayed_work_sync(&s_job->work_tdr);
cancel_delayed_work_sync(&sched->work_tdr);
spin_lock(&sched->job_list_lock); /* queue TDR for next job */
@@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work) if (sched->timeout != MAX_SCHEDULE_TIMEOUT && !list_is_last(&s_job->node, &sched->ring_mirror_list)) { if (!dma_fence_is_signaled(&next->s_fence->finished))
Since we now have only one delayed work item we can just drop the test if next is already signaled.
Regards, Christian.
schedule_delayed_work(&next->work_tdr, sched->timeout);
} /* remove job from ring_mirror_list */ list_del(&s_job->node);schedule_delayed_work(&sched->work_tdr, sched->timeout);
@@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job) if (list_first_entry_or_null(&sched->ring_mirror_list, struct drm_sched_job, node) == s_job) { if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
schedule_delayed_work(&s_job->work_tdr, sched->timeout);
sched->curr_job = s_job; } spin_unlock(&sched->job_list_lock);schedule_delayed_work(&sched->work_tdr, sched->timeout);
@@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
static void drm_sched_job_timedout(struct work_struct *work) {
- struct drm_sched_job *job = container_of(work, struct drm_sched_job,
work_tdr.work);
struct drm_gpu_scheduler *sched = container_of(work,
struct drm_gpu_scheduler,
work_tdr.work);
struct drm_sched_job *job = sched->curr_job;
job->sched->ops->timedout_job(job); }
@@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched) s_job = list_first_entry_or_null(&sched->ring_mirror_list, struct drm_sched_job, node); if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
schedule_delayed_work(&s_job->work_tdr, sched->timeout);
if (s_job) sched->curr_job = s_job;schedule_delayed_work(&sched->work_tdr, sched->timeout);
@@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
INIT_WORK(&job->finish_work, drm_sched_job_finish); INIT_LIST_HEAD(&job->node);
INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
return 0; }
@@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, INIT_LIST_HEAD(&sched->ring_mirror_list); spin_lock_init(&sched->job_list_lock); atomic_set(&sched->hw_rq_count, 0);
- INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout); atomic_set(&sched->num_jobs, 0); atomic64_set(&sched->job_id_count, 0);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 07e776b1ca42..9d50d7f3eaa4 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
finished to remove the job from the
@drm_gpu_scheduler.ring_mirror_list.
- @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
- @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout
interval is over.
- @id: a unique id assigned to each job scheduled on the scheduler.
- @karma: increment on every hang caused by this job. If this exceeds the hang
limit of the scheduler then the job is marked guilty and will not
@@ -195,7 +193,6 @@ struct drm_sched_job { struct dma_fence_cb finish_cb; struct work_struct finish_work; struct list_head node;
- struct delayed_work work_tdr; uint64_t id; atomic_t karma; enum drm_sched_priority s_priority;
@@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
finished.
- @hw_rq_count: the number of jobs currently in the hardware queue.
- @job_id_count: used to assign unique id to the each job.
- @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
timeout interval is over.
- @thread: the kthread on which the scheduler which run.
- @ring_mirror_list: the list of jobs which are currently in the job queue.
- @job_list_lock: lock to protect the ring_mirror_list.
@@ -280,6 +279,7 @@ struct drm_gpu_scheduler { wait_queue_head_t job_scheduled; atomic_t hw_rq_count; atomic64_t job_id_count;
- struct delayed_work work_tdr; struct task_struct *thread; struct list_head ring_mirror_list; spinlock_t job_list_lock;