Hi guys,
This series of patches implements a pending list for jobs which are in the hardware, and a done list for tasks which are done and need to be freed.
As tasks complete and call their DRM callback, their fences are signalled, the tasks are added to the done list, and the main scheduler thread is woken up. The main scheduler thread then frees them.
When a task times out, the timeout callback now returns a status value back to DRM. The reason for this is that the GPU driver has intimate knowledge of the hardware and can tell DRM what to do: whether to attempt to abort the task (by, say, calling a driver abort function, as the implementation dictates), or whether the task needs more time. Note that the task is not moved away from the pending list unless it is no longer in the GPU. (The pending list holds tasks which are pending from DRM's point of view, i.e. tasks which the GPU has control over--for instance, DMA is active or CUs are active for the task.)
What DRM really wants to know is whether the task is in the GPU or not. drm_sched_backend_ops::timedout_job() therefore now returns DRM_TASK_STATUS_COMPLETE if the task is no longer with the GPU, or DRM_TASK_STATUS_ALIVE if the task needs more time.
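To illustrate the new contract, here is a minimal, hypothetical sketch of a driver's timedout_job callback. The my_*() helpers, struct my_job and its done_fence field are made up for illustration and are not part of this series:

static enum drm_task_status my_timedout_job(struct drm_sched_job *sched_job)
{
	struct my_job *job = to_my_job(sched_job); /* hypothetical wrapper */

	/* The job raced with the timeout and already finished:
	 * it is no longer in the GPU, so DRM may free it.
	 */
	if (dma_fence_is_signaled(job->done_fence))
		return DRM_TASK_STATUS_COMPLETE;

	/* The engine is still making progress; ask DRM for more time. */
	if (my_hw_engine_is_alive(job))
		return DRM_TASK_STATUS_ALIVE;

	/* Otherwise abort the job in hardware; once it is out of the GPU,
	 * report it as complete so DRM can move it to the done list.
	 */
	my_hw_abort_job(job);
	return DRM_TASK_STATUS_COMPLETE;
}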
This series applies to drm-misc-next at 0a260e731d6c.
Tested and working, but I get a lot of WARN_ON(bo->pin_count) warnings from ttm_bo_release() for the VCN ring of amdgpu.
Cc: Alexander Deucher Alexander.Deucher@amd.com
Cc: Andrey Grodzovsky Andrey.Grodzovsky@amd.com
Cc: Christian König christian.koenig@amd.com
Cc: Daniel Vetter daniel.vetter@ffwll.ch
Luben Tuikov (5):
  drm/scheduler: "node" --> "list"
  gpu/drm: ring_mirror_list --> pending_list
  drm/scheduler: Essentialize the job done callback
  drm/scheduler: Job timeout handler returns status (v2)
  drm/sched: Make use of a "done" list (v2)
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     |   8 +-
 drivers/gpu/drm/etnaviv/etnaviv_sched.c     |  10 +-
 drivers/gpu/drm/lima/lima_sched.c           |   4 +-
 drivers/gpu/drm/panfrost/panfrost_job.c     |   9 +-
 drivers/gpu/drm/scheduler/sched_main.c      | 345 +++++++++++---------
 drivers/gpu/drm/v3d/v3d_sched.c             |  32 +-
 include/drm/gpu_scheduler.h                 |  38 ++-
 9 files changed, 255 insertions(+), 201 deletions(-)
Rename "node" to "list" in struct drm_sched_job, in order to make it consistent with what we see being used throughout gpu_scheduler.h, for instance in struct drm_sched_entity, as well as the rest of DRM and the kernel.
Signed-off-by: Luben Tuikov luben.tuikov@amd.com
Reviewed-by: Christian König christian.koenig@amd.com
Cc: Alexander Deucher Alexander.Deucher@amd.com
Cc: Andrey Grodzovsky Andrey.Grodzovsky@amd.com
Cc: Christian König christian.koenig@amd.com
Cc: Daniel Vetter daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     |  2 +-
 drivers/gpu/drm/scheduler/sched_main.c      | 23 +++++++++++----------
 include/drm/gpu_scheduler.h                 |  4 ++--
 5 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 5c1f3725c741..8358cae0b5a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1427,7 +1427,7 @@ static void amdgpu_ib_preempt_job_recovery(struct drm_gpu_scheduler *sched) struct dma_fence *fence;
spin_lock(&sched->job_list_lock); - list_for_each_entry(s_job, &sched->ring_mirror_list, node) { + list_for_each_entry(s_job, &sched->ring_mirror_list, list) { fence = sched->ops->run_job(s_job); dma_fence_put(fence); } @@ -1459,10 +1459,10 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
no_preempt: spin_lock(&sched->job_list_lock); - list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { if (dma_fence_is_signaled(&s_job->s_fence->finished)) { /* remove job from ring_mirror_list */ - list_del_init(&s_job->node); + list_del_init(&s_job->list); sched->ops->free_job(s_job); continue; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7560b05e4ac1..4df6de81cd41 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4128,7 +4128,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
spin_lock(&ring->sched.job_list_lock); job = list_first_entry_or_null(&ring->sched.ring_mirror_list, - struct drm_sched_job, node); + struct drm_sched_job, list); spin_unlock(&ring->sched.job_list_lock); if (job) return true; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index dcfe8a3b03ff..aca52a46b93d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -271,7 +271,7 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) }
/* Signal all jobs already scheduled to HW */ - list_for_each_entry(s_job, &sched->ring_mirror_list, node) { + list_for_each_entry(s_job, &sched->ring_mirror_list, list) { struct drm_sched_fence *s_fence = s_job->s_fence;
dma_fence_set_error(&s_fence->finished, -EHWPOISON); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index c6332d75025e..c52eba407ebd 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -272,7 +272,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job) struct drm_gpu_scheduler *sched = s_job->sched;
spin_lock(&sched->job_list_lock); - list_add_tail(&s_job->node, &sched->ring_mirror_list); + list_add_tail(&s_job->list, &sched->ring_mirror_list); drm_sched_start_timeout(sched); spin_unlock(&sched->job_list_lock); } @@ -287,7 +287,7 @@ static void drm_sched_job_timedout(struct work_struct *work) /* Protects against concurrent deletion in drm_sched_get_cleanup_job */ spin_lock(&sched->job_list_lock); job = list_first_entry_or_null(&sched->ring_mirror_list, - struct drm_sched_job, node); + struct drm_sched_job, list);
if (job) { /* @@ -295,7 +295,7 @@ static void drm_sched_job_timedout(struct work_struct *work) * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread * is parked at which point it's safe. */ - list_del_init(&job->node); + list_del_init(&job->list); spin_unlock(&sched->job_list_lock);
job->sched->ops->timedout_job(job); @@ -392,7 +392,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) * Add at the head of the queue to reflect it was the earliest * job extracted. */ - list_add(&bad->node, &sched->ring_mirror_list); + list_add(&bad->list, &sched->ring_mirror_list);
/* * Iterate the job list from later to earlier one and either deactive @@ -400,7 +400,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) * signaled. * This iteration is thread safe as sched thread is stopped. */ - list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, node) { + list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, + list) { if (s_job->s_fence->parent && dma_fence_remove_callback(s_job->s_fence->parent, &s_job->cb)) { @@ -411,7 +412,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) * Locking here is for concurrent resume timeout */ spin_lock(&sched->job_list_lock); - list_del_init(&s_job->node); + list_del_init(&s_job->list); spin_unlock(&sched->job_list_lock);
/* @@ -462,7 +463,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) * so no new jobs are being inserted or removed. Also concurrent * GPU recovers can't run in parallel. */ - list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { struct dma_fence *fence = s_job->s_fence->parent;
atomic_inc(&sched->hw_rq_count); @@ -505,7 +506,7 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) bool found_guilty = false; struct dma_fence *fence;
- list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { + list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { struct drm_sched_fence *s_fence = s_job->s_fence;
if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { @@ -565,7 +566,7 @@ int drm_sched_job_init(struct drm_sched_job *job, return -ENOMEM; job->id = atomic64_inc_return(&sched->job_id_count);
- INIT_LIST_HEAD(&job->node); + INIT_LIST_HEAD(&job->list);
return 0; } @@ -684,11 +685,11 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) spin_lock(&sched->job_list_lock);
job = list_first_entry_or_null(&sched->ring_mirror_list, - struct drm_sched_job, node); + struct drm_sched_job, list);
if (job && dma_fence_is_signaled(&job->s_fence->finished)) { /* remove job from ring_mirror_list */ - list_del_init(&job->node); + list_del_init(&job->list); } else { job = NULL; /* queue timeout for next job */ diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 92436553fd6a..3add0072bd37 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -189,14 +189,14 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); */ struct drm_sched_job { struct spsc_node queue_node; + struct list_head list; struct drm_gpu_scheduler *sched; struct drm_sched_fence *s_fence; struct dma_fence_cb finish_cb; - struct list_head node; uint64_t id; atomic_t karma; enum drm_sched_priority s_priority; - struct drm_sched_entity *entity; + struct drm_sched_entity *entity; struct dma_fence_cb cb; };
Rename "ring_mirror_list" to "pending_list", to describe what something is, not what it does, how it's used, or how the hardware implements it.
This also abstracts the actual hardware implementation: how the low-level driver communicates with the device it drives (ring, CAM, etc.) shouldn't be exposed to DRM.
The pending_list holds submitted jobs which are out of our control. Usually this means they are pending execution in hardware, but "out of our control" is the more general (inclusive) definition.
Signed-off-by: Luben Tuikov luben.tuikov@amd.com
Acked-by: Christian König christian.koenig@amd.com
Cc: Alexander Deucher Alexander.Deucher@amd.com
Cc: Andrey Grodzovsky Andrey.Grodzovsky@amd.com
Cc: Christian König christian.koenig@amd.com
Cc: Daniel Vetter daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     |  2 +-
 drivers/gpu/drm/scheduler/sched_main.c      | 34 ++++++++++-----------
 include/drm/gpu_scheduler.h                 | 10 +++---
 5 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 8358cae0b5a4..db77a5bdfa45 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1427,7 +1427,7 @@ static void amdgpu_ib_preempt_job_recovery(struct drm_gpu_scheduler *sched) struct dma_fence *fence;
spin_lock(&sched->job_list_lock); - list_for_each_entry(s_job, &sched->ring_mirror_list, list) { + list_for_each_entry(s_job, &sched->pending_list, list) { fence = sched->ops->run_job(s_job); dma_fence_put(fence); } @@ -1459,7 +1459,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
no_preempt: spin_lock(&sched->job_list_lock); - list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { + list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { if (dma_fence_is_signaled(&s_job->s_fence->finished)) { /* remove job from ring_mirror_list */ list_del_init(&s_job->list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4df6de81cd41..fbae600aa5f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4127,8 +4127,8 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) continue;
spin_lock(&ring->sched.job_list_lock); - job = list_first_entry_or_null(&ring->sched.ring_mirror_list, - struct drm_sched_job, list); + job = list_first_entry_or_null(&ring->sched.pending_list, + struct drm_sched_job, list); spin_unlock(&ring->sched.job_list_lock); if (job) return true; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index aca52a46b93d..ff48101bab55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -271,7 +271,7 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) }
/* Signal all jobs already scheduled to HW */ - list_for_each_entry(s_job, &sched->ring_mirror_list, list) { + list_for_each_entry(s_job, &sched->pending_list, list) { struct drm_sched_fence *s_fence = s_job->s_fence;
dma_fence_set_error(&s_fence->finished, -EHWPOISON); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index c52eba407ebd..b694df12aaba 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -198,7 +198,7 @@ EXPORT_SYMBOL(drm_sched_dependency_optimized); static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched) { if (sched->timeout != MAX_SCHEDULE_TIMEOUT && - !list_empty(&sched->ring_mirror_list)) + !list_empty(&sched->pending_list)) schedule_delayed_work(&sched->work_tdr, sched->timeout); }
@@ -258,7 +258,7 @@ void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, { spin_lock(&sched->job_list_lock);
- if (list_empty(&sched->ring_mirror_list)) + if (list_empty(&sched->pending_list)) cancel_delayed_work(&sched->work_tdr); else mod_delayed_work(system_wq, &sched->work_tdr, remaining); @@ -272,7 +272,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job) struct drm_gpu_scheduler *sched = s_job->sched;
spin_lock(&sched->job_list_lock); - list_add_tail(&s_job->list, &sched->ring_mirror_list); + list_add_tail(&s_job->list, &sched->pending_list); drm_sched_start_timeout(sched); spin_unlock(&sched->job_list_lock); } @@ -286,7 +286,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
/* Protects against concurrent deletion in drm_sched_get_cleanup_job */ spin_lock(&sched->job_list_lock); - job = list_first_entry_or_null(&sched->ring_mirror_list, + job = list_first_entry_or_null(&sched->pending_list, struct drm_sched_job, list);
if (job) { @@ -371,7 +371,7 @@ EXPORT_SYMBOL(drm_sched_increase_karma); * Stop the scheduler and also removes and frees all completed jobs. * Note: bad job will not be freed as it might be used later and so it's * callers responsibility to release it manually if it's not part of the - * mirror list any more. + * pending list any more. * */ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) @@ -392,15 +392,15 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) * Add at the head of the queue to reflect it was the earliest * job extracted. */ - list_add(&bad->list, &sched->ring_mirror_list); + list_add(&bad->list, &sched->pending_list);
/* * Iterate the job list from later to earlier one and either deactive - * their HW callbacks or remove them from mirror list if they already + * their HW callbacks or remove them from pending list if they already * signaled. * This iteration is thread safe as sched thread is stopped. */ - list_for_each_entry_safe_reverse(s_job, tmp, &sched->ring_mirror_list, + list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list, list) { if (s_job->s_fence->parent && dma_fence_remove_callback(s_job->s_fence->parent, @@ -408,7 +408,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) atomic_dec(&sched->hw_rq_count); } else { /* - * remove job from ring_mirror_list. + * remove job from pending_list. * Locking here is for concurrent resume timeout */ spin_lock(&sched->job_list_lock); @@ -463,7 +463,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) * so no new jobs are being inserted or removed. Also concurrent * GPU recovers can't run in parallel. */ - list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { + list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { struct dma_fence *fence = s_job->s_fence->parent;
atomic_inc(&sched->hw_rq_count); @@ -494,7 +494,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) EXPORT_SYMBOL(drm_sched_start);
/** - * drm_sched_resubmit_jobs - helper to relunch job from mirror ring list + * drm_sched_resubmit_jobs - helper to relunch job from pending ring list * * @sched: scheduler instance * @@ -506,7 +506,7 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) bool found_guilty = false; struct dma_fence *fence;
- list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, list) { + list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { struct drm_sched_fence *s_fence = s_job->s_fence;
if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { @@ -665,7 +665,7 @@ static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb) * * @sched: scheduler instance * - * Returns the next finished job from the mirror list (if there is one) + * Returns the next finished job from the pending list (if there is one) * ready for it to be destroyed. */ static struct drm_sched_job * @@ -675,7 +675,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
/* * Don't destroy jobs while the timeout worker is running OR thread - * is being parked and hence assumed to not touch ring_mirror_list + * is being parked and hence assumed to not touch pending_list */ if ((sched->timeout != MAX_SCHEDULE_TIMEOUT && !cancel_delayed_work(&sched->work_tdr)) || @@ -684,11 +684,11 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
spin_lock(&sched->job_list_lock);
- job = list_first_entry_or_null(&sched->ring_mirror_list, + job = list_first_entry_or_null(&sched->pending_list, struct drm_sched_job, list);
if (job && dma_fence_is_signaled(&job->s_fence->finished)) { - /* remove job from ring_mirror_list */ + /* remove job from pending_list */ list_del_init(&job->list); } else { job = NULL; @@ -858,7 +858,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
init_waitqueue_head(&sched->wake_up_worker); init_waitqueue_head(&sched->job_scheduled); - INIT_LIST_HEAD(&sched->ring_mirror_list); + INIT_LIST_HEAD(&sched->pending_list); spin_lock_init(&sched->job_list_lock); atomic_set(&sched->hw_rq_count, 0); INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 3add0072bd37..2e0c368e19f6 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -174,7 +174,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); * @sched: the scheduler instance on which this job is scheduled. * @s_fence: contains the fences for the scheduling of job. * @finish_cb: the callback for the finished fence. - * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list. + * @node: used to append this struct to the @drm_gpu_scheduler.pending_list. * @id: a unique id assigned to each job scheduled on the scheduler. * @karma: increment on every hang caused by this job. If this exceeds the hang * limit of the scheduler then the job is marked guilty and will not @@ -203,7 +203,7 @@ struct drm_sched_job { static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, int threshold) { - return (s_job && atomic_inc_return(&s_job->karma) > threshold); + return s_job && atomic_inc_return(&s_job->karma) > threshold; }
/** @@ -260,8 +260,8 @@ struct drm_sched_backend_ops { * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the * timeout interval is over. * @thread: the kthread on which the scheduler which run. - * @ring_mirror_list: the list of jobs which are currently in the job queue. - * @job_list_lock: lock to protect the ring_mirror_list. + * @pending_list: the list of jobs which are currently in the job queue. + * @job_list_lock: lock to protect the pending_list. * @hang_limit: once the hangs by a job crosses this limit then it is marked * guilty and it will be considered for scheduling further. * @score: score to help loadbalancer pick a idle sched @@ -282,7 +282,7 @@ struct drm_gpu_scheduler { atomic64_t job_id_count; struct delayed_work work_tdr; struct task_struct *thread; - struct list_head ring_mirror_list; + struct list_head pending_list; spinlock_t job_list_lock; int hang_limit; atomic_t score;
The job done callback is called from various places, in two roles: as a job-done handler, and as a fence callback.
Essentialize the callback into an atomic function which just completes the job, and a second function with the fence-callback prototype which calls the first to complete the job.
This is used by the completion code in later patches.
Signed-off-by: Luben Tuikov luben.tuikov@amd.com
Reviewed-by: Christian König christian.koenig@amd.com
Cc: Alexander Deucher Alexander.Deucher@amd.com
Cc: Andrey Grodzovsky Andrey.Grodzovsky@amd.com
Cc: Christian König christian.koenig@amd.com
Cc: Daniel Vetter daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/scheduler/sched_main.c | 73 ++++++++++++++------------
 1 file changed, 40 insertions(+), 33 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index b694df12aaba..3eb7618a627d 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -60,8 +60,6 @@ #define to_drm_sched_job(sched_job) \ container_of((sched_job), struct drm_sched_job, queue_node)
-static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); - /** * drm_sched_rq_init - initialize a given run queue struct * @@ -162,6 +160,40 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq) return NULL; }
+/** + * drm_sched_job_done - complete a job + * @s_job: pointer to the job which is done + * + * Finish the job's fence and wake up the worker thread. + */ +static void drm_sched_job_done(struct drm_sched_job *s_job) +{ + struct drm_sched_fence *s_fence = s_job->s_fence; + struct drm_gpu_scheduler *sched = s_fence->sched; + + atomic_dec(&sched->hw_rq_count); + atomic_dec(&sched->score); + + trace_drm_sched_process_job(s_fence); + + dma_fence_get(&s_fence->finished); + drm_sched_fence_finished(s_fence); + dma_fence_put(&s_fence->finished); + wake_up_interruptible(&sched->wake_up_worker); +} + +/** + * drm_sched_job_done_cb - the callback for a done job + * @f: fence + * @cb: fence callbacks + */ +static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb) +{ + struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb); + + drm_sched_job_done(s_job); +} + /** * drm_sched_dependency_optimized * @@ -473,14 +505,14 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
if (fence) { r = dma_fence_add_callback(fence, &s_job->cb, - drm_sched_process_job); + drm_sched_job_done_cb); if (r == -ENOENT) - drm_sched_process_job(fence, &s_job->cb); + drm_sched_job_done(s_job); else if (r) DRM_ERROR("fence add callback failed (%d)\n", r); } else - drm_sched_process_job(NULL, &s_job->cb); + drm_sched_job_done(s_job); }
if (full_recovery) { @@ -635,31 +667,6 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched) return entity; }
-/** - * drm_sched_process_job - process a job - * - * @f: fence - * @cb: fence callbacks - * - * Called after job has finished execution. - */ -static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb) -{ - struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb); - struct drm_sched_fence *s_fence = s_job->s_fence; - struct drm_gpu_scheduler *sched = s_fence->sched; - - atomic_dec(&sched->hw_rq_count); - atomic_dec(&sched->score); - - trace_drm_sched_process_job(s_fence); - - dma_fence_get(&s_fence->finished); - drm_sched_fence_finished(s_fence); - dma_fence_put(&s_fence->finished); - wake_up_interruptible(&sched->wake_up_worker); -} - /** * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed * @@ -809,9 +816,9 @@ static int drm_sched_main(void *param) if (!IS_ERR_OR_NULL(fence)) { s_fence->parent = dma_fence_get(fence); r = dma_fence_add_callback(fence, &sched_job->cb, - drm_sched_process_job); + drm_sched_job_done_cb); if (r == -ENOENT) - drm_sched_process_job(fence, &sched_job->cb); + drm_sched_job_done(sched_job); else if (r) DRM_ERROR("fence add callback failed (%d)\n", r); @@ -820,7 +827,7 @@ static int drm_sched_main(void *param) if (IS_ERR(fence)) dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
- drm_sched_process_job(NULL, &sched_job->cb); + drm_sched_job_done(sched_job); }
wake_up(&sched->job_scheduled);
The driver's job timeout handler now returns a status indicating to the DRM layer whether the task (job) was successfully aborted or whether more time should be given to the task to complete.
Default behaviour, as of this patch, is preserved, except in an obvious-by-comment case in the Panfrost driver, as documented below.
All drivers which make use of the drm_sched_backend_ops' .timedout_job() callback have been updated accordingly and return the would've-been default value of DRM_TASK_STATUS_ALIVE to restart the task's timeout timer--this is the old behaviour, and is preserved by this patch.
In the case of the Panfrost driver, its timedout callback correctly first checks if the job had completed in due time and if so, it now returns DRM_TASK_STATUS_COMPLETE to notify the DRM layer that the task can be moved to the done list, to be freed later. In the other two subsequent checks, the value of DRM_TASK_STATUS_ALIVE is returned, as per the default behaviour.
More involved driver solutions can be had in subsequent patches.
Signed-off-by: Luben Tuikov luben.tuikov@amd.com
Reported-by: kernel test robot lkp@intel.com
Cc: Alexander Deucher Alexander.Deucher@amd.com
Cc: Andrey Grodzovsky Andrey.Grodzovsky@amd.com
Cc: Christian König christian.koenig@amd.com
Cc: Daniel Vetter daniel.vetter@ffwll.ch
Cc: Lucas Stach l.stach@pengutronix.de
Cc: Russell King linux+etnaviv@armlinux.org.uk
Cc: Christian Gmeiner christian.gmeiner@gmail.com
Cc: Qiang Yu yuq825@gmail.com
Cc: Rob Herring robh@kernel.org
Cc: Tomeu Vizoso tomeu.vizoso@collabora.com
Cc: Steven Price steven.price@arm.com
Cc: Alyssa Rosenzweig alyssa.rosenzweig@collabora.com
Cc: Eric Anholt eric@anholt.net
v2: Use enum as the status of a driver's job timeout callback method.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
 drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++++++-
 drivers/gpu/drm/lima/lima_sched.c       |  4 +++-
 drivers/gpu/drm/panfrost/panfrost_job.c |  9 ++++---
 drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
 drivers/gpu/drm/v3d/v3d_sched.c         | 32 +++++++++++++------------
 include/drm/gpu_scheduler.h             | 20 +++++++++++++---
 7 files changed, 57 insertions(+), 28 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index ff48101bab55..a111326cbdde 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -28,7 +28,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h"
-static void amdgpu_job_timedout(struct drm_sched_job *s_job) +static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job *s_job) { struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); struct amdgpu_job *job = to_amdgpu_job(s_job); @@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job) amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { DRM_ERROR("ring %s timeout, but soft recovered\n", s_job->sched->name); - return; + return DRM_TASK_STATUS_ALIVE; }
amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti); @@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
if (amdgpu_device_should_recover_gpu(ring->adev)) { amdgpu_device_gpu_recover(ring->adev, job); + return DRM_TASK_STATUS_ALIVE; } else { drm_sched_suspend_timeout(&ring->sched); if (amdgpu_sriov_vf(adev)) adev->virt.tdr_debug = true; + return DRM_TASK_STATUS_ALIVE; } }
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index cd46c882269c..c49516942328 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -82,7 +82,8 @@ static struct dma_fence *etnaviv_sched_run_job(struct drm_sched_job *sched_job) return fence; }
-static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) +static enum drm_task_status etnaviv_sched_timedout_job(struct drm_sched_job + *sched_job) { struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job); struct etnaviv_gpu *gpu = submit->gpu; @@ -120,9 +121,16 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
drm_sched_resubmit_jobs(&gpu->sched);
+ /* Tell the DRM scheduler that this task needs + * more time. + */ + drm_sched_start(&gpu->sched, true); + return DRM_TASK_STATUS_ALIVE; + out_no_timeout: /* restart scheduler after GPU is usable again */ drm_sched_start(&gpu->sched, true); + return DRM_TASK_STATUS_ALIVE; }
static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c index 63b4c5643f9c..66d9236b8760 100644 --- a/drivers/gpu/drm/lima/lima_sched.c +++ b/drivers/gpu/drm/lima/lima_sched.c @@ -415,7 +415,7 @@ static void lima_sched_build_error_task_list(struct lima_sched_task *task) mutex_unlock(&dev->error_task_list_lock); }
-static void lima_sched_timedout_job(struct drm_sched_job *job) +static enum drm_task_status lima_sched_timedout_job(struct drm_sched_job *job) { struct lima_sched_pipe *pipe = to_lima_pipe(job->sched); struct lima_sched_task *task = to_lima_task(job); @@ -449,6 +449,8 @@ static void lima_sched_timedout_job(struct drm_sched_job *job)
drm_sched_resubmit_jobs(&pipe->base); drm_sched_start(&pipe->base, true); + + return DRM_TASK_STATUS_ALIVE; }
static void lima_sched_free_job(struct drm_sched_job *job) diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 04e6f6f9b742..845148a722e4 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -432,7 +432,8 @@ static void panfrost_scheduler_start(struct panfrost_queue_state *queue) mutex_unlock(&queue->lock); }
-static void panfrost_job_timedout(struct drm_sched_job *sched_job) +static enum drm_task_status panfrost_job_timedout(struct drm_sched_job + *sched_job) { struct panfrost_job *job = to_panfrost_job(sched_job); struct panfrost_device *pfdev = job->pfdev; @@ -443,7 +444,7 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job) * spurious. Bail out. */ if (dma_fence_is_signaled(job->done_fence)) - return; + return DRM_TASK_STATUS_COMPLETE;
dev_err(pfdev->dev, "gpu sched timeout, js=%d, config=0x%x, status=0x%x, head=0x%x, tail=0x%x, sched_job=%p", js, @@ -455,11 +456,13 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
/* Scheduler is already stopped, nothing to do. */ if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job)) - return; + return DRM_TASK_STATUS_ALIVE;
/* Schedule a reset if there's no reset in progress. */ if (!atomic_xchg(&pfdev->reset.pending, 1)) schedule_work(&pfdev->reset.work); + + return DRM_TASK_STATUS_ALIVE; }
static const struct drm_sched_backend_ops panfrost_sched_ops = { diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 3eb7618a627d..b9876cad94f2 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -526,7 +526,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) EXPORT_SYMBOL(drm_sched_start);
/** - * drm_sched_resubmit_jobs - helper to relunch job from pending ring list + * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list * * @sched: scheduler instance * @@ -560,8 +560,6 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) } else { s_job->s_fence->parent = fence; } - - } } EXPORT_SYMBOL(drm_sched_resubmit_jobs); diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 452682e2209f..3740665ec479 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -259,7 +259,7 @@ v3d_cache_clean_job_run(struct drm_sched_job *sched_job) return NULL; }
-static void +static enum drm_task_status v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) { enum v3d_queue q; @@ -285,6 +285,8 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) }
mutex_unlock(&v3d->reset_lock); + + return DRM_TASK_STATUS_ALIVE; }
/* If the current address or return address have changed, then the GPU @@ -292,7 +294,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) * could fail if the GPU got in an infinite loop in the CL, but that * is pretty unlikely outside of an i-g-t testcase. */ -static void +static enum drm_task_status v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q, u32 *timedout_ctca, u32 *timedout_ctra) { @@ -304,39 +306,39 @@ v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q, if (*timedout_ctca != ctca || *timedout_ctra != ctra) { *timedout_ctca = ctca; *timedout_ctra = ctra; - return; + return DRM_TASK_STATUS_ALIVE; }
- v3d_gpu_reset_for_timeout(v3d, sched_job); + return v3d_gpu_reset_for_timeout(v3d, sched_job); }
-static void +static enum drm_task_status v3d_bin_job_timedout(struct drm_sched_job *sched_job) { struct v3d_bin_job *job = to_bin_job(sched_job);
- v3d_cl_job_timedout(sched_job, V3D_BIN, - &job->timedout_ctca, &job->timedout_ctra); + return v3d_cl_job_timedout(sched_job, V3D_BIN, + &job->timedout_ctca, &job->timedout_ctra); }
-static void +static enum drm_task_status v3d_render_job_timedout(struct drm_sched_job *sched_job) { struct v3d_render_job *job = to_render_job(sched_job);
- v3d_cl_job_timedout(sched_job, V3D_RENDER, - &job->timedout_ctca, &job->timedout_ctra); + return v3d_cl_job_timedout(sched_job, V3D_RENDER, + &job->timedout_ctca, &job->timedout_ctra); }
-static void +static enum drm_task_status v3d_generic_job_timedout(struct drm_sched_job *sched_job) { struct v3d_job *job = to_v3d_job(sched_job);
- v3d_gpu_reset_for_timeout(job->v3d, sched_job); + return v3d_gpu_reset_for_timeout(job->v3d, sched_job); }
-static void +static enum drm_task_status v3d_csd_job_timedout(struct drm_sched_job *sched_job) { struct v3d_csd_job *job = to_csd_job(sched_job); @@ -348,10 +350,10 @@ v3d_csd_job_timedout(struct drm_sched_job *sched_job) */ if (job->timedout_batches != batches) { job->timedout_batches = batches; - return; + return DRM_TASK_STATUS_ALIVE; }
- v3d_gpu_reset_for_timeout(v3d, sched_job); + return v3d_gpu_reset_for_timeout(v3d, sched_job); }
static const struct drm_sched_backend_ops v3d_bin_sched_ops = { diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 2e0c368e19f6..cedfc5394e52 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -206,6 +206,11 @@ static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, return s_job && atomic_inc_return(&s_job->karma) > threshold; }
+enum drm_task_status { + DRM_TASK_STATUS_COMPLETE, + DRM_TASK_STATUS_ALIVE +}; + /** * struct drm_sched_backend_ops * @@ -230,10 +235,19 @@ struct drm_sched_backend_ops { struct dma_fence *(*run_job)(struct drm_sched_job *sched_job);
/** - * @timedout_job: Called when a job has taken too long to execute, - * to trigger GPU recovery. + * @timedout_job: Called when a job has taken too long to execute, + * to trigger GPU recovery. + * + * Return DRM_TASK_STATUS_ALIVE, if the task (job) is healthy + * and executing in the hardware, i.e. it needs more time. + * + * Return DRM_TASK_STATUS_COMPLETE, if the task (job) has + * been aborted or is unknown to the hardware, i.e. if + * the task is out of the hardware, and maybe it is now + * in the done list, or it was completed long ago, or + * if it is unknown to the hardware. */ - void (*timedout_job)(struct drm_sched_job *sched_job); + enum drm_task_status (*timedout_job)(struct drm_sched_job *sched_job);
/** * @free_job: Called once the job's finished fence has been signaled
Thinking more about this, I came to the conclusion that the whole approach here isn't correct.
See, even when the job has been completed or canceled, we still want to restart the timer.
The reason for this is that the timer is then not restarted for the current job, but for the next job in the queue.
The only valid reason not to restart the timer is that the whole device was hot-unplugged and we return -ENODEV here, e.g. what Andrey has been working on.
Regards, Christian.
On 04.12.20 at 04:17, Luben Tuikov wrote:
On 12/4/20 3:13 AM, Christian König wrote:
We discussed this with Luben offline a few days ago, but came to the conclusion that for the next job the timer restart in drm_sched_job_begin() should do the work, no?
Andrey
On 04.12.20 at 16:10, Andrey Grodzovsky wrote:
Nope, drm_sched_job_begin() pushes the job to the hardware and starts the timeout in case the hardware was idle before.
The function should probably be renamed to drm_sched_job_pushed() because it doesn't begin the execution in any way.
Christian.
On 07.12.20 at 20:09, Andrey Grodzovsky wrote:
Ok, let me explain from the beginning.
drm_sched_start_timeout() initially starts the timer; it does NOT re-arm it! When the timer is already running it doesn't have any effect at all.
When a job completes, drm_sched_get_cleanup_job() cancels the timer, frees the job and then starts a new timer for the engine.
When a timeout happens, the job is either canceled or given some extra time by putting it back on the pending list.
When the job is canceled, the timer must be restarted for the next job, because drm_sched_job_begin() was already called long ago.
When the job gets some extra time we should also restart the timer.
The only case in which the timer should not be restarted is when the device was hot-unplugged and is completely gone now.
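As a minimal sketch of the re-arm policy described above (illustrative only, not part of this series; device_is_gone() is a hypothetical hot-unplug check):

static void tdr_rearm_policy(struct drm_gpu_scheduler *sched,
			     struct drm_sched_job *job)
{
	job->sched->ops->timedout_job(job);

	/* Hot-unplugged device: nothing left to watch. */
	if (device_is_gone(sched))
		return;

	/* Completed, canceled or given more time: in every one of these
	 * cases the timer now guards the next job on the pending list,
	 * so it must be re-armed.
	 */
	drm_sched_start_timeout(sched);
}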
I think the right approach to stop this messing with the ring mirror list is to avoid using the job altogether for recovery.
What we should do instead is to put the recovery information on the scheduler fence, because that is the object which stays alive after pushing the job to the hardware.
Christian.
On 12/7/20 2:19 PM, Christian König wrote:
In the sense that delayed work cannot be enqueued while another instance is still in the queue, I agree. I forgot about this in the context of drm_sched_start_timeout.
Now I get it. The next job might have called (and probably did call) drm_sched_job_begin() while the previous timer work (the currently executing one) was still in the workqueue, so we cannot count on it to have actually restarted the timer, and so we must do it ourselves.
When the job gets some extra time we should also restart the timer.
Same as above.
Thanks for clarifying this.
Andrey
On 2020-12-04 3:13 a.m., Christian König wrote:
Got it. I'll make that change in patch 5/5, as this patch, 4/5, only changes the timeout callback's return type from void to an enum and doesn't affect behaviour.
Yes, perhaps something like DRM_TASK_STATUS_ENODEV. We can add it now or later when Andrey adds his hotplug/unplug patches.
Regards, Luben
The drm_sched_job_done() callback now moves done jobs from the pending list to a "done" list.
In drm_sched_job_timedout(), make use of the status returned by a GPU driver's job timeout handler to decide whether to leave the oldest job in the pending list, or to send it off to the done list. If a driver's job timeout callback reports that the job is done, it is added to the done list and the main scheduler thread is woken up. If the job needs more time, it is left on the pending list and the timeout timer is restarted.
The idea is that a GPU driver can check the IP to which the passed-in job belongs and determine whether the IP is alive and well, or whether it needs more time to complete this job and perhaps others also executing on it.
In drm_sched_job_timedout(), the main scheduler thread is now parked before calling a driver's timedout_job callback, so as not to compete in pushing jobs down to the GPU while the recovery method is taking place.
Eliminate the polling mechanism of picking out done jobs from the pending list, i.e. eliminate drm_sched_get_cleanup_job().
This also eliminates the eldest job disappearing from the pending list while the driver's timeout handler is called.
Various other optimizations to the GPU scheduler and job recovery are possible with this format.
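For readability, here is a condensed sketch of the resulting drm_sched_job_timedout() flow; it paraphrases the patch below, with the intermediate result variable and most comments trimmed:

static void drm_sched_job_timedout(struct work_struct *work)
{
	struct drm_gpu_scheduler *sched =
		container_of(work, struct drm_gpu_scheduler, work_tdr.work);
	struct drm_sched_job *job;

	kthread_park(sched->thread);	/* don't race with job submission */

	spin_lock(&sched->job_list_lock);
	job = list_first_entry_or_null(&sched->pending_list,
				       struct drm_sched_job, list);
	spin_unlock(&sched->job_list_lock);

	if (job) {
		if (job->sched->ops->timedout_job(job) ==
		    DRM_TASK_STATUS_COMPLETE) {
			/* The job is out of the device: free it later. */
			spin_lock(&sched->job_list_lock);
			list_move(&job->list, &sched->done_list);
			spin_unlock(&sched->job_list_lock);
			wake_up_interruptible(&sched->wake_up_worker);
		} else {
			/* The job needs more time: re-arm the timer. */
			drm_sched_start_timeout(sched);
		}
	}

	kthread_unpark(sched->thread);
}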
Signed-off-by: Luben Tuikov luben.tuikov@amd.com
Cc: Alexander Deucher Alexander.Deucher@amd.com
Cc: Andrey Grodzovsky Andrey.Grodzovsky@amd.com
Cc: Christian König christian.koenig@amd.com
Cc: Daniel Vetter daniel.vetter@ffwll.ch
Cc: Lucas Stach l.stach@pengutronix.de
Cc: Russell King linux+etnaviv@armlinux.org.uk
Cc: Christian Gmeiner christian.gmeiner@gmail.com
Cc: Qiang Yu yuq825@gmail.com
Cc: Rob Herring robh@kernel.org
Cc: Tomeu Vizoso tomeu.vizoso@collabora.com
Cc: Steven Price steven.price@arm.com
Cc: Alyssa Rosenzweig alyssa.rosenzweig@collabora.com
Cc: Eric Anholt eric@anholt.net
v2: Dispel using a done thread, so as to keep the cache hot on the same processor.
---
 drivers/gpu/drm/scheduler/sched_main.c | 247 +++++++++++++------------
 include/drm/gpu_scheduler.h            |   4 +
 2 files changed, 134 insertions(+), 117 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index b9876cad94f2..d77180b44998 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -164,7 +164,9 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq) * drm_sched_job_done - complete a job * @s_job: pointer to the job which is done * - * Finish the job's fence and wake up the worker thread. + * Move the completed task to the done list, + * signal the its fence to mark it finished, + * and wake up the worker thread. */ static void drm_sched_job_done(struct drm_sched_job *s_job) { @@ -176,9 +178,14 @@ static void drm_sched_job_done(struct drm_sched_job *s_job)
trace_drm_sched_process_job(s_fence);
+ spin_lock(&sched->job_list_lock); + list_move(&s_job->list, &sched->done_list); + spin_unlock(&sched->job_list_lock); + dma_fence_get(&s_fence->finished); drm_sched_fence_finished(s_fence); dma_fence_put(&s_fence->finished); + wake_up_interruptible(&sched->wake_up_worker); }
@@ -309,6 +316,37 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job) spin_unlock(&sched->job_list_lock); }
+/** drm_sched_job_timeout -- a timer timeout occurred + * @work: pointer to work_struct + * + * First, park the scheduler thread whose IP timed out, + * so that we don't race with the scheduler thread pushing + * jobs down the IP as we try to investigate what + * happened and give drivers a chance to recover. + * + * Second, take the fist job in the pending list + * (oldest), leave it in the pending list and call the + * driver's timer timeout callback to find out what + * happened, passing this job as the suspect one. + * + * The driver may return DRM_TASK_STATUS COMPLETE, + * which means the task is not in the IP(*) and we move + * it to the done list to free it. + * + * (*) A reason for this would be, say, that the job + * completed in due time, or the driver has aborted + * this job using driver specific methods in the + * timedout_job callback and has now removed it from + * the hardware. + * + * Or, the driver may return DRM_TASK_STATUS_ALIVE, to + * indicate that it had inquired about this job, and it + * has verified that this job is alive and well, and + * that the DRM layer should give this task more time + * to complete. In this case, we restart the timeout timer. + * + * Lastly, we unpark the scheduler thread. + */ static void drm_sched_job_timedout(struct work_struct *work) { struct drm_gpu_scheduler *sched; @@ -316,37 +354,32 @@ static void drm_sched_job_timedout(struct work_struct *work)
sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
- /* Protects against concurrent deletion in drm_sched_get_cleanup_job */ + kthread_park(sched->thread); + spin_lock(&sched->job_list_lock); job = list_first_entry_or_null(&sched->pending_list, struct drm_sched_job, list); + spin_unlock(&sched->job_list_lock);
if (job) { - /* - * Remove the bad job so it cannot be freed by concurrent - * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread - * is parked at which point it's safe. - */ - list_del_init(&job->list); - spin_unlock(&sched->job_list_lock); - - job->sched->ops->timedout_job(job); + int res;
- /* - * Guilty job did complete and hence needs to be manually removed - * See drm_sched_stop doc. - */ - if (sched->free_guilty) { - job->sched->ops->free_job(job); - sched->free_guilty = false; + res = job->sched->ops->timedout_job(job); + if (res == DRM_TASK_STATUS_COMPLETE) { + /* The job is out of the device. + */ + spin_lock(&sched->job_list_lock); + list_move(&job->list, &sched->done_list); + spin_unlock(&sched->job_list_lock); + wake_up_interruptible(&sched->wake_up_worker); + } else { + /* The job needs more time. + */ + drm_sched_start_timeout(sched); } - } else { - spin_unlock(&sched->job_list_lock); }
- spin_lock(&sched->job_list_lock); - drm_sched_start_timeout(sched); - spin_unlock(&sched->job_list_lock); + kthread_unpark(sched->thread); }
/** @@ -413,24 +446,13 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) kthread_park(sched->thread);
/* - * Reinsert back the bad job here - now it's safe as - * drm_sched_get_cleanup_job cannot race against us and release the - * bad job at this point - we parked (waited for) any in progress - * (earlier) cleanups and drm_sched_get_cleanup_job will not be called - * now until the scheduler thread is unparked. - */ - if (bad && bad->sched == sched) - /* - * Add at the head of the queue to reflect it was the earliest - * job extracted. - */ - list_add(&bad->list, &sched->pending_list); - - /* - * Iterate the job list from later to earlier one and either deactive - * their HW callbacks or remove them from pending list if they already - * signaled. - * This iteration is thread safe as sched thread is stopped. + * Iterate the pending list in reverse order, + * from most recently submitted to oldest + * tasks. Tasks which haven't completed, leave + * them in the pending list, but decrement + * their hardware run queue count. + * Else, the fence must've signalled, and the job + * is in the done list. */ list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list, list) { @@ -439,36 +461,52 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) &s_job->cb)) { atomic_dec(&sched->hw_rq_count); } else { - /* - * remove job from pending_list. - * Locking here is for concurrent resume timeout - */ - spin_lock(&sched->job_list_lock); - list_del_init(&s_job->list); - spin_unlock(&sched->job_list_lock); - - /* - * Wait for job's HW fence callback to finish using s_job - * before releasing it. - * - * Job is still alive so fence refcount at least 1 - */ - dma_fence_wait(&s_job->s_fence->finished, false); - - /* - * We must keep bad job alive for later use during - * recovery by some of the drivers but leave a hint - * that the guilty job must be released. - */ - if (bad != s_job) - sched->ops->free_job(s_job); - else - sched->free_guilty = true; + if (bad == s_job) { + /* This is the oldest job on the pending list + * whose IP timed out. The + * drm_sched_job_timeout() function calls the + * driver's timedout_job callback passing @bad, + * who then calls this function here--as such + * we shouldn't move @bad or free it. This will + * be decided by drm_sched_job_timeout() when + * this function here returns back to the caller + * (the driver) and the driver's timedout_job + * callback returns a result to + * drm_sched_job_timeout(). + */ + ; + } else { + int res; + + /* This job is not the @bad job passed above. + * Note that perhaps it was *this* job which + * timed out. The wait below is suspect. Since, + * it waits with maximum timeout and "intr" set + * to false, it will either return 0 indicating + * that the fence has signalled, or negative on + * error. What if, the whole IP is stuck and + * this ends up waiting forever? + * + * Wait for job's HW fence callback to finish + * using s_job before releasing it. + * + * Job is still alive so fence + * refcount at least 1 + */ + res = dma_fence_wait(&s_job->s_fence->finished, + false); + + if (res == 0) + sched->ops->free_job(s_job); + else + pr_err_once("%s: dma_fence_wait: %d\n", + sched->name, res); + } } }
/* - * Stop pending timer in flight as we rearm it in drm_sched_start. This + * Stop pending timer in flight as we rearm it in drm_sched_start. This * avoids the pending timeout work in progress to fire right away after * this TDR finished and before the newly restarted jobs had a * chance to complete. @@ -511,8 +549,9 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) else if (r) DRM_ERROR("fence add callback failed (%d)\n", r); - } else + } else { drm_sched_job_done(s_job); + } }
if (full_recovery) { @@ -665,47 +704,6 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched) return entity; }
-/** - * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed - * - * @sched: scheduler instance - * - * Returns the next finished job from the pending list (if there is one) - * ready for it to be destroyed. - */ -static struct drm_sched_job * -drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) -{ - struct drm_sched_job *job; - - /* - * Don't destroy jobs while the timeout worker is running OR thread - * is being parked and hence assumed to not touch pending_list - */ - if ((sched->timeout != MAX_SCHEDULE_TIMEOUT && - !cancel_delayed_work(&sched->work_tdr)) || - kthread_should_park()) - return NULL; - - spin_lock(&sched->job_list_lock); - - job = list_first_entry_or_null(&sched->pending_list, - struct drm_sched_job, list); - - if (job && dma_fence_is_signaled(&job->s_fence->finished)) { - /* remove job from pending_list */ - list_del_init(&job->list); - } else { - job = NULL; - /* queue timeout for next job */ - drm_sched_start_timeout(sched); - } - - spin_unlock(&sched->job_list_lock); - - return job; -} - /** * drm_sched_pick_best - Get a drm sched from a sched_list with the least load * @sched_list: list of drm_gpu_schedulers @@ -759,6 +757,25 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched) return false; }
+static void drm_sched_free_done(struct drm_gpu_scheduler *sched) +{ + LIST_HEAD(done_q); + + spin_lock(&sched->job_list_lock); + list_splice_init(&sched->done_list, &done_q); + spin_unlock(&sched->job_list_lock); + + while (!list_empty(&done_q)) { + struct drm_sched_job *job; + + job = list_first_entry(&done_q, + struct drm_sched_job, + list); + list_del_init(&job->list); + sched->ops->free_job(job); + } +} + /** * drm_sched_main - main scheduler thread * @@ -768,7 +785,7 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched) */ static int drm_sched_main(void *param) { - struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param; + struct drm_gpu_scheduler *sched = param; int r;
sched_set_fifo_low(current); @@ -778,19 +795,14 @@ static int drm_sched_main(void *param) struct drm_sched_fence *s_fence; struct drm_sched_job *sched_job; struct dma_fence *fence; - struct drm_sched_job *cleanup_job = NULL;
wait_event_interruptible(sched->wake_up_worker, - (cleanup_job = drm_sched_get_cleanup_job(sched)) || + (!list_empty(&sched->done_list)) || (!drm_sched_blocked(sched) && (entity = drm_sched_select_entity(sched))) || kthread_should_stop());
- if (cleanup_job) { - sched->ops->free_job(cleanup_job); - /* queue timeout for next job */ - drm_sched_start_timeout(sched); - } + drm_sched_free_done(sched);
if (!entity) continue; @@ -864,6 +876,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, init_waitqueue_head(&sched->wake_up_worker); init_waitqueue_head(&sched->job_scheduled); INIT_LIST_HEAD(&sched->pending_list); + INIT_LIST_HEAD(&sched->done_list); spin_lock_init(&sched->job_list_lock); atomic_set(&sched->hw_rq_count, 0); INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index cedfc5394e52..11278695fed0 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -289,6 +289,7 @@ struct drm_gpu_scheduler { uint32_t hw_submission_limit; long timeout; const char *name; + struct drm_sched_rq sched_rq[DRM_SCHED_PRIORITY_COUNT]; wait_queue_head_t wake_up_worker; wait_queue_head_t job_scheduled; @@ -296,8 +297,11 @@ struct drm_gpu_scheduler { atomic64_t job_id_count; struct delayed_work work_tdr; struct task_struct *thread; + struct list_head pending_list; + struct list_head done_list; spinlock_t job_list_lock; + int hang_limit; atomic_t score; bool ready;
On 04.12.20 at 04:17, Luben Tuikov wrote:
That is racy: as soon as the spinlock is dropped, the job, and with it the s_fence, might have been destroyed.
In other words, this needs to come first.
Regards, Christian.