Hi Christian,
On Wed, Sep 26, 2018, 10:13 AM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
On 26.09.2018 at 09:39, Lucas Stach wrote:
Hi Nayan,
On Wednesday, 26.09.2018 at 02:09 +0900, Nayan Deshmukh wrote:
Having a delayed work item per job is redundant, as we only need one per scheduler to track the timeout of the currently executing job.
v2: the first element of the ring mirror list is the currently executing job, so we don't need an additional variable for it
Signed-off-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
Suggested-by: Christian König <christian.koenig@amd.com>
 drivers/gpu/drm/scheduler/sched_main.c | 31 ++++++++++++++++---------------
 include/drm/gpu_scheduler.h            |  6 +++---
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 9ca741f3a0bc..4e8505d51795 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -197,19 +197,15 @@ static void drm_sched_job_finish(struct work_struct *work)
 	 * manages to find this job as the next job in the list, the fence
 	 * signaled check below will prevent the timeout to be restarted.
 	 */
-	cancel_delayed_work_sync(&s_job->work_tdr);
+	cancel_delayed_work_sync(&sched->work_tdr);

 	spin_lock(&sched->job_list_lock);
-	/* queue TDR for next job */
-	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
-	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
-		struct drm_sched_job *next = list_next_entry(s_job, node);
-
-		if (!dma_fence_is_signaled(&next->s_fence->finished))
-			schedule_delayed_work(&next->work_tdr, sched->timeout);
-	}
 	/* remove job from ring_mirror_list */
 	list_del(&s_job->node);
+	/* queue TDR for next job */
+	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
+	    !list_empty(&sched->ring_mirror_list))
+		schedule_delayed_work(&sched->work_tdr, sched->timeout);
 	spin_unlock(&sched->job_list_lock);

 	dma_fence_put(&s_job->s_fence->finished);
@@ -236,16 +232,21 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
 	    list_first_entry_or_null(&sched->ring_mirror_list,
 				     struct drm_sched_job, node) == s_job)
-		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+		schedule_delayed_work(&sched->work_tdr, sched->timeout);
 	spin_unlock(&sched->job_list_lock);
 }
 static void drm_sched_job_timedout(struct work_struct *work)
 {
-	struct drm_sched_job *job = container_of(work, struct drm_sched_job,
-						 work_tdr.work);
+	struct drm_gpu_scheduler *sched;
+	struct drm_sched_job *job;
+
+	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
+	job = list_first_entry_or_null(&sched->ring_mirror_list,
+				       struct drm_sched_job, node);

-	job->sched->ops->timedout_job(job);
+	if (job)
+		job->sched->ops->timedout_job(job);
I don't think this is fully robust. Jobs are only removed from the ring_mirror_list once the job_finish worker has run. If execution of this worker is delayed for any reason (though a delay as long as the job timeout is really unlikely), you are blaming the wrong job here.
So I think what you need to do is find the first job in the ring mirror list with an unsignaled finish fence, to robustly identify the stuck job.
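Roughly something like the following, i.e. skip over entries whose finish fence already signaled instead of blindly blaming the list head (untested sketch against this patch's structures; job lifetime after dropping the lock is hand-waved here):

static void drm_sched_job_timedout(struct work_struct *work)
{
	struct drm_gpu_scheduler *sched =
		container_of(work, struct drm_gpu_scheduler, work_tdr.work);
	struct drm_sched_job *job;

	spin_lock(&sched->job_list_lock);
	list_for_each_entry(job, &sched->ring_mirror_list, node) {
		/* The first unsignaled job is the one that is actually stuck. */
		if (!dma_fence_is_signaled(&job->s_fence->finished))
			break;
	}
	/* Reached the list head: everything signaled, so nothing hung. */
	if (&job->node == &sched->ring_mirror_list)
		job = NULL;
	spin_unlock(&sched->job_list_lock);

	if (job)
		job->sched->ops->timedout_job(job);
}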
Yeah, that is a known problem I've pointed out as well.
The issue is that we have bug reports of this happening even before the patch, but I'm not 100% sure how.
My suggestion is to move a good part of the logic from drm_sched_hw_job_reset() and drm_sched_job_recovery() into drm_sched_job_timedout().
E.g. we first call dma_fence_remove_callback() for each job and actually check the return value to see if the fence was already signaled.
If we find a signaled fence, we abort and add the callbacks back to the jobs we removed them from.
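For illustration, an untested sketch of that first step (drm_sched_remove_callbacks() is a made-up name, it re-arms the existing drm_sched_process_job callback, and error handling on re-adding is omitted):

/*
 * Detach the hardware fence callbacks from all jobs in the mirror list.
 * Returns false, restoring the callbacks removed so far, if one of the
 * fences turns out to have signaled already, i.e. there is no real hang.
 */
static bool drm_sched_remove_callbacks(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;

	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		if (s_fence->parent &&
		    !dma_fence_remove_callback(s_fence->parent, &s_fence->cb)) {
			/* Callback already ran: the fence signaled meanwhile. */
			list_for_each_entry_continue_reverse(s_job,
					&sched->ring_mirror_list, node) {
				if (s_job->s_fence->parent)
					dma_fence_add_callback(s_job->s_fence->parent,
							       &s_job->s_fence->cb,
							       drm_sched_process_job);
			}
			return false;
		}
	}

	return true;
}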
Nayan, do you want to take care of this or should I take a look?
I can take care of it.
Regards, Nayan
Regards, Christian.
Regards, Lucas
}
 /**
@@ -315,7 +316,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 	s_job = list_first_entry_or_null(&sched->ring_mirror_list,
 					 struct drm_sched_job, node);
 	if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
-		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+		schedule_delayed_work(&sched->work_tdr, sched->timeout);

 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
@@ -384,7 +385,6 @@ int drm_sched_job_init(struct drm_sched_job *job,

 	INIT_WORK(&job->finish_work, drm_sched_job_finish);
 	INIT_LIST_HEAD(&job->node);
-	INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);

 	return 0;
 }
@@ -575,6 +575,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 	INIT_LIST_HEAD(&sched->ring_mirror_list);
 	spin_lock_init(&sched->job_list_lock);
 	atomic_set(&sched->hw_rq_count, 0);
+	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
 	atomic_set(&sched->num_jobs, 0);
 	atomic64_set(&sched->job_id_count, 0);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index daec50f887b3..d87b268f1781 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  *               finished to remove the job from the
  *               @drm_gpu_scheduler.ring_mirror_list.
  * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
- * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout
- *            interval is over.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
  *         limit of the scheduler then the job is marked guilty and will not
@@ -195,7 +193,6 @@ struct drm_sched_job {
 	struct dma_fence_cb		finish_cb;
 	struct work_struct		finish_work;
 	struct list_head		node;
-	struct delayed_work		work_tdr;
 	uint64_t			id;
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
@@ -259,6 +256,8 @@ struct drm_sched_backend_ops {
  *              finished.
  * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
+ * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
+ *            timeout interval is over.
  * @thread: the kthread on which the scheduler which run.
  * @ring_mirror_list: the list of jobs which are currently in the job queue.
  * @job_list_lock: lock to protect the ring_mirror_list.
@@ -278,6 +277,7 @@ struct drm_gpu_scheduler {
 	wait_queue_head_t		job_scheduled;
 	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
+	struct delayed_work		work_tdr;
 	struct task_struct		*thread;
 	struct list_head		ring_mirror_list;
 	spinlock_t			job_list_lock;