Hi Andrey,
Good catch. I guess since both Christian and I were working on it in parallel we didn't catch this problem.
If it is only causing a problem in push_job then we can handle it with something like this:
---------------------------------------------------------------------------------------------------------------------------- diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 05dc6ecd4003..f344ce32f128 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -563,6 +563,11 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job, first = spsc_queue_count(&entity->job_queue) == 0; reschedule = idle && first && (entity->num_rq_list > 1);
+ if (first && list_empty(&entity->list)) { + DRM_ERROR("Trying to push to a killed entity\n"); + return; + } + if (reschedule) { rq = drm_sched_entity_get_free_sched(entity); spin_lock(&entity->rq_lock); @@ -580,11 +585,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job, if (first) { /* Add the entity to the run queue */ spin_lock(&entity->rq_lock); - if (!entity->rq) { - DRM_ERROR("Trying to push to a killed entity\n"); - spin_unlock(&entity->rq_lock); - return; - } drm_sched_rq_add_entity(entity->rq, entity); spin_unlock(&entity->rq_lock); drm_sched_wakeup(entity->rq->sched); -----------------------------------------------------------------------------------------------------------------------------
On Thu, Aug 2, 2018 at 3:55 AM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
Thinking again about this change and 53d5f1e drm/scheduler: move idle entities to scheduler with less load v2
Looks to me like use case for which fc9a539 drm/scheduler: add NULL pointer check for run queue (v2) was done
will not work anymore.
First of all in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more since we stopped setting entity->rq to NULL in
drm_sched_entity_flush.
Second of all, even after we removed the entity from rq in drm_sched_entity_flush to terminate any subsequent submissions
to the queue, the other thread doing push_job can just re-acquire a run queue
from drm_sched_entity_get_free_sched and continue submission, so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.
What about adding a 'stopped' flag to drm_sched_entity to be set in drm_sched_entity_flush and in
But it might be worth adding a stopped flag field if it is required at
other places as well.
Thanks, Nayan
drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of ' if (!entity->rq)' ?
Andrey
On 07/30/2018 07:03 AM, Christian König wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more or otherwise won't know which scheduler to use for the cleanup.
Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/scheduler/gpu_scheduler.c | 35
+++++++------------------------
1 file changed, 8 insertions(+), 27 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c
b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index f563e4fbb4b6..1b733229201e 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -198,21 +198,6 @@ int drm_sched_entity_init(struct drm_sched_entity
*entity,
} EXPORT_SYMBOL(drm_sched_entity_init);
-/**
- drm_sched_entity_is_initialized - Query if entity is initialized
- @sched: Pointer to scheduler instance
- @entity: The pointer to a valid scheduler entity
- return true if entity is initialized, false otherwise
-*/ -static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler
*sched,
struct drm_sched_entity
*entity)
-{
return entity->rq != NULL &&
entity->rq->sched == sched;
-}
- /**
- drm_sched_entity_is_idle - Check if entity is idle
@@ -224,7 +209,8 @@ static bool drm_sched_entity_is_idle(struct
drm_sched_entity *entity)
{ rmb();
if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
if (list_empty(&entity->list) ||
spsc_queue_peek(&entity->job_queue) == NULL) return true; return false;
@@ -279,8 +265,6 @@ long drm_sched_entity_flush(struct drm_sched_entity
*entity, long timeout)
long ret = timeout; sched = entity->rq->sched;
if (!drm_sched_entity_is_initialized(sched, entity))
return ret; /** * The client will not queue more IBs during this fini, consume
existing
* queued IBs or discard them on SIGKILL
@@ -299,7 +283,7 @@ long drm_sched_entity_flush(struct drm_sched_entity
*entity, long timeout)
last_user = cmpxchg(&entity->last_user, current->group_leader,
NULL);
if ((!last_user || last_user == current->group_leader) && (current->flags & PF_EXITING) && (current->exit_code ==
SIGKILL))
drm_sched_entity_set_rq(entity, NULL);
drm_sched_rq_remove_entity(entity->rq, entity); return ret;
}
@@ -320,7 +304,7 @@ void drm_sched_entity_fini(struct drm_sched_entity
*entity)
struct drm_gpu_scheduler *sched; sched = entity->rq->sched;
drm_sched_entity_set_rq(entity, NULL);
drm_sched_rq_remove_entity(entity->rq, entity); /* Consumption of existing IBs wasn't completed. Forcefully * remove them here.
@@ -416,15 +400,12 @@ void drm_sched_entity_set_rq(struct
drm_sched_entity *entity,
if (entity->rq == rq) return;
spin_lock(&entity->rq_lock);
if (entity->rq)
drm_sched_rq_remove_entity(entity->rq, entity);
BUG_ON(!rq);
spin_lock(&entity->rq_lock);
drm_sched_rq_remove_entity(entity->rq, entity); entity->rq = rq;
if (rq)
drm_sched_rq_add_entity(rq, entity);
} EXPORT_SYMBOL(drm_sched_entity_set_rq);drm_sched_rq_add_entity(rq, entity); spin_unlock(&entity->rq_lock);