Note which task is using the entity and only kill it if the last user of the entity is killed. This should prevent problems when entities are leaked to child processes.
v2: add missing kernel doc
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/gpu_scheduler.c | 6 +++++-
 include/drm/gpu_scheduler.h               | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index 3f2fc5e8242a..f563e4fbb4b6 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -275,6 +275,7 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
 long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 {
 	struct drm_gpu_scheduler *sched;
+	struct task_struct *last_user;
 	long ret = timeout;
 
 	sched = entity->rq->sched;
@@ -295,7 +296,9 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 
 	/* For killed process disable any more IBs enqueue right now */
-	if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
+	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
+	if ((!last_user || last_user == current->group_leader) &&
+	    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
 		drm_sched_entity_set_rq(entity, NULL);
 
 	return ret;
@@ -541,6 +544,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
 
 	trace_drm_sched_job(sched_job, entity);
 
+	WRITE_ONCE(entity->last_user, current->group_leader);
 	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 
 	/* first job wakes up scheduler */
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 091b9afcd184..21c648b0b2a1 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -66,6 +66,7 @@ enum drm_sched_priority {
  * @guilty: points to ctx's guilty.
  * @fini_status: contains the exit status in case the process was signalled.
  * @last_scheduled: points to the finished fence of the last scheduled job.
+ * @last_user: last group leader pushing a job into the entity.
  *
  * Entities will emit jobs in order to their corresponding hardware
  * ring, and the scheduler will alternate between entities based on
@@ -85,6 +86,7 @@ struct drm_sched_entity {
 	struct dma_fence_cb		cb;
 	atomic_t			*guilty;
 	struct dma_fence		*last_scheduled;
+	struct task_struct		*last_user;
 };
 
 /**
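In plain terms, the check this patch adds to drm_sched_entity_flush() behaves like the sketch below (a simplified, self-contained restatement of the hunks above, not the literal kernel code):

static bool sketch_may_stop_entity(struct drm_sched_entity *entity)
{
	struct task_struct *last_user;

	/* Atomically clear last_user, but only if we are still the last
	 * task that pushed a job into this entity. */
	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);

	/* Somebody else submitted after us: leave the entity alone. */
	if (last_user && last_user != current->group_leader)
		return false;

	/* Only a process that is exiting because of a SIGKILL may stop
	 * the entity before its queue has drained. */
	return (current->flags & PF_EXITING) &&
	       (current->exit_code == SIGKILL);
}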
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup.
Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/gpu_scheduler.c | 35 +++++++------------------------
 1 file changed, 8 insertions(+), 27 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index f563e4fbb4b6..1b733229201e 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -198,21 +198,6 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
 }
 EXPORT_SYMBOL(drm_sched_entity_init);
 
-/**
- * drm_sched_entity_is_initialized - Query if entity is initialized
- *
- * @sched: Pointer to scheduler instance
- * @entity: The pointer to a valid scheduler entity
- *
- * return true if entity is initialized, false otherwise
-*/
-static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
-					    struct drm_sched_entity *entity)
-{
-	return entity->rq != NULL &&
-		entity->rq->sched == sched;
-}
-
 /**
  * drm_sched_entity_is_idle - Check if entity is idle
  *
@@ -224,7 +209,8 @@ static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
 {
 	rmb();
 
-	if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
+	if (list_empty(&entity->list) ||
+	    spsc_queue_peek(&entity->job_queue) == NULL)
 		return true;
 
 	return false;
@@ -279,8 +265,6 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 	long ret = timeout;
 
 	sched = entity->rq->sched;
-	if (!drm_sched_entity_is_initialized(sched, entity))
-		return ret;
 	/**
 	 * The client will not queue more IBs during this fini, consume existing
 	 * queued IBs or discard them on SIGKILL
@@ -299,7 +283,7 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
 	if ((!last_user || last_user == current->group_leader) &&
 	    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
-		drm_sched_entity_set_rq(entity, NULL);
+		drm_sched_rq_remove_entity(entity->rq, entity);
 
 	return ret;
 }
@@ -320,7 +304,7 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
 	struct drm_gpu_scheduler *sched;
 
 	sched = entity->rq->sched;
-	drm_sched_entity_set_rq(entity, NULL);
+	drm_sched_rq_remove_entity(entity->rq, entity);
 
 	/* Consumption of existing IBs wasn't completed. Forcefully
 	 * remove them here.
@@ -416,15 +400,12 @@ void drm_sched_entity_set_rq(struct drm_sched_entity *entity,
 	if (entity->rq == rq)
 		return;
 
-	spin_lock(&entity->rq_lock);
-
-	if (entity->rq)
-		drm_sched_rq_remove_entity(entity->rq, entity);
+	BUG_ON(!rq);
 
+	spin_lock(&entity->rq_lock);
+	drm_sched_rq_remove_entity(entity->rq, entity);
 	entity->rq = rq;
-	if (rq)
-		drm_sched_rq_add_entity(rq, entity);
-
+	drm_sched_rq_add_entity(rq, entity);
 	spin_unlock(&entity->rq_lock);
 }
 EXPORT_SYMBOL(drm_sched_entity_set_rq);
On Mon, Jul 30, 2018 at 4:33 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup.
Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
Good catch.
Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
On 07/30/2018 09:30 AM, Nayan Deshmukh wrote:
On Mon, Jul 30, 2018 at 4:33 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup. Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
Good catch.
Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
>> -	if (!drm_sched_entity_is_initialized(sched, entity))
>> -		return ret;
Just to be clear, you remove this function because it's redundant to call it?
Andrey
On 30.07.2018 at 22:51, Andrey Grodzovsky wrote:
On 07/30/2018 09:30 AM, Nayan Deshmukh wrote:
On Mon, Jul 30, 2018 at 4:33 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup. Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
Good catch.
Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
>> -	if (!drm_sched_entity_is_initialized(sched, entity))
>> -		return ret;
Just to be clear, you remove this function because it's redundant to call it?
Yes, exactly.
Christian.
Andrey
Change is Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Andrey
On 07/31/2018 02:50 AM, Christian König wrote:
On 30.07.2018 at 22:51, Andrey Grodzovsky wrote:
On 07/30/2018 09:30 AM, Nayan Deshmukh wrote:
On Mon, Jul 30, 2018 at 4:33 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup. Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
Good catch.
Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
>> -	if (!drm_sched_entity_is_initialized(sched, entity))
>> -		return ret;
Just to be clear, you remove this function because it's redundant to call it?
Yes, exactly.
Christian.
Andrey
Thinking again about this change and 53d5f1e ("drm/scheduler: move idle entities to scheduler with less load v2"):

Looks to me like the use case for which fc9a539 ("drm/scheduler: add NULL pointer check for run queue (v2)") was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more, since we stopped setting entity->rq to NULL in drm_sched_entity_flush.

Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission, so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.
What about adding a 'stopped' flag to drm_sched_entity, to be set in drm_sched_entity_flush, and having drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of 'if (!entity->rq)'? (See the sketch below.)
Andrey
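A rough sketch of what such a flag could look like (hypothetical, only to illustrate the proposal above; not a posted patch):

/* in struct drm_sched_entity: */
	bool				stopped;	/* entity accepts no new jobs */

/* in drm_sched_entity_flush(), where the entity leaves the run queue: */
	entity->stopped = true;
	drm_sched_rq_remove_entity(entity->rq, entity);

/* in drm_sched_entity_push_job(), replacing the old '!entity->rq' check: */
	if (entity->stopped) {
		DRM_ERROR("Trying to push job to a stopped entity\n");
		return;
	}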
On 02.08.2018 at 00:25, Andrey Grodzovsky wrote:
Thinking again about this change and 53d5f1e ("drm/scheduler: move idle entities to scheduler with less load v2"):

Looks to me like the use case for which fc9a539 ("drm/scheduler: add NULL pointer check for run queue (v2)") was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more, since we stopped setting entity->rq to NULL in drm_sched_entity_flush.
Good point, going to remove that.
Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
That is actually desired.
When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
Christian.
so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.

What about adding a 'stopped' flag to drm_sched_entity, to be set in drm_sched_entity_flush, and having drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of 'if (!entity->rq)'?
Andrey
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
On 02.08.2018 at 00:25, Andrey Grodzovsky wrote:
Thinking again about this change and 53d5f1e ("drm/scheduler: move idle entities to scheduler with less load v2"):

Looks to me like the use case for which fc9a539 ("drm/scheduler: add NULL pointer check for run queue (v2)") was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more, since we stopped setting entity->rq to NULL in drm_sched_entity_flush.
Good point, going to remove that.
Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired.
When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
I am not aware of the use case so I might just be rambling, but if the thread/process that created the entity has called drm_sched_entity_flush then we shouldn't allow other processes to push jobs to that entity.
Nayan
Christian.
so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.

What about adding a 'stopped' flag to drm_sched_entity, to be set in drm_sched_entity_flush, and having drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of 'if (!entity->rq)'?
Andrey
On 02.08.2018 at 08:47, Nayan Deshmukh wrote:
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
> On 02.08.2018 at 00:25, Andrey Grodzovsky wrote:
> > Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
I am not aware of the use case so I might just be rambling, but if the thread/process that created the entity has called drm_sched_entity_flush then we shouldn't allow other processes to push jobs to that entity.
Why not?
Calling flush() only means that we should wait for all waiting jobs to be sent to the hardware. It doesn't mean that we need to stop accepting new jobs.
It is perfectly possible that a child process has inherited a file descriptor from its parent and calls flush because it exits, while the parent is still using the file descriptor.
The only tricky part is how to handle it when a process is killed, but I think we came to a good conclusion there as well.
We remove the entity from the scheduling list when the last process that submitted jobs is killed, or, if there is no such process any more, when the current flushing process is killed.
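A minimal userspace illustration of that inherited-fd case (a sketch; the render-node path and the assumption that closing the fd triggers the driver's flush callback are illustrative, not part of the patch):

#include <fcntl.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* Parent opens the GPU; a context/entity lives behind this fd. */
	int fd = open("/dev/dri/renderD128", O_RDWR);
	if (fd < 0)
		exit(1);

	if (fork() == 0) {
		/* The child inherited fd. Exiting closes its copy, which
		 * invokes the driver's flush callback -- but the parent
		 * still uses the fd, so the entity must stay on its rq. */
		_exit(0);
	}
	wait(NULL);

	/* parent keeps submitting jobs through fd here ... */

	close(fd);	/* last user gone: teardown is now appropriate */
	return 0;
}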
Christian.
Nayan
On 08/02/2018 02:47 AM, Nayan Deshmukh wrote:
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
> On 02.08.2018 at 00:25, Andrey Grodzovsky wrote:
> > First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more, since we stopped setting entity->rq to NULL in drm_sched_entity_flush.
> Good point, going to remove that.
So basically we don't need that 'if (...) { return; }' in drm_sched_entity_push_job any more?
> > Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
Yes, no problem if it's another process. But what about another thread from the same process? Is it a possible use case that two threads from the same process submit to the same entity concurrently? If so, and we specifically kill one, the other will not stop even if we want it to, because the current code lets it just acquire a run queue for itself.
I am not aware of the use case so I might just be rambling, but if the thread/process that created the entity has called drm_sched_entity_flush then we shouldn't allow other processes to push jobs to that entity.
Nayan
Christian.
We don't really know who created the entity; the creation could be by X itself, and then it's passed to another process for use. Check 'drm/scheduler: only kill entity if last user is killed v2': the idea is that if, by the time you want to kill this entity, another process (not another thread from your process, i.e. current->group_leader != last_user in drm_sched_entity_flush) has already started to use this entity, just let it be.
Andrey
On 02.08.2018 at 16:11, Andrey Grodzovsky wrote:
On 08/02/2018 02:47 AM, Nayan Deshmukh wrote:
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
> Good point, going to remove that.
So basically we don't need that 'if (...) { return; }' in drm_sched_entity_push_job any more?
Yes, exactly.
> > Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
Yes, no problem if it's another process. But what about another thread from the same process? Is it a possible use case that two threads from the same process submit to the same entity concurrently? If so, and we specifically kill one, the other will not stop even if we want it to, because the current code lets it just acquire a run queue for itself.
Well, you can't kill a single thread of a process (you can only interrupt it); a SIGKILL will always kill the whole process.
I am not aware of the use case so I might just be rambling, but if the thread/process that created the entity has called drm_sched_entity_flush then we shouldn't allow other processes to push jobs to that entity.
Nayan
Christian.
We don't really know who created the entity; the creation could be by X itself, and then it's passed to another process for use. Check 'drm/scheduler: only kill entity if last user is killed v2': the idea is that if, by the time you want to kill this entity, another process (not another thread from your process, i.e. current->group_leader != last_user in drm_sched_entity_flush) has already started to use this entity, just let it be.
Yes, exactly that's the idea.
Christian.
Andrey
On 08/03/2018 10:06 AM, Christian König wrote:
On 02.08.2018 at 16:11, Andrey Grodzovsky wrote:
On 08/02/2018 02:47 AM, Nayan Deshmukh wrote:
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
> Good point, going to remove that.
So basically we don't need that 'if (...) { return; }' in drm_sched_entity_push_job any more?
Yes, exactly.
> > Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
Yes, no problem if it's another process. But what about another thread from the same process? Is it a possible use case that two threads from the same process submit to the same entity concurrently? If so, and we specifically kill one, the other will not stop even if we want it to, because the current code lets it just acquire a run queue for itself.
Well, you can't kill a single thread of a process (you can only interrupt it); a SIGKILL will always kill the whole process.
Is the following scenario possible and acceptable? Two threads from the same process work on the same queue: thread A is currently in drm_sched_entity_flush->wait_event_timeout (the process is getting shut down because of a SIGKILL sent by the user), while thread B is still inside drm_sched_entity_push_job, before 'if (reschedule)'. 'A' stops waiting because the queue became empty and then removes the entity's queue from the scheduler's run queue, while 'B' goes inside 'reschedule' because it evaluates to true ('first' is true, plus all the rest of the conditions), acquires a new rq, later adds it back to a scheduler (maybe a different one), and keeps submitting jobs as much as it likes; it can then be stuck for up to 'timeout' time in its own drm_sched_entity_flush waiting for them. (See the interleaving sketch below.)
My understanding was that the introduction of entity->last_user is to force immediate termination of job submissions by any thread from the terminating process.
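The interleaving in question, laid out step by step (function names as used in this thread; the timing itself is hypothetical):

	Thread A (SIGKILL'ed process,          Thread B (same process,
	in drm_sched_entity_flush)             in drm_sched_entity_push_job)
	-----------------------------          ------------------------------------
	wait_event_timeout() returns,
	job_queue drained
	                                       first = spsc_queue_push(...)  -> true
	cmpxchg(&entity->last_user, ...)
	drm_sched_rq_remove_entity(...)
	                                       if (reschedule) {  /* evaluates true */
	                                           rq = drm_sched_entity_get_free_sched(...);
	                                           drm_sched_entity_set_rq(entity, rq);
	                                       }
	                                       /* entity is back on a run queue and
	                                        * keeps accepting new jobs */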
Andrey
Am 03.08.2018 um 16:54 schrieb Andrey Grodzovsky:
[SNIP]
> Second of all, even after we removed the entity from rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is the right thing to do, because the entity is still in use.
Yes, no problem if it's another process. But what about another thread from the same process? Is it a possible use case that two threads from the same process submit to the same entity concurrently? If so, and we specifically kill one, the other will not stop even if we want it to, because the current code lets it just acquire a new rq for itself.
Well you can't kill a single thread of a process (you can only interrupt it), a SIGKILL will always kill the whole process.
Is the following scenario possible and acceptable? Two threads from the same process work on the same queue. Thread A is currently in drm_sched_entity_flush->wait_event_timeout (the process is getting shut down because of a SIGKILL sent by the user), while thread B is still inside drm_sched_entity_push_job before 'if (reschedule)'. A stops waiting because the queue became empty and removes the entity from the scheduler's run queue, while B goes into the 'reschedule' path because it evaluates to true ('first' is true, and so are all the other conditions), acquires a new rq, later adds the entity back to a scheduler (possibly a different one), and keeps submitting jobs as much as it likes, and can then be stuck for up to 'timeout' in its own drm_sched_entity_flush waiting for them.
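Spelled out as an interleaving (a sketch of the suspected race, not a confirmed trace; 'reschedule' refers to the load-balancing path in drm_sched_entity_push_job):

	Thread A (flush, SIGKILL)               Thread B (push_job)
	-------------------------               -------------------
	wait_event_timeout(queue idle)
	                                        idle, first, reschedule all true
	queue drains -> wakes up
	drm_sched_rq_remove_entity(rq, e)
	                                        rq = drm_sched_entity_get_free_sched(e)
	                                        drm_sched_entity_set_rq(e, rq)   /* entity re-added */
	                                        spsc_queue_push(...)             /* keeps submitting */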
I'm not 100% sure but I don't think that can happen.
See flushing the fd is done while dropping the fd, which happens only after all threads of the process in question are killed.
Otherwise the flushing wouldn't make too much sense. In other words, imagine an application where a thread doing a write() on an fd is killed.
The idea of the flush is to preserve the data and that won't work if that isn't correctly ordered.
My understanding was that the introduction of entity->last_user is to force immediate termination of job submissions by any thread from the terminating process.
We could consider reordering that once more. Going to play out all scenarios in my head over the weekend :)
Christian.
On 08/06/2018 08:44 AM, Christian König wrote:
[SNIP]

> See flushing the fd is done while dropping the fd, which happens only after all threads of the process in question are killed.
Yea, this FD handling is indeed a lot of gray area for me, but as far as I remember flushing is done per thread when it exits (possibly due to a signal). Now, signal interception and processing (as a result of which .flush will get called if SIGKILL was received) is done at certain points, among them when returning from an IOCTL. So if the first thread was at the very end of the CS ioctl when SIGKILL was received while the other one was at the beginning, then I think we might see something like the scenario above.
Andrey
Am 03.08.2018 um 20:41 schrieb Andrey Grodzovsky:
[SNIP]

> So if the first thread was at the very end of the CS ioctl when SIGKILL was received while the other one was at the beginning, then I think we might see something like the scenario above.
SIGKILL isn't processed as long as any thread of the application is still inside the kernel. That's why we have wait_event_killable().
So I don't think that the scenario above is possible, but I'm really not 100% sure either.
Christian.
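For reference, the waiting logic under discussion has roughly this shape (a simplified sketch of the drm_sched_entity_flush structure, not the verbatim code):

	#include <linux/sched.h>
	#include <linux/wait.h>

	/* Sketch: a task already marked PF_EXITING cannot be woken by
	 * signals anymore, so it waits with a timeout; a live task waits in
	 * TASK_KILLABLE, where only a fatal signal (SIGKILL for the whole
	 * thread group) interrupts the wait. */
	static long flush_sketch(wait_queue_head_t *wq, bool (*idle)(void *arg),
				 void *arg, long timeout)
	{
		long ret = timeout;

		if (current->flags & PF_EXITING) {
			if (timeout)
				ret = wait_event_timeout(*wq, idle(arg), timeout);
		} else {
			wait_event_killable(*wq, idle(arg));
		}
		return ret;
	}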
On 08/06/2018 04:14 AM, Christian König wrote:
[SNIP]

> SIGKILL isn't processed as long as any thread of the application is still inside the kernel. That's why we have wait_event_killable().
Can you tell me where this is happening? What I see in the code is that do_group_exit calls zap_other_threads, which just adds SIGKILL to the signal sets of the other threads in the group and sends a wake-up. Then do_exit will close all FDs for the current thread, and so .flush will be called; when the last thread drops its refcount for the FD, .release will be called.
Andrey
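Roughly the call chain being described (paraphrased from kernel/exit.c, kernel/signal.c and fs/file.c; exact details vary by kernel version):

	do_group_exit()
	  -> zap_other_threads()          /* queue SIGKILL for every sibling thread */
	do_exit()                         /* runs once per exiting thread */
	  -> exit_files()
	    -> put_files_struct()
	      -> close_files()
	        -> filp_close()
	          -> filp->f_op->flush()  /* .flush, called at every close */
	/* when the last reference to the struct file is dropped: */
	fput() -> ... -> f_op->release()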
Am 09.08.2018 um 19:39 schrieb Andrey Grodzovsky:
[SNIP]
SIGKILL isn't processed as long as any thread of the application is still inside the kernel. That's why we have wait_event_killable().
Can you tell me where this is happening? What I see in the code is that do_group_exit calls zap_other_threads, which just adds SIGKILL to the signal sets of the other threads in the group and sends a wake-up. Then do_exit will close all FDs for the current thread, and so .flush will be called; when the last thread drops its refcount for the FD, .release will be called.
Good question, I have not the slightest idea.
Killed processes certainly don't die until all threads return from kernel space, but I'm not 100% sure whether that happens before or after the flush.
Christian.
Hi Andrey,
Good catch. I guess since both Christian and I were working on it in parallel we didn't catch this problem.
If it is only causing a problem in push_job then we can handle it something like this:
----------------------------------------------------------------------------------------------------------------------------
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index 05dc6ecd4003..f344ce32f128 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -563,6 +563,11 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
 	first = spsc_queue_count(&entity->job_queue) == 0;
 	reschedule = idle && first && (entity->num_rq_list > 1);
 
+	if (first && list_empty(&entity->list)) {
+		DRM_ERROR("Trying to push to a killed entity\n");
+		return;
+	}
+
 	if (reschedule) {
 		rq = drm_sched_entity_get_free_sched(entity);
 		spin_lock(&entity->rq_lock);
@@ -580,11 +585,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
 	if (first) {
 		/* Add the entity to the run queue */
 		spin_lock(&entity->rq_lock);
-		if (!entity->rq) {
-			DRM_ERROR("Trying to push to a killed entity\n");
-			spin_unlock(&entity->rq_lock);
-			return;
-		}
 		drm_sched_rq_add_entity(entity->rq, entity);
 		spin_unlock(&entity->rq_lock);
 		drm_sched_wakeup(entity->rq->sched);
-----------------------------------------------------------------------------------------------------------------------------
On Thu, Aug 2, 2018 at 3:55 AM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
Thinking again about this change and 53d5f1e drm/scheduler: move idle entities to scheduler with less load v2
Looks to me like the use case for which fc9a539 drm/scheduler: add NULL pointer check for run queue (v2) was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more since we stopped setting entity->rq to NULL in drm_sched_entity_flush.
Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission, so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.
What about adding a 'stopped' flag to drm_sched_entity to be set in drm_sched_entity_flush, and in drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of 'if (!entity->rq)' ?

Andrey

But it might be worth adding a stopped flag field if it is required at other places as well.

Thanks,
Nayan
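A minimal sketch of what that 'stopped' flag could look like (hypothetical code, not a tested patch; it assumes the existing entity->rq_lock and the helpers named in this thread):

	/* new member in struct drm_sched_entity, protected by rq_lock: */
	bool				stopped;

	/* drm_sched_entity_flush(): stop the entity for good */
	spin_lock(&entity->rq_lock);
	entity->stopped = true;
	drm_sched_rq_remove_entity(entity->rq, entity);
	spin_unlock(&entity->rq_lock);

	/* drm_sched_entity_push_job(): reject new jobs up front */
	spin_lock(&entity->rq_lock);
	if (entity->stopped) {
		spin_unlock(&entity->rq_lock);
		DRM_ERROR("Trying to push to a killed entity\n");
		return;
	}
	spin_unlock(&entity->rq_lock);

Unlike the entity->rq or entity->list checks, the flag is one-way, so a concurrent submitter taking the reschedule path cannot accidentally resurrect the entity.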
On 07/30/2018 07:03 AM, Christian König wrote:
[SNIP]
On 08/02/2018 02:43 AM, Nayan Deshmukh wrote:
[SNIP]

> +	if (first && list_empty(&entity->list)) {
first might be false if the other process was interrupted by SIGKILL in the middle of wait_event_killable in drm_sched_entity_flush while there were still items in the queue. I don't see a good way around this besides adding a 'stopped' flag to drm_sched_entity.
Andrey
On Thu, Aug 2, 2018, 8:09 PM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
[SNIP]

> first might be false if the other process was interrupted by SIGKILL in the middle of wait_event_killable in drm_sched_entity_flush while there were still items in the queue. I don't see a good way around this besides adding a 'stopped' flag to drm_sched_entity.
Sorry I missed this mail. This case might happen but this was also not being handled previously.
Nayan
On 08/03/2018 08:12 AM, Nayan Deshmukh wrote:
[SNIP]

> Sorry I missed this mail. This case might happen but this was also not being handled previously.
The original code, before 'drm/scheduler: stop setting rq to NULL', did handle it, because you didn't use the queue's emptiness as a criterion for aborting the next push to the queue but rather checked for entity->rq != NULL.
Andrey
On Fri, Aug 3, 2018 at 7:17 PM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
[SNIP]

> The original code, before 'drm/scheduler: stop setting rq to NULL', did handle it, because you didn't use the queue's emptiness as a criterion for aborting the next push to the queue but rather checked for entity->rq != NULL.
But the (entity->rq != NULL) check was inside the body of if (first), so it was also dependent on the queue's emptiness earlier. In any case it doesn't matter now, as with Christian's patch we can remove the if condition altogether.
Nayan
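For context, the structure being discussed, reconstructed from the removed hunk of the patch quoted earlier (the !entity->rq check only ran when 'first' was true):

	if (first) {
		/* Add the entity to the run queue */
		spin_lock(&entity->rq_lock);
		if (!entity->rq) {
			DRM_ERROR("Trying to push to a killed entity\n");
			spin_unlock(&entity->rq_lock);
			return;
		}
		drm_sched_rq_add_entity(entity->rq, entity);
		spin_unlock(&entity->rq_lock);
		drm_sched_wakeup(entity->rq->sched);
	}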
Hi Christian,
The code looks good to me. But I was just wondering what will happen when the last user is killed and some other user tries to push to the entity.
Regards, Nayan Deshmukh
On Mon, Jul 30, 2018 at 4:33 PM Christian König < ckoenig.leichtzumerken@gmail.com> wrote:
[SNIP]
I believe that in this case the

	if (!entity->rq) {
		DRM_ERROR...
		return;
	}

clause will take place.
P.S. I remember we planned to actually propagate the error back to the caller, so I guess we should take care of this sooner or later.
The change is Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Andrey
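A sketch of what that propagation could look like (hypothetical; drm_sched_entity_push_job currently returns void, so every caller would need updating, and the error value is just a suggestion):

	/* Hypothetical variant that reports the failure instead of silently
	 * dropping the job when the entity has already been torn down. */
	int drm_sched_entity_push_job(struct drm_sched_job *sched_job,
				      struct drm_sched_entity *entity)
	{
		spin_lock(&entity->rq_lock);
		if (!entity->rq) {
			spin_unlock(&entity->rq_lock);
			return -ENOENT;	/* let the CS ioctl return this to userspace */
		}
		spin_unlock(&entity->rq_lock);

		/* ... existing push logic ... */
		return 0;
	}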
On 07/30/2018 09:34 AM, Nayan Deshmukh wrote:
[SNIP]
That makes sense. The change is Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
On Tue, Jul 31, 2018 at 2:12 AM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
[SNIP]