Dying process might be blocked from receiving any more signals so avoid using it.
Also retire enity->fini_status and just check the SW queue, if it's not empty do the fallback cleanup.
Also handle entity->last_scheduled == NULL use case which happens when HW ring is already hangged whem a new entity tried to enqeue jobs.
v2: Return the remaining timeout and use that as parameter for the next call. This way when we need to cleanup multiple queues we don't wait for the entire TO period for each queue but rather in total. Styling comments. Rebase.
Signed-off-by: Andrey Grodzovsky andrey.grodzovsky@amd.com --- drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 ++++++++++++++++++++++++------- include/drm/gpu_scheduler.h | 7 +-- 2 files changed, 61 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 8c1e80c..c594d17 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, entity->rq = rq; entity->sched = sched; entity->guilty = guilty; - entity->fini_status = 0; entity->last_scheduled = NULL;
spin_lock_init(&entity->rq_lock); @@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched, static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity) { rmb(); - if (spsc_queue_peek(&entity->job_queue) == NULL) + + if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL) return true;
return false; @@ -260,25 +260,48 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, * * @sched: scheduler instance * @entity: scheduler entity + * @timeout: time to wait in ms for Q to become empty. * * Splitting drm_sched_entity_fini() into two functions, The first one does the waiting, * removes the entity from the runqueue and returns an error when the process was killed. + * + * Returns amount of time spent in waiting for TO. + * 0 if wait wasn't with time out. + * MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with condition false + * Number of MS spent in waiting before condition became true + * */ -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, - struct drm_sched_entity *entity) +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, + struct drm_sched_entity *entity, unsigned timeout) { + unsigned ret = 0; + if (!drm_sched_entity_is_initialized(sched, entity)) return; /** * The client will not queue more IBs during this fini, consume existing * queued IBs or discard them on SIGKILL */ - if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL) - entity->fini_status = -ERESTARTSYS; - else - entity->fini_status = wait_event_killable(sched->job_scheduled, - drm_sched_entity_is_idle(entity)); - drm_sched_entity_set_rq(entity, NULL); + if (current->flags & PF_EXITING) { + if (timeout) { + ret = jiffies_to_msecs( + wait_event_timeout( + sched->job_scheduled, + drm_sched_entity_is_idle(entity), + msecs_to_jiffies(timeout))); + + if (!ret) + ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS; + } + } else + wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity)); + + + /* For killed process disable any more IBs enqueue right now */ + if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) + drm_sched_entity_set_rq(entity, NULL); + + return ret; } EXPORT_SYMBOL(drm_sched_entity_do_release);
@@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release); * * This should be called after @drm_sched_entity_do_release. It goes over the * entity and signals all jobs with an error code if the process was killed. + * */ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - if (entity->fini_status) { + + drm_sched_entity_set_rq(entity, NULL); + + /* Consumption of existing IBs wasn't completed. Forcefully + * remove them here. + */ + if (spsc_queue_peek(&entity->job_queue)) { struct drm_sched_job *job; int r;
@@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_fence *s_fence = job->s_fence; drm_sched_fence_scheduled(s_fence); dma_fence_set_error(&s_fence->finished, -ESRCH); - r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, - drm_sched_entity_kill_jobs_cb); - if (r == -ENOENT) + + /* + * When pipe is hanged by older entity, new entity might + * not even have chance to submit it's first job to HW + * and so entity->last_scheduled will remain NULL + */ + if (!entity->last_scheduled) { drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); - else if (r) - DRM_ERROR("fence add callback failed (%d)\n", r); + } else { + r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, + drm_sched_entity_kill_jobs_cb); + if (r == -ENOENT) + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); + else if (r) + DRM_ERROR("fence add callback failed (%d)\n", r); + } } }
@@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - drm_sched_entity_do_release(sched, entity); + drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS); drm_sched_entity_cleanup(sched, entity); } EXPORT_SYMBOL(drm_sched_entity_fini); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 496442f..af07875 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -27,6 +27,8 @@ #include <drm/spsc_queue.h> #include <linux/dma-fence.h>
+#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000 + struct drm_gpu_scheduler; struct drm_sched_rq;
@@ -84,7 +86,6 @@ struct drm_sched_entity { struct dma_fence *dependency; struct dma_fence_cb cb; atomic_t *guilty; - int fini_status; struct dma_fence *last_scheduled; };
@@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity, struct drm_sched_rq *rq, atomic_t *guilty); -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, - struct drm_sched_entity *entity); +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, + struct drm_sched_entity *entity, unsigned timeout); void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
With this we can now terminate jobs enqueue into SW queue the moment the task is being killed instead of waiting for last user of drm file to release it.
Also stop checking for kref_read(&ctx->refcount) == 1 when calling drm_sched_entity_do_release since other task might still hold a reference to this entity but we don't care since KILL means terminate job submission regardless of what other tasks are doing.
v2: Use returned remaining timeout as parameter for the next call. Rebase.
Signed-off-by: Andrey Grodzovsky andrey.grodzovsky@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 17 +++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 12 ++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 1 - 3 files changed, 23 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index c5bb362..cbf2fcc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -449,26 +449,31 @@ void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) struct amdgpu_ctx *ctx; struct idr *idp; uint32_t id, i; + unsigned ret; + unsigned max_wait = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS;
idp = &mgr->ctx_handles;
+ mutex_lock(&mgr->lock); idr_for_each_entry(idp, ctx, id) {
- if (!ctx->adev) + if (!ctx->adev) { + mutex_unlock(&mgr->lock); return; + }
for (i = 0; i < ctx->adev->num_rings; i++) {
if (ctx->adev->rings[i] == &ctx->adev->gfx.kiq.ring) continue;
- if (kref_read(&ctx->refcount) == 1) - drm_sched_entity_do_release(&ctx->adev->rings[i]->sched, - &ctx->rings[i].entity); - else - DRM_ERROR("ctx %p is still alive\n", ctx); + ret = drm_sched_entity_do_release(&ctx->adev->rings[i]->sched, + &ctx->rings[i].entity, max_wait); + + max_wait = max_wait >= ret ? max_wait - ret : 0; } } + mutex_unlock(&mgr->lock); }
void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index b0bf2f2..a549483 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -855,9 +855,21 @@ static const struct dev_pm_ops amdgpu_pm_ops = { .runtime_idle = amdgpu_pmops_runtime_idle, };
+static int amdgpu_flush(struct file *f, fl_owner_t id) +{ + struct drm_file *file_priv = f->private_data; + struct amdgpu_fpriv *fpriv = file_priv->driver_priv; + + amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr); + + return 0; +} + + static const struct file_operations amdgpu_driver_kms_fops = { .owner = THIS_MODULE, .open = drm_open, + .flush = amdgpu_flush, .release = drm_release, .unlocked_ioctl = amdgpu_drm_ioctl, .mmap = amdgpu_mmap, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index ca21549..1239384 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -930,7 +930,6 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, return;
pm_runtime_get_sync(dev->dev); - amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
if (adev->asic_type != CHIP_RAVEN) { amdgpu_uvd_free_handles(adev, file_priv);
Am 01.06.2018 um 19:11 schrieb Andrey Grodzovsky:
Dying process might be blocked from receiving any more signals so avoid using it.
Also retire enity->fini_status and just check the SW queue, if it's not empty do the fallback cleanup.
Also handle entity->last_scheduled == NULL use case which happens when HW ring is already hangged whem a new entity tried to enqeue jobs.
v2: Return the remaining timeout and use that as parameter for the next call. This way when we need to cleanup multiple queues we don't wait for the entire TO period for each queue but rather in total. Styling comments. Rebase.
Signed-off-by: Andrey Grodzovsky andrey.grodzovsky@amd.com
drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 ++++++++++++++++++++++++------- include/drm/gpu_scheduler.h | 7 +-- 2 files changed, 61 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 8c1e80c..c594d17 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, entity->rq = rq; entity->sched = sched; entity->guilty = guilty;
entity->fini_status = 0; entity->last_scheduled = NULL;
spin_lock_init(&entity->rq_lock);
@@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched, static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity) { rmb();
- if (spsc_queue_peek(&entity->job_queue) == NULL)
if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL) return true;
return false;
@@ -260,25 +260,48 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
- @sched: scheduler instance
- @entity: scheduler entity
- @timeout: time to wait in ms for Q to become empty.
- Splitting drm_sched_entity_fini() into two functions, The first one does the waiting,
- removes the entity from the runqueue and returns an error when the process was killed.
- Returns amount of time spent in waiting for TO.
- 0 if wait wasn't with time out.
- MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with condition false
- Number of MS spent in waiting before condition became true
*/
-void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
struct drm_sched_entity *entity)
+unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
struct drm_sched_entity *entity, unsigned timeout)
Better use long for return type and timeout.
{
- unsigned ret = 0;
Also use a long here and initialize it with timeout.
- if (!drm_sched_entity_is_initialized(sched, entity)) return; /**
*/
- The client will not queue more IBs during this fini, consume existing
- queued IBs or discard them on SIGKILL
- if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
entity->fini_status = -ERESTARTSYS;
- else
entity->fini_status = wait_event_killable(sched->job_scheduled,
drm_sched_entity_is_idle(entity));
- drm_sched_entity_set_rq(entity, NULL);
- if (current->flags & PF_EXITING) {
if (timeout) {
ret = jiffies_to_msecs(
wait_event_timeout(
sched->job_scheduled,
drm_sched_entity_is_idle(entity),
msecs_to_jiffies(timeout)));
Oh please don't use msecs as timeout, just use jiffies and let the caller do the conversion.
if (!ret)
ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS;
Why that? It is common coding style to return 0 when a timeout occurs.
Christian.
}
- } else
wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity));
- /* For killed process disable any more IBs enqueue right now */
- if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
drm_sched_entity_set_rq(entity, NULL);
- return ret; } EXPORT_SYMBOL(drm_sched_entity_do_release);
@@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release);
- This should be called after @drm_sched_entity_do_release. It goes over the
- entity and signals all jobs with an error code if the process was killed.
*/ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) {
- if (entity->fini_status) {
- drm_sched_entity_set_rq(entity, NULL);
- /* Consumption of existing IBs wasn't completed. Forcefully
* remove them here.
*/
- if (spsc_queue_peek(&entity->job_queue)) { struct drm_sched_job *job; int r;
@@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_fence *s_fence = job->s_fence; drm_sched_fence_scheduled(s_fence); dma_fence_set_error(&s_fence->finished, -ESRCH);
r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
drm_sched_entity_kill_jobs_cb);
if (r == -ENOENT)
/*
* When pipe is hanged by older entity, new entity might
* not even have chance to submit it's first job to HW
* and so entity->last_scheduled will remain NULL
*/
if (!entity->last_scheduled) { drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
else if (r)
DRM_ERROR("fence add callback failed (%d)\n", r);
} else {
r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
drm_sched_entity_kill_jobs_cb);
if (r == -ENOENT)
drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
else if (r)
DRM_ERROR("fence add callback failed (%d)\n", r);
} }}
@@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) {
- drm_sched_entity_do_release(sched, entity);
- drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS); drm_sched_entity_cleanup(sched, entity); } EXPORT_SYMBOL(drm_sched_entity_fini);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 496442f..af07875 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -27,6 +27,8 @@ #include <drm/spsc_queue.h> #include <linux/dma-fence.h>
+#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000
I suggest to use msecs_to_jiffies(1000) here and drop the _MS postfix.
Christian.
- struct drm_gpu_scheduler; struct drm_sched_rq;
@@ -84,7 +86,6 @@ struct drm_sched_entity { struct dma_fence *dependency; struct dma_fence_cb cb; atomic_t *guilty;
- int fini_status; struct dma_fence *last_scheduled; };
@@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity, struct drm_sched_rq *rq, atomic_t *guilty); -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
struct drm_sched_entity *entity);
+unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,struct drm_sched_entity *entity, unsigned timeout);
On 06/01/2018 01:22 PM, Christian König wrote:
Am 01.06.2018 um 19:11 schrieb Andrey Grodzovsky:
Dying process might be blocked from receiving any more signals so avoid using it.
Also retire enity->fini_status and just check the SW queue, if it's not empty do the fallback cleanup.
Also handle entity->last_scheduled == NULL use case which happens when HW ring is already hangged whem a new entity tried to enqeue jobs.
v2: Return the remaining timeout and use that as parameter for the next call. This way when we need to cleanup multiple queues we don't wait for the entire TO period for each queue but rather in total. Styling comments. Rebase.
Signed-off-by: Andrey Grodzovsky andrey.grodzovsky@amd.com
drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 ++++++++++++++++++++++++------- include/drm/gpu_scheduler.h | 7 +-- 2 files changed, 61 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 8c1e80c..c594d17 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, entity->rq = rq; entity->sched = sched; entity->guilty = guilty; - entity->fini_status = 0; entity->last_scheduled = NULL; spin_lock_init(&entity->rq_lock); @@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched, static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity) { rmb(); - if (spsc_queue_peek(&entity->job_queue) == NULL)
+ if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL) return true; return false; @@ -260,25 +260,48 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, * * @sched: scheduler instance * @entity: scheduler entity
- @timeout: time to wait in ms for Q to become empty.
* * Splitting drm_sched_entity_fini() into two functions, The first one does the waiting, * removes the entity from the runqueue and returns an error when the process was killed.
- Returns amount of time spent in waiting for TO.
- 0 if wait wasn't with time out.
- MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with condition
false
- Number of MS spent in waiting before condition became true
*/ -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, - struct drm_sched_entity *entity) +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, + struct drm_sched_entity *entity, unsigned timeout)
Better use long for return type and timeout.
{ + unsigned ret = 0;
Also use a long here and initialize it with timeout.
Please see bellow
if (!drm_sched_entity_is_initialized(sched, entity)) return; /** * The client will not queue more IBs during this fini, consume existing * queued IBs or discard them on SIGKILL */ - if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL) - entity->fini_status = -ERESTARTSYS; - else - entity->fini_status = wait_event_killable(sched->job_scheduled, - drm_sched_entity_is_idle(entity)); - drm_sched_entity_set_rq(entity, NULL); + if (current->flags & PF_EXITING) { + if (timeout) { + ret = jiffies_to_msecs( + wait_event_timeout( + sched->job_scheduled, + drm_sched_entity_is_idle(entity), + msecs_to_jiffies(timeout)));
Oh please don't use msecs as timeout, just use jiffies and let the caller do the conversion.
+ if (!ret) + ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS;
Why that? It is common coding style to return 0 when a timeout occurs.
Christian.
What should i return when i do wait_event_killable, it's return values are opposite to wait_event_timeout... This way returning 0 has no impact on remaining waiting time, dong it the other way will force the caller to do some cumbersome logic instead of just
max_wait = max_wait >= ret ? max_wait - ret : 0;
like in amdgpu_ctx_mgr_entity_fini
Andrey
+ } + } else + wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity));
+ /* For killed process disable any more IBs enqueue right now */ + if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) + drm_sched_entity_set_rq(entity, NULL);
+ return ret; } EXPORT_SYMBOL(drm_sched_entity_do_release); @@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release); * * This should be called after @drm_sched_entity_do_release. It goes over the * entity and signals all jobs with an error code if the process was killed.
*/ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - if (entity->fini_status) {
+ drm_sched_entity_set_rq(entity, NULL);
+ /* Consumption of existing IBs wasn't completed. Forcefully + * remove them here. + */ + if (spsc_queue_peek(&entity->job_queue)) { struct drm_sched_job *job; int r; @@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_fence *s_fence = job->s_fence; drm_sched_fence_scheduled(s_fence); dma_fence_set_error(&s_fence->finished, -ESRCH); - r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, - drm_sched_entity_kill_jobs_cb); - if (r == -ENOENT)
+ /* + * When pipe is hanged by older entity, new entity might + * not even have chance to submit it's first job to HW + * and so entity->last_scheduled will remain NULL + */ + if (!entity->last_scheduled) { drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); - else if (r) - DRM_ERROR("fence add callback failed (%d)\n", r); + } else { + r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, + drm_sched_entity_kill_jobs_cb); + if (r == -ENOENT) + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); + else if (r) + DRM_ERROR("fence add callback failed (%d)\n", r); + } } } @@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - drm_sched_entity_do_release(sched, entity); + drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS); drm_sched_entity_cleanup(sched, entity); } EXPORT_SYMBOL(drm_sched_entity_fini); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 496442f..af07875 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -27,6 +27,8 @@ #include <drm/spsc_queue.h> #include <linux/dma-fence.h> +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000
I suggest to use msecs_to_jiffies(1000) here and drop the _MS postfix.
Christian.
struct drm_gpu_scheduler; struct drm_sched_rq; @@ -84,7 +86,6 @@ struct drm_sched_entity { struct dma_fence *dependency; struct dma_fence_cb cb; atomic_t *guilty; - int fini_status; struct dma_fence *last_scheduled; }; @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity, struct drm_sched_rq *rq, atomic_t *guilty); -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, - struct drm_sched_entity *entity); +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, + struct drm_sched_entity *entity, unsigned timeout); void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
Am 01.06.2018 um 21:56 schrieb Andrey Grodzovsky:
On 06/01/2018 01:22 PM, Christian König wrote:
Am 01.06.2018 um 19:11 schrieb Andrey Grodzovsky:
Dying process might be blocked from receiving any more signals so avoid using it.
Also retire enity->fini_status and just check the SW queue, if it's not empty do the fallback cleanup.
Also handle entity->last_scheduled == NULL use case which happens when HW ring is already hangged whem a new entity tried to enqeue jobs.
v2: Return the remaining timeout and use that as parameter for the next call. This way when we need to cleanup multiple queues we don't wait for the entire TO period for each queue but rather in total. Styling comments. Rebase.
Signed-off-by: Andrey Grodzovsky andrey.grodzovsky@amd.com
drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 ++++++++++++++++++++++++------- include/drm/gpu_scheduler.h | 7 +-- 2 files changed, 61 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 8c1e80c..c594d17 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, entity->rq = rq; entity->sched = sched; entity->guilty = guilty; - entity->fini_status = 0; entity->last_scheduled = NULL; spin_lock_init(&entity->rq_lock); @@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched, static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity) { rmb(); - if (spsc_queue_peek(&entity->job_queue) == NULL)
+ if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL) return true; return false; @@ -260,25 +260,48 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, * * @sched: scheduler instance * @entity: scheduler entity
- @timeout: time to wait in ms for Q to become empty.
* * Splitting drm_sched_entity_fini() into two functions, The first one does the waiting, * removes the entity from the runqueue and returns an error when the process was killed.
- Returns amount of time spent in waiting for TO.
- 0 if wait wasn't with time out.
- MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS if wait timed out with
condition false
- Number of MS spent in waiting before condition became true
*/ -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, - struct drm_sched_entity *entity) +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, + struct drm_sched_entity *entity, unsigned timeout)
Better use long for return type and timeout.
{ + unsigned ret = 0;
Also use a long here and initialize it with timeout.
Please see bellow
if (!drm_sched_entity_is_initialized(sched, entity)) return; /** * The client will not queue more IBs during this fini, consume existing * queued IBs or discard them on SIGKILL */ - if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL) - entity->fini_status = -ERESTARTSYS; - else - entity->fini_status = wait_event_killable(sched->job_scheduled, - drm_sched_entity_is_idle(entity)); - drm_sched_entity_set_rq(entity, NULL); + if (current->flags & PF_EXITING) { + if (timeout) { + ret = jiffies_to_msecs( + wait_event_timeout( + sched->job_scheduled, + drm_sched_entity_is_idle(entity), + msecs_to_jiffies(timeout)));
Oh please don't use msecs as timeout, just use jiffies and let the caller do the conversion.
+ if (!ret) + ret = MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS;
Why that? It is common coding style to return 0 when a timeout occurs.
Christian.
What should i return when i do wait_event_killable, it's return values are opposite to wait_event_timeout...
Just the unmodified timeout. The timeout should begin only after the process is killed.
This way returning 0 has no impact on remaining waiting time, dong it the other way will force the caller to do some cumbersome logic instead of just
max_wait = max_wait >= ret ? max_wait - ret : 0;
like in amdgpu_ctx_mgr_entity_fini
Hui? Why do you want to fiddle with the max_wait here all together?
The usual pattern of using timeouts with multiple wait_event_timeout calls is the following:
timeout = MAX_TIMEOUT; while (more_events_to_handle) { timeout = wait_event_timeout(... timeout); if (timeout == 0) break; }
if (timeout == 0) we_timeout_out_waiting_for_all_events();
Christian.
Andrey
+ } + } else + wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity));
+ /* For killed process disable any more IBs enqueue right now */ + if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) + drm_sched_entity_set_rq(entity, NULL);
+ return ret; } EXPORT_SYMBOL(drm_sched_entity_do_release); @@ -290,11 +313,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release); * * This should be called after @drm_sched_entity_do_release. It goes over the * entity and signals all jobs with an error code if the process was killed.
*/ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - if (entity->fini_status) {
+ drm_sched_entity_set_rq(entity, NULL);
+ /* Consumption of existing IBs wasn't completed. Forcefully + * remove them here. + */ + if (spsc_queue_peek(&entity->job_queue)) { struct drm_sched_job *job; int r; @@ -314,12 +344,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_fence *s_fence = job->s_fence; drm_sched_fence_scheduled(s_fence); dma_fence_set_error(&s_fence->finished, -ESRCH); - r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, - drm_sched_entity_kill_jobs_cb); - if (r == -ENOENT)
+ /* + * When pipe is hanged by older entity, new entity might + * not even have chance to submit it's first job to HW + * and so entity->last_scheduled will remain NULL + */ + if (!entity->last_scheduled) { drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); - else if (r) - DRM_ERROR("fence add callback failed (%d)\n", r); + } else { + r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb,
- drm_sched_entity_kill_jobs_cb);
+ if (r == -ENOENT) + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); + else if (r) + DRM_ERROR("fence add callback failed (%d)\n", r); + } } } @@ -339,7 +379,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - drm_sched_entity_do_release(sched, entity); + drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS); drm_sched_entity_cleanup(sched, entity); } EXPORT_SYMBOL(drm_sched_entity_fini); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 496442f..af07875 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -27,6 +27,8 @@ #include <drm/spsc_queue.h> #include <linux/dma-fence.h> +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY_MS 1000
I suggest to use msecs_to_jiffies(1000) here and drop the _MS postfix.
Christian.
struct drm_gpu_scheduler; struct drm_sched_rq; @@ -84,7 +86,6 @@ struct drm_sched_entity { struct dma_fence *dependency; struct dma_fence_cb cb; atomic_t *guilty; - int fini_status; struct dma_fence *last_scheduled; }; @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity, struct drm_sched_rq *rq, atomic_t *guilty); -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, - struct drm_sched_entity *entity); +unsigned drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, + struct drm_sched_entity *entity, unsigned timeout); void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
dri-devel@lists.freedesktop.org