Note which task is using the entity and only kill it if the last user of the entity is killed. This should prevent problems when entities are leaked to child processes.
v2: add missing kernel doc
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/gpu_scheduler.c | 6 +++++-
 include/drm/gpu_scheduler.h               | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index 3f2fc5e8242a..f563e4fbb4b6 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -275,6 +275,7 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
 long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 {
 	struct drm_gpu_scheduler *sched;
+	struct task_struct *last_user;
 	long ret = timeout;
 
 	sched = entity->rq->sched;
@@ -295,7 +296,9 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 
 	/* For killed process disable any more IBs enqueue right now */
-	if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
+	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
+	if ((!last_user || last_user == current->group_leader) &&
+	    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
 		drm_sched_entity_set_rq(entity, NULL);
 
 	return ret;
@@ -541,6 +544,7 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
 
 	trace_drm_sched_job(sched_job, entity);
 
+	WRITE_ONCE(entity->last_user, current->group_leader);
 	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 
 	/* first job wakes up scheduler */
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 091b9afcd184..21c648b0b2a1 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -66,6 +66,7 @@ enum drm_sched_priority {
  * @guilty: points to ctx's guilty.
  * @fini_status: contains the exit status in case the process was signalled.
  * @last_scheduled: points to the finished fence of the last scheduled job.
+ * @last_user: last group leader pushing a job into the entity.
  *
  * Entities will emit jobs in order to their corresponding hardware
  * ring, and the scheduler will alternate between entities based on
@@ -85,6 +86,7 @@ struct drm_sched_entity {
 	struct dma_fence_cb		cb;
 	atomic_t			*guilty;
 	struct dma_fence		*last_scheduled;
+	struct task_struct		*last_user;
 };
 
 /**
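In plain terms, the check this patch adds to drm_sched_entity_flush() behaves like the sketch below (a simplified, self-contained restatement of the hunks above, not the literal kernel code):

static bool sketch_may_stop_entity(struct drm_sched_entity *entity)
{
	struct task_struct *last_user;

	/* Atomically clear last_user, but only if we are still the last
	 * task that pushed a job into this entity. */
	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);

	/* Somebody else submitted after us: leave the entity alone. */
	if (last_user && last_user != current->group_leader)
		return false;

	/* Only a process that is exiting because of a SIGKILL may stop
	 * the entity before its queue has drained. */
	return (current->flags & PF_EXITING) &&
	       (current->exit_code == SIGKILL);
}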
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup.
Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/gpu_scheduler.c | 35 +++++++------------------------
 1 file changed, 8 insertions(+), 27 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index f563e4fbb4b6..1b733229201e 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -198,21 +198,6 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
 }
 EXPORT_SYMBOL(drm_sched_entity_init);
 
-/**
- * drm_sched_entity_is_initialized - Query if entity is initialized
- *
- * @sched: Pointer to scheduler instance
- * @entity: The pointer to a valid scheduler entity
- *
- * return true if entity is initialized, false otherwise
-*/
-static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched,
-					    struct drm_sched_entity *entity)
-{
-	return entity->rq != NULL &&
-		entity->rq->sched == sched;
-}
-
 /**
  * drm_sched_entity_is_idle - Check if entity is idle
  *
@@ -224,7 +209,8 @@ static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
 {
 	rmb();
 
-	if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL)
+	if (list_empty(&entity->list) ||
+	    spsc_queue_peek(&entity->job_queue) == NULL)
 		return true;
 
 	return false;
@@ -279,8 +265,6 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 	long ret = timeout;
 
 	sched = entity->rq->sched;
-	if (!drm_sched_entity_is_initialized(sched, entity))
-		return ret;
 	/**
 	 * The client will not queue more IBs during this fini, consume existing
 	 * queued IBs or discard them on SIGKILL
@@ -299,7 +283,7 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
 	if ((!last_user || last_user == current->group_leader) &&
 	    (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
-		drm_sched_entity_set_rq(entity, NULL);
+		drm_sched_rq_remove_entity(entity->rq, entity);
 
 	return ret;
 }
@@ -320,7 +304,7 @@ void drm_sched_entity_fini(struct drm_sched_entity *entity)
 	struct drm_gpu_scheduler *sched;
 
 	sched = entity->rq->sched;
-	drm_sched_entity_set_rq(entity, NULL);
+	drm_sched_rq_remove_entity(entity->rq, entity);
 
 	/* Consumption of existing IBs wasn't completed. Forcefully
 	 * remove them here.
@@ -416,15 +400,12 @@ void drm_sched_entity_set_rq(struct drm_sched_entity *entity,
 	if (entity->rq == rq)
 		return;
 
-	spin_lock(&entity->rq_lock);
-
-	if (entity->rq)
-		drm_sched_rq_remove_entity(entity->rq, entity);
+	BUG_ON(!rq);
 
+	spin_lock(&entity->rq_lock);
+	drm_sched_rq_remove_entity(entity->rq, entity);
 	entity->rq = rq;
-	if (rq)
-		drm_sched_rq_add_entity(rq, entity);
-
+	drm_sched_rq_add_entity(rq, entity);
 	spin_unlock(&entity->rq_lock);
 }
 EXPORT_SYMBOL(drm_sched_entity_set_rq);
On Mon, Jul 30, 2018 at 4:33 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup.
Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
Good catch.
Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
On 07/30/2018 09:30 AM, Nayan Deshmukh wrote:
On Mon, Jul 30, 2018 at 4:33 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup. Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
Good catch.
Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
>> -	if (!drm_sched_entity_is_initialized(sched, entity))
>> -		return ret;
Just to be clear, you remove this function because it's redundant to call it?
Andrey
On 30.07.2018 at 22:51, Andrey Grodzovsky wrote:
On 07/30/2018 09:30 AM, Nayan Deshmukh wrote:
On Mon, Jul 30, 2018 at 4:33 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup. Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
Good catch.
Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
>> -	if (!drm_sched_entity_is_initialized(sched, entity))
>> -		return ret;
Just to be clear, you remove this function because it's redundant to call it?
Yes, exactly.
Christian.
Andrey
Change is Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Andrey
On 07/31/2018 02:50 AM, Christian König wrote:
On 30.07.2018 at 22:51, Andrey Grodzovsky wrote:
On 07/30/2018 09:30 AM, Nayan Deshmukh wrote:
On Mon, Jul 30, 2018 at 4:33 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
We removed the redundancy of having an extra scheduler field, so we can't set the rq to NULL any more; otherwise we won't know which scheduler to use for the cleanup. Just remove the entity from the scheduling list instead.
Signed-off-by: Christian König <christian.koenig@amd.com>
Good catch.
Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
>> -	if (!drm_sched_entity_is_initialized(sched, entity))
>> -		return ret;
Just to be clear, you remove this function because it's redundant to call it?
Yes, exactly.
Christian.
Andrey
Thinking again about this change and 53d5f1e ("drm/scheduler: move idle entities to scheduler with less load v2"):

Looks to me like the use case for which fc9a539 ("drm/scheduler: add NULL pointer check for run queue (v2)") was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more, since we stopped setting entity->rq to NULL in drm_sched_entity_flush.

Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission, so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.
What about adding a 'stopped' flag to drm_sched_entity, to be set in drm_sched_entity_flush, and having drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of 'if (!entity->rq)'? (See the sketch below.)
Andrey
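A rough sketch of what such a flag could look like (hypothetical, only to illustrate the proposal above; not a posted patch):

/* in struct drm_sched_entity: */
	bool				stopped;	/* entity accepts no new jobs */

/* in drm_sched_entity_flush(), where the entity leaves the run queue: */
	entity->stopped = true;
	drm_sched_rq_remove_entity(entity->rq, entity);

/* in drm_sched_entity_push_job(), replacing the old '!entity->rq' check: */
	if (entity->stopped) {
		DRM_ERROR("Trying to push job to a stopped entity\n");
		return;
	}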
On 02.08.2018 at 00:25, Andrey Grodzovsky wrote:
Thinking again about this change and 53d5f1e ("drm/scheduler: move idle entities to scheduler with less load v2"):

Looks to me like the use case for which fc9a539 ("drm/scheduler: add NULL pointer check for run queue (v2)") was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more, since we stopped setting entity->rq to NULL in drm_sched_entity_flush.
Good point, going to remove that.
Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
That is actually desired.
When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
Christian.
so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.

What about adding a 'stopped' flag to drm_sched_entity, to be set in drm_sched_entity_flush, and having drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of 'if (!entity->rq)'?
Andrey
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
On 02.08.2018 at 00:25, Andrey Grodzovsky wrote:
Thinking again about this change and 53d5f1e ("drm/scheduler: move idle entities to scheduler with less load v2"):

Looks to me like the use case for which fc9a539 ("drm/scheduler: add NULL pointer check for run queue (v2)") was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more, since we stopped setting entity->rq to NULL in drm_sched_entity_flush.
Good point, going to remove that.
Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired.
When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
I am not aware of the use case so I might just be rambling, but if the thread/process that created the entity has called drm_sched_entity_flush then we shouldn't allow other processes to push jobs to that entity.
Nayan
Christian.
so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.

What about adding a 'stopped' flag to drm_sched_entity, to be set in drm_sched_entity_flush, and having drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of 'if (!entity->rq)'?
Andrey
On 02.08.2018 at 08:47, Nayan Deshmukh wrote:
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
> On 02.08.2018 at 00:25, Andrey Grodzovsky wrote:
> > Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
I am not aware of the use case so I might just be rambling, but if the thread/process that created the entity has called drm_sched_entity_flush then we shouldn't allow other processes to push jobs to that entity.
Why not?
Calling flush() only means that we should wait for all waiting jobs to be sent to the hardware. It doesn't mean that we need to stop accepting new jobs.
It is perfectly possible that a child process has inherited a file descriptor from its parent and calls flush because it exits, while the parent is still using the file descriptor.
The only tricky part is how to handle it when a process is killed, but I think we came to a good conclusion there as well.
We remove the entity from the scheduling list when the last process that submitted jobs is killed, or, if there is no such process any more, when the current flushing process is killed.
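A minimal userspace illustration of that inherited-fd case (a sketch; the render-node path and the assumption that closing the fd triggers the driver's flush callback are illustrative, not part of the patch):

#include <fcntl.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* Parent opens the GPU; a context/entity lives behind this fd. */
	int fd = open("/dev/dri/renderD128", O_RDWR);
	if (fd < 0)
		exit(1);

	if (fork() == 0) {
		/* The child inherited fd. Exiting closes its copy, which
		 * invokes the driver's flush callback -- but the parent
		 * still uses the fd, so the entity must stay on its rq. */
		_exit(0);
	}
	wait(NULL);

	/* parent keeps submitting jobs through fd here ... */

	close(fd);	/* last user gone: teardown is now appropriate */
	return 0;
}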
Christian.
Nayan
On 08/02/2018 02:47 AM, Nayan Deshmukh wrote:
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
> On 02.08.2018 at 00:25, Andrey Grodzovsky wrote:
> > First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more, since we stopped setting entity->rq to NULL in drm_sched_entity_flush.
> Good point, going to remove that.
So basically we don't need that 'if (...) { return; }' in drm_sched_entity_push_job any more?
> > Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
Yes, no problem if it's another process. But what about another thread from the same process? Is it a possible use case that two threads from the same process submit to the same entity concurrently? If so, and we specifically kill one, the other will not stop even if we want it to, because the current code lets it just acquire a run queue for itself.
I am not aware of the use case so I might just be rambling, but if the thread/process that created the entity has called drm_sched_entity_flush then we shouldn't allow other processes to push jobs to that entity.
Nayan
Christian.
We don't really know who created the entity; the creation could be by X itself, and then it's passed to another process for use. Check 'drm/scheduler: only kill entity if last user is killed v2': the idea is that if, by the time you want to kill this entity, another process (not another thread from your process, i.e. current->group_leader != last_user in drm_sched_entity_flush) has already started to use this entity, just let it be.
Andrey
On 02.08.2018 at 16:11, Andrey Grodzovsky wrote:
On 08/02/2018 02:47 AM, Nayan Deshmukh wrote:
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
> Good point, going to remove that.
So basically we don't need that 'if (...) { return; }' in drm_sched_entity_push_job any more?
Yes, exactly.
> > Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
Yes, no problem if it's another process. But what about another thread from the same process? Is it a possible use case that two threads from the same process submit to the same entity concurrently? If so, and we specifically kill one, the other will not stop even if we want it to, because the current code lets it just acquire a run queue for itself.
Well, you can't kill a single thread of a process (you can only interrupt it); a SIGKILL will always kill the whole process.
I am not aware of the use case so I might just be rambling, but if the thread/process that created the entity has called drm_sched_entity_flush then we shouldn't allow other processes to push jobs to that entity.
Nayan
Christian.
We don't really know who created the entity; the creation could be by X itself, and then it's passed to another process for use. Check 'drm/scheduler: only kill entity if last user is killed v2': the idea is that if, by the time you want to kill this entity, another process (not another thread from your process, i.e. current->group_leader != last_user in drm_sched_entity_flush) has already started to use this entity, just let it be.
Yes, exactly that's the idea.
Christian.
Andrey
On 08/03/2018 10:06 AM, Christian König wrote:
On 02.08.2018 at 16:11, Andrey Grodzovsky wrote:
On 08/02/2018 02:47 AM, Nayan Deshmukh wrote:
On Thu, Aug 2, 2018 at 12:12 PM Christian König <ckoenig.leichtzumerken@gmail.com> wrote:
> Good point, going to remove that.
So basically we don't need that 'if (...) { return; }' in drm_sched_entity_push_job any more?
Yes, exactly.
> > Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is actually the right thing to do, because the entity is still in use.
Yes, no problem if it's another process. But what about another thread from the same process? Is it a possible use case that two threads from the same process submit to the same entity concurrently? If so, and we specifically kill one, the other will not stop even if we want it to, because the current code lets it just acquire a run queue for itself.
Well, you can't kill a single thread of a process (you can only interrupt it); a SIGKILL will always kill the whole process.
Is the following scenario possible and acceptable? Two threads from the same process work on the same queue: thread A is currently in drm_sched_entity_flush->wait_event_timeout (the process is getting shut down because of a SIGKILL sent by the user), while thread B is still inside drm_sched_entity_push_job, before 'if (reschedule)'. 'A' stops waiting because the queue became empty and then removes the entity's queue from the scheduler's run queue, while 'B' goes inside 'reschedule' because it evaluates to true ('first' is true, plus all the rest of the conditions), acquires a new rq, later adds it back to a scheduler (maybe a different one), and keeps submitting jobs as much as it likes; it can then be stuck for up to 'timeout' time in its own drm_sched_entity_flush waiting for them. (See the interleaving sketch below.)
My understanding was that the introduction of entity->last_user is to force immediate termination of job submissions by any thread from the terminating process.
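The interleaving in question, laid out step by step (function names as used in this thread; the timing itself is hypothetical):

	Thread A (SIGKILL'ed process,          Thread B (same process,
	in drm_sched_entity_flush)             in drm_sched_entity_push_job)
	-----------------------------          ------------------------------------
	wait_event_timeout() returns,
	job_queue drained
	                                       first = spsc_queue_push(...)  -> true
	cmpxchg(&entity->last_user, ...)
	drm_sched_rq_remove_entity(...)
	                                       if (reschedule) {  /* evaluates true */
	                                           rq = drm_sched_entity_get_free_sched(...);
	                                           drm_sched_entity_set_rq(entity, rq);
	                                       }
	                                       /* entity is back on a run queue and
	                                        * keeps accepting new jobs */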
Andrey
Am 03.08.2018 um 16:54 schrieb Andrey Grodzovsky:
[SNIP]
> Second of all, even after we removed the entity from rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission
Hi Christian
That is actually desired. When another process is now using the entity to submit jobs, adding it back to the rq is the right thing to do, because the entity is still in use.
Yes, no problem if it's another process. But what about another thread from the same process? Is it a possible use case that two threads from the same process submit to the same entity concurrently? If so, and we specifically kill one, the other will not stop even if we want it to, because the current code lets it just acquire a new rq for itself.
Well you can't kill a single thread of a process (you can only interrupt it), a SIGKILL will always kill the whole process.
Is the following scenario possible and acceptable? Two threads from the same process work on the same queue. Thread A is currently in drm_sched_entity_flush->wait_event_timeout (the process is getting shut down because of a SIGKILL sent by the user), while thread B is still inside drm_sched_entity_push_job before 'if (reschedule)'. A stops waiting because the queue became empty and removes the entity from the scheduler's run queue, while B goes into the 'reschedule' path because it evaluates to true ('first' is true, and so are all the other conditions), acquires a new rq, later adds the entity back to a scheduler (possibly a different one), and keeps submitting jobs as much as it likes, and can then be stuck for up to 'timeout' in its own drm_sched_entity_flush waiting for them.
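Spelled out as an interleaving (a sketch of the suspected race, not a confirmed trace; 'reschedule' refers to the load-balancing path in drm_sched_entity_push_job):

	Thread A (flush, SIGKILL)               Thread B (push_job)
	-------------------------               -------------------
	wait_event_timeout(queue idle)
	                                        idle, first, reschedule all true
	queue drains -> wakes up
	drm_sched_rq_remove_entity(rq, e)
	                                        rq = drm_sched_entity_get_free_sched(e)
	                                        drm_sched_entity_set_rq(e, rq)   /* entity re-added */
	                                        spsc_queue_push(...)             /* keeps submitting */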
I'm not 100% sure but I don't think that can happen.
See flushing the fd is done while dropping the fd, which happens only after all threads of the process in question are killed.
Otherwise the flushing wouldn't make too much sense. In other words, imagine an application where a thread doing a write() on an fd is killed.
The idea of the flush is to preserve the data and that won't work if that isn't correctly ordered.
My understanding was that the introduction of entity->last_user is to force immediate termination of job submissions by any thread from the terminating process.
We could consider reordering that once more. Going to play out all scenarios in my head over the weekend :)
Christian.
On 08/06/2018 08:44 AM, Christian König wrote:
[SNIP]

> See flushing the fd is done while dropping the fd, which happens only after all threads of the process in question are killed.
Yea, this FD handling is indeed a lot of gray area for me, but as far as I remember flushing is done per thread when it exits (possibly due to a signal). Now, signal interception and processing (as a result of which .flush will get called if SIGKILL was received) is done at certain points, among them when returning from an IOCTL. So if the first thread was at the very end of the CS ioctl when SIGKILL was received while the other one was at the beginning, then I think we might see something like the scenario above.
Andrey
Am 03.08.2018 um 20:41 schrieb Andrey Grodzovsky:
[SNIP]

> So if the first thread was at the very end of the CS ioctl when SIGKILL was received while the other one was at the beginning, then I think we might see something like the scenario above.
SIGKILL isn't processed as long as any thread of the application is still inside the kernel. That's why we have wait_event_killable().
So I don't think that the scenario above is possible, but I'm really not 100% sure either.
Christian.
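For reference, the waiting logic under discussion has roughly this shape (a simplified sketch of the drm_sched_entity_flush structure, not the verbatim code):

	#include <linux/sched.h>
	#include <linux/wait.h>

	/* Sketch: a task already marked PF_EXITING cannot be woken by
	 * signals anymore, so it waits with a timeout; a live task waits in
	 * TASK_KILLABLE, where only a fatal signal (SIGKILL for the whole
	 * thread group) interrupts the wait. */
	static long flush_sketch(wait_queue_head_t *wq, bool (*idle)(void *arg),
				 void *arg, long timeout)
	{
		long ret = timeout;

		if (current->flags & PF_EXITING) {
			if (timeout)
				ret = wait_event_timeout(*wq, idle(arg), timeout);
		} else {
			wait_event_killable(*wq, idle(arg));
		}
		return ret;
	}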
On 08/06/2018 04:14 AM, Christian König wrote:
[SNIP]

> SIGKILL isn't processed as long as any thread of the application is still inside the kernel. That's why we have wait_event_killable().
Can you tell me where this is happening? What I see in the code is that do_group_exit calls zap_other_threads, which just adds SIGKILL to the signal sets of the other threads in the group and sends a wake-up. Then do_exit will close all FDs for the current thread, and so .flush will be called; when the last thread drops its refcount for the FD, .release will be called.
Andrey
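Roughly the call chain being described (paraphrased from kernel/exit.c, kernel/signal.c and fs/file.c; exact details vary by kernel version):

	do_group_exit()
	  -> zap_other_threads()          /* queue SIGKILL for every sibling thread */
	do_exit()                         /* runs once per exiting thread */
	  -> exit_files()
	    -> put_files_struct()
	      -> close_files()
	        -> filp_close()
	          -> filp->f_op->flush()  /* .flush, called at every close */
	/* when the last reference to the struct file is dropped: */
	fput() -> ... -> f_op->release()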
Am 09.08.2018 um 19:39 schrieb Andrey Grodzovsky:
[SNIP]
SIGKILL isn't processed as long as any thread of the application is still inside the kernel. That's why we have wait_event_killable().
Can you tell me where this is happening? What I see in the code is that do_group_exit calls zap_other_threads, which just adds SIGKILL to the signal sets of the other threads in the group and sends a wake-up. Then do_exit will close all FDs for the current thread, and so .flush will be called; when the last thread drops its refcount for the FD, .release will be called.
Good question, I have not the slightest idea.
Killed processes certainly don't die until all threads return from kernel space, but I'm not 100% sure whether that happens before or after the flush.
Christian.
Hi Andrey,
Good catch. I guess since both Christian and I were working on it in parallel we didn't catch this problem.
If it is only causing a problem in push_job then we can handle it something like this:
----------------------------------------------------------------------------------------------------------------------------
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index 05dc6ecd4003..f344ce32f128 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -563,6 +563,11 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
 	first = spsc_queue_count(&entity->job_queue) == 0;
 	reschedule = idle && first && (entity->num_rq_list > 1);
 
+	if (first && list_empty(&entity->list)) {
+		DRM_ERROR("Trying to push to a killed entity\n");
+		return;
+	}
+
 	if (reschedule) {
 		rq = drm_sched_entity_get_free_sched(entity);
 		spin_lock(&entity->rq_lock);
@@ -580,11 +585,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
 	if (first) {
 		/* Add the entity to the run queue */
 		spin_lock(&entity->rq_lock);
-		if (!entity->rq) {
-			DRM_ERROR("Trying to push to a killed entity\n");
-			spin_unlock(&entity->rq_lock);
-			return;
-		}
 		drm_sched_rq_add_entity(entity->rq, entity);
 		spin_unlock(&entity->rq_lock);
 		drm_sched_wakeup(entity->rq->sched);
-----------------------------------------------------------------------------------------------------------------------------
On Thu, Aug 2, 2018 at 3:55 AM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
Thinking again about this change and 53d5f1e drm/scheduler: move idle entities to scheduler with less load v2
Looks to me like the use case for which fc9a539 drm/scheduler: add NULL pointer check for run queue (v2) was done will not work anymore.

First of all, in drm_sched_entity_push_job, 'if (!entity->rq)' will never be true any more since we stopped setting entity->rq to NULL in drm_sched_entity_flush.
Second of all, even after we removed the entity from the rq in drm_sched_entity_flush to terminate any subsequent submissions to the queue, the other thread doing push job can just acquire a run queue again from drm_sched_entity_get_free_sched and continue submission, so you can't substitute 'if (!entity->rq)' with 'if (list_empty(&entity->list))'.
What about adding a 'stopped' flag to drm_sched_entity to be set in drm_sched_entity_flush, and in drm_sched_entity_push_job check for 'if (!entity->stopped)' instead of 'if (!entity->rq)' ?

Andrey

But it might be worth adding a stopped flag field if it is required at other places as well.

Thanks,
Nayan
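A minimal sketch of what that 'stopped' flag could look like (hypothetical code, not a tested patch; it assumes the existing entity->rq_lock and the helpers named in this thread):

	/* new member in struct drm_sched_entity, protected by rq_lock: */
	bool				stopped;

	/* drm_sched_entity_flush(): stop the entity for good */
	spin_lock(&entity->rq_lock);
	entity->stopped = true;
	drm_sched_rq_remove_entity(entity->rq, entity);
	spin_unlock(&entity->rq_lock);

	/* drm_sched_entity_push_job(): reject new jobs up front */
	spin_lock(&entity->rq_lock);
	if (entity->stopped) {
		spin_unlock(&entity->rq_lock);
		DRM_ERROR("Trying to push to a killed entity\n");
		return;
	}
	spin_unlock(&entity->rq_lock);

Unlike the entity->rq or entity->list checks, the flag is one-way, so a concurrent submitter taking the reschedule path cannot accidentally resurrect the entity.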
On 07/30/2018 07:03 AM, Christian König wrote:
[SNIP]
On 08/02/2018 02:43 AM, Nayan Deshmukh wrote:
[SNIP]

> +	if (first && list_empty(&entity->list)) {
first might be false if the other process was interrupted by SIGKILL in the middle of wait_event_killable in drm_sched_entity_flush while there were still items in the queue. I don't see a good way around this besides adding a 'stopped' flag to drm_sched_entity.
Andrey
On Thu, Aug 2, 2018, 8:09 PM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
[SNIP]

> first might be false if the other process was interrupted by SIGKILL in the middle of wait_event_killable in drm_sched_entity_flush while there were still items in the queue. I don't see a good way around this besides adding a 'stopped' flag to drm_sched_entity.
Sorry I missed this mail. This case might happen but this was also not being handled previously.
Nayan
On 08/03/2018 08:12 AM, Nayan Deshmukh wrote:
[SNIP]

> Sorry I missed this mail. This case might happen but this was also not being handled previously.
The original code, before 'drm/scheduler: stop setting rq to NULL', did handle it, because you didn't use the queue's emptiness as a criterion for aborting the next push to the queue but rather checked for entity->rq != NULL.
Andrey
On Fri, Aug 3, 2018 at 7:17 PM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
[SNIP]

> The original code, before 'drm/scheduler: stop setting rq to NULL', did handle it, because you didn't use the queue's emptiness as a criterion for aborting the next push to the queue but rather checked for entity->rq != NULL.
But the (entity->rq != NULL) check was inside the body of if (first), so it was also dependent on the queue's emptiness earlier. In any case it doesn't matter now, as with Christian's patch we can remove the if condition altogether.
Nayan
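For context, the structure being discussed, reconstructed from the removed hunk of the patch quoted earlier (the !entity->rq check only ran when 'first' was true):

	if (first) {
		/* Add the entity to the run queue */
		spin_lock(&entity->rq_lock);
		if (!entity->rq) {
			DRM_ERROR("Trying to push to a killed entity\n");
			spin_unlock(&entity->rq_lock);
			return;
		}
		drm_sched_rq_add_entity(entity->rq, entity);
		spin_unlock(&entity->rq_lock);
		drm_sched_wakeup(entity->rq->sched);
	}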
Hi Christian,
The code looks good to me. But I was just wondering what will happen when the last user is killed and some other user tries to push to the entity.
Regards, Nayan Deshmukh
On Mon, Jul 30, 2018 at 4:33 PM Christian König < ckoenig.leichtzumerken@gmail.com> wrote:
[SNIP]
I believe that in this case the

	if (!entity->rq) {
		DRM_ERROR...
		return;
	}

clause will take place.
P.S. I remember we planned to actually propagate the error back to the caller, so I guess we should take care of this sooner or later.
The change is Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Andrey
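A sketch of what that propagation could look like (hypothetical; drm_sched_entity_push_job currently returns void, so every caller would need updating, and the error value is just a suggestion):

	/* Hypothetical variant that reports the failure instead of silently
	 * dropping the job when the entity has already been torn down. */
	int drm_sched_entity_push_job(struct drm_sched_job *sched_job,
				      struct drm_sched_entity *entity)
	{
		spin_lock(&entity->rq_lock);
		if (!entity->rq) {
			spin_unlock(&entity->rq_lock);
			return -ENOENT;	/* let the CS ioctl return this to userspace */
		}
		spin_unlock(&entity->rq_lock);

		/* ... existing push logic ... */
		return 0;
	}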
On 07/30/2018 09:34 AM, Nayan Deshmukh wrote:
[SNIP]
That makes sense. The change is Acked-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
On Tue, Jul 31, 2018 at 2:12 AM Andrey Grodzovsky Andrey.Grodzovsky@amd.com wrote:
[SNIP]