A handful of minor updates and bugfixes. Includes a UAPI change to the MSM_DRM_SUBMITQUEUE_CLOSE API as suggested by Rob, a tweak to rd to dump all iova addresses from a submission, and a few performance tweaks for user submissions that don't use relocs.
Jordan Crouse (3):
      drm/msm: dump a rd GPUADDR header for all buffers in the command
      drm/msm: Change MSM_DRM_SUBMITQUEUE_CLOSE
      drm/msm: Do priority checking during submitqueue create

Sharat Masetty (1):
      drm/msm: Fix race condition in the submit path

Sushmita Susheelendra (2):
      drm/msm: Map command buffers to kernel only if required
      drm/msm: Map buffers on demand on the submit path
 drivers/gpu/drm/msm/adreno/a5xx_gpu.h     |  8 +++-
 drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 17 ++++++--
 drivers/gpu/drm/msm/msm_drv.c             |  4 +-
 drivers/gpu/drm/msm/msm_gem_submit.c      | 69 ++++++++++++++++---------------
 drivers/gpu/drm/msm/msm_gpu.h             |  1 -
 drivers/gpu/drm/msm/msm_rd.c              | 30 +++++++-------
 drivers/gpu/drm/msm/msm_submitqueue.c     | 25 ++++++++---
 include/uapi/drm/msm_drm.h                |  2 +-
 8 files changed, 94 insertions(+), 62 deletions(-)
From: Sharat Masetty <smasetty@codeaurora.org>
There is a race condition between the IRQ context trying to trigger preemption and the user context trying to submit commands to the GPU. The check in a5xx_flush() only updates the wptr if the GPU is not in the middle of a preemption. When we move from PREEMPT_START back to PREEMPT_NONE there is a small window where the preempt state is still START; if the CPU switches to a user thread that is inside a5xx_flush() during that window, the flush skips the wptr update because the preempt state is not PREEMPT_NONE, so the new commands are never kicked off and the GPU stalls.
Introduce a new intermediate state, PREEMPT_ABORT, and change a5xx_preempt_trigger() to use the GPU's current ring instead of the ring retrieved from get_next_ring() while in this state.
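To make the handshake concrete, here is a small standalone model (plain C11 atomics standing in for the kernel's atomic_t, with made-up helpers; this is not the driver code) showing why the submit side can treat PREEMPT_ABORT the same as PREEMPT_NONE without losing a write pointer update:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum preempt_state {
	PREEMPT_NONE = 0,
	PREEMPT_START,
	PREEMPT_ABORT,		/* new intermediate state */
	PREEMPT_TRIGGERED,
	PREEMPT_FAULTED,
	PREEMPT_PENDING,
};

static _Atomic int preempt_state = PREEMPT_NONE;
static _Atomic unsigned int wptr;	/* stand-in for the ring write pointer */

static bool in_preempt(void)
{
	int state = atomic_load(&preempt_state);

	/* ABORT means the ringbuffer will not change, so flushing is safe */
	return !(state == PREEMPT_NONE || state == PREEMPT_ABORT);
}

/* submit path: only touch the wptr when no preemption is really in flight */
static void flush(unsigned int new_wptr)
{
	if (!in_preempt())
		atomic_store(&wptr, new_wptr);
}

/* irq path: preemption turned out to be unnecessary, back out through ABORT */
static void trigger_nothing_to_do(unsigned int latest_wptr)
{
	atomic_store(&preempt_state, PREEMPT_ABORT);
	atomic_store(&wptr, latest_wptr);	/* pick up any flush we blocked */
	atomic_store(&preempt_state, PREEMPT_NONE);
}

int main(void)
{
	atomic_store(&preempt_state, PREEMPT_START);
	flush(1);			/* skipped: state is still START */
	trigger_nothing_to_do(1);	/* but the trigger writes it for us */
	printf("wptr = %u\n", atomic_load(&wptr));
	return 0;
}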
Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
---
 drivers/gpu/drm/msm/adreno/a5xx_gpu.h     |  8 +++++++-
 drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 17 ++++++++++++++---
 2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
index f062a90..6fb8c2f 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
@@ -56,6 +56,8 @@ struct a5xx_gpu {
 * PREEMPT_NONE - no preemption in progress. Next state START.
 * PREEMPT_START - The trigger is evaulating if preemption is possible. Next
 * states: TRIGGERED, NONE
+ * PREEMPT_ABORT - An intermediate state before moving back to NONE. Next
+ * state: NONE.
 * PREEMPT_TRIGGERED: A preemption has been executed on the hardware. Next
 * states: FAULTED, PENDING
 * PREEMPT_FAULTED: A preemption timed out (never completed). This will trigger
@@ -67,6 +69,7 @@ struct a5xx_gpu {
 enum preempt_state {
 	PREEMPT_NONE = 0,
 	PREEMPT_START,
+	PREEMPT_ABORT,
 	PREEMPT_TRIGGERED,
 	PREEMPT_FAULTED,
 	PREEMPT_PENDING,
@@ -154,7 +157,10 @@ static inline int spin_usecs(struct msm_gpu *gpu, uint32_t usecs,
 /* Return true if we are in a preempt state */
 static inline bool a5xx_in_preempt(struct a5xx_gpu *a5xx_gpu)
 {
-	return !(atomic_read(&a5xx_gpu->preempt_state) == PREEMPT_NONE);
+	int preempt_state = atomic_read(&a5xx_gpu->preempt_state);
+
+	return !(preempt_state == PREEMPT_NONE ||
+		preempt_state == PREEMPT_ABORT);
 }
 
 #endif /* __A5XX_GPU_H__ */
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
index 6a3767d..40f4840 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
@@ -122,9 +122,20 @@ void a5xx_preempt_trigger(struct msm_gpu *gpu)
 	 * one do nothing except to update the wptr to the latest and greatest
 	 */
 	if (!ring || (a5xx_gpu->cur_ring == ring)) {
-		update_wptr(gpu, ring);
-
-		/* Set the state back to NONE */
+		/*
+		 * It's possible that while a preemption request is in progress
+		 * from an irq context, a user context trying to submit might
+		 * fail to update the write pointer, because it determines
+		 * that the preempt state is not PREEMPT_NONE.
+		 *
+		 * Close the race by introducing an intermediate
+		 * state PREEMPT_ABORT to let the submit path
+		 * know that the ringbuffer is not going to change
+		 * and can safely update the write pointer.
+		 */
+
+		set_preempt_state(a5xx_gpu, PREEMPT_ABORT);
+		update_wptr(gpu, a5xx_gpu->cur_ring);
 		set_preempt_state(a5xx_gpu, PREEMPT_NONE);
 		return;
 	}
Currently the rd dump skips any buffer marked as WRITE on the assumption that its contents are not interesting. While it is true that the contents are uninteresting, we should still print the iova and size for all buffers so that any listening replay tool can correctly construct the submission.
Print the header for all buffers, but only dump the contents of buffers marked as READ.
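For a replay tool on the other side of the capture, each RD_GPUADDR payload written by snapshot_buf() is three words: the low 32 bits of the iova, the size, and the high 32 bits of the iova. The small sketch below only decodes that payload; the surrounding section framing and the reader loop are assumed, not shown in this patch:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void parse_rd_gpuaddr(const uint32_t payload[3])
{
	uint64_t iova = ((uint64_t)payload[2] << 32) | payload[0];
	uint32_t size = payload[1];

	/* Every buffer in the submit now shows up here, even WRITE-only ones */
	printf("bo: iova=0x%016" PRIx64 " size=%u\n", iova, size);
}

int main(void)
{
	uint32_t example[3] = { 0x00001000, 4096, 0x1 };	/* made-up values */

	parse_rd_gpuaddr(example);
	return 0;
}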
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
---
 drivers/gpu/drm/msm/msm_rd.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c
index 0366b80..4c858d8 100644
--- a/drivers/gpu/drm/msm/msm_rd.c
+++ b/drivers/gpu/drm/msm/msm_rd.c
@@ -268,10 +268,6 @@ static void snapshot_buf(struct msm_rd_state *rd,
 	struct msm_gem_object *obj = submit->bos[idx].obj;
 	const char *buf;
 
-	buf = msm_gem_get_vaddr(&obj->base);
-	if (IS_ERR(buf))
-		return;
-
 	if (iova) {
 		buf += iova - submit->bos[idx].iova;
 	} else {
@@ -279,8 +275,21 @@ static void snapshot_buf(struct msm_rd_state *rd,
 		size = obj->base.size;
 	}
 
+	/*
+	 * Always write the GPUADDR header so we can get a complete list of all the
+	 * buffers in the cmd
+	 */
 	rd_write_section(rd, RD_GPUADDR,
 			(uint32_t[3]){ iova, size, iova >> 32 }, 12);
+
+	/* But only dump the contents of buffers marked READ */
+	if (!(submit->bos[idx].flags & MSM_SUBMIT_BO_READ))
+		return;
+
+	buf = msm_gem_get_vaddr(&obj->base);
+	if (IS_ERR(buf))
+		return;
+
 	rd_write_section(rd, RD_BUFFER_CONTENTS, buf, size);
 
 	msm_gem_put_vaddr(&obj->base);
@@ -309,17 +318,8 @@ void msm_rd_dump_submit(struct msm_gem_submit *submit)
 
 	rd_write_section(rd, RD_CMD, msg, ALIGN(n, 4));
 
-	if (rd_full) {
-		for (i = 0; i < submit->nr_bos; i++) {
-			/* buffers that are written to probably don't start out
-			 * with anything interesting:
-			 */
-			if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
-				continue;
-
-			snapshot_buf(rd, submit, i, 0, 0);
-		}
-	}
+	for (i = 0; rd_full && i < submit->nr_bos; i++)
+		snapshot_buf(rd, submit, i, 0, 0);
 
 	for (i = 0; i < submit->nr_cmds; i++) {
 		uint64_t iova = submit->cmd[i].iova;
From: Sushmita Susheelendra <ssusheel@codeaurora.org>
Map command buffers to the kernel address space only if relocs are specified for the submission. This reduces some overhead on the submission path.
Signed-off-by: Sushmita Susheelendra <ssusheel@codeaurora.org>
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
---
 drivers/gpu/drm/msm/msm_gem_submit.c | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 2708d3a..2d2aa65 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -320,6 +320,9 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob
 		return -EINVAL;
 	}
 
+	if (nr_relocs == 0)
+		return 0;
+
 	/* For now, just map the entire thing.  Eventually we probably
 	 * to do it page-by-page, w/ kmap() if not vmap()d..
 	 */
From: Sushmita Susheelendra <ssusheel@codeaurora.org>
Map and pin buffers on demand on the submission path. This ensures that we only map buffers whose iovas are actually needed for the submission, for instance the command buffers and the reloc buffers used to process relocs, instead of every buffer in the buffer list. Also remove the now-unused valid member from struct msm_gem_submit.
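The pattern is the usual lazy pin: a buffer is mapped the first time its iova is actually requested, and a flag keeps it from being pinned twice. A standalone sketch with placeholder names and a stub in place of msm_gem_get_iova(), not the driver code:

#include <stdint.h>
#include <stdio.h>

#define BO_PINNED 0x1			/* mirrors the flag the driver uses */

struct bo {
	uint32_t flags;
	uint64_t iova;
};

/* stand-in for msm_gem_get_iova(): pretend to map and hand back an address */
static int get_iova_stub(struct bo *bo, uint64_t *iova)
{
	(void)bo;
	*iova = 0x100000;		/* made-up GPU address */
	return 0;
}

/* pin the buffer the first time its iova is needed, never before */
static int pin_on_demand(struct bo *bo, uint64_t *iova)
{
	if (!(bo->flags & BO_PINNED)) {
		int ret = get_iova_stub(bo, &bo->iova);

		if (ret)
			return ret;
		bo->flags |= BO_PINNED;
	}

	*iova = bo->iova;		/* later lookups are just a read */
	return 0;
}

int main(void)
{
	struct bo bo = { 0 };
	uint64_t iova;

	if (!pin_on_demand(&bo, &iova))
		printf("pinned at 0x%llx\n", (unsigned long long)iova);
	return 0;
}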
Signed-off-by: Sushmita Susheelendra <ssusheel@codeaurora.org>
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
---
 drivers/gpu/drm/msm/msm_gem_submit.c | 58 +++++++++++++++++------------------
 1 file changed, 28 insertions(+), 30 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 2d2aa65..47a9e87 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -256,41 +256,38 @@ static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit)
 	return ret;
 }
 
-static int submit_pin_objects(struct msm_gem_submit *submit)
+static int submit_pin_object(struct msm_gem_submit *submit, int i)
 {
-	int i, ret = 0;
-
-	submit->valid = true;
-
-	for (i = 0; i < submit->nr_bos; i++) {
-		struct msm_gem_object *msm_obj = submit->bos[i].obj;
-		uint64_t iova;
+	struct msm_gem_object *msm_obj = submit->bos[i].obj;
+	uint64_t iova;
+	int ret;
 
-		/* if locking succeeded, pin bo: */
-		ret = msm_gem_get_iova(&msm_obj->base,
-				submit->gpu->aspace, &iova);
+	if (submit->bos[i].flags & BO_PINNED)
+		return 0;
 
-		if (ret)
-			break;
+	/* if locking succeeded, pin bo: */
+	ret = msm_gem_get_iova(&msm_obj->base, submit->gpu->aspace, &iova);
+	if (ret)
+		return ret;
 
-		submit->bos[i].flags |= BO_PINNED;
+	submit->bos[i].flags |= BO_PINNED;
 
-		if (iova == submit->bos[i].iova) {
-			submit->bos[i].flags |= BO_VALID;
-		} else {
-			submit->bos[i].iova = iova;
-			/* iova changed, so address in cmdstream is not valid: */
-			submit->bos[i].flags &= ~BO_VALID;
-			submit->valid = false;
-		}
+	if (iova == submit->bos[i].iova) {
+		submit->bos[i].flags |= BO_VALID;
+	} else {
+		submit->bos[i].iova = iova;
+		/* iova changed, so address in cmdstream is not valid: */
+		submit->bos[i].flags &= ~BO_VALID;
 	}
 
-	return ret;
+	return 0;
 }
 
 static int submit_bo(struct msm_gem_submit *submit, uint32_t idx,
 		struct msm_gem_object **obj, uint64_t *iova, bool *valid)
 {
+	int ret;
+
 	if (idx >= submit->nr_bos) {
 		DRM_ERROR("invalid buffer index: %u (out of %u)\n",
 				idx, submit->nr_bos);
@@ -299,6 +296,14 @@ static int submit_bo(struct msm_gem_submit *submit, uint32_t idx,
 
 	if (obj)
 		*obj = submit->bos[idx].obj;
+
+	if (!iova && !valid)
+		return 0;
+
+	ret = submit_pin_object(submit, idx);
+	if (ret)
+		return ret;
+
 	if (iova)
 		*iova = submit->bos[idx].iova;
 	if (valid)
@@ -482,10 +487,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 	if (ret)
 		goto out;
 
-	ret = submit_pin_objects(submit);
-	if (ret)
-		goto out;
-
 	for (i = 0; i < args->nr_cmds; i++) {
 		struct drm_msm_gem_submit_cmd submit_cmd;
 		void __user *userptr =
@@ -536,9 +537,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 		submit->cmd[i].iova = iova + submit_cmd.submit_offset;
 		submit->cmd[i].idx  = submit_cmd.submit_idx;
 
-		if (submit->valid)
-			continue;
-
 		ret = submit_reloc(submit, msm_obj, submit_cmd.submit_offset,
 				submit_cmd.nr_relocs, submit_cmd.relocs);
 		if (ret)
Instead of passing a mostly unused struct to MSM_DRM_SUBMITQUEUE_CLOSE, we only need to pass the u32 value of the queue ID.
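A hypothetical userspace fragment (assuming libdrm's drmIoctl() and the updated uapi header) now looks like this; the argument is the bare queue ID rather than a struct:

#include <stdint.h>
#include <xf86drm.h>
#include <drm/msm_drm.h>	/* or wherever the updated uapi header is installed */

static int close_submitqueue(int drm_fd, uint32_t queue_id)
{
	/* the kernel now copies in a single __u32 rather than a struct */
	return drmIoctl(drm_fd, DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE, &queue_id);
}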
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
---
 drivers/gpu/drm/msm/msm_drv.c | 4 ++--
 drivers/gpu/drm/msm/msm_gpu.h | 1 -
 include/uapi/drm/msm_drm.h    | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index e1db580..482e1a9 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -829,9 +829,9 @@ static int msm_ioctl_submitqueue_new(struct drm_device *dev, void *data,
 static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data,
 		struct drm_file *file)
 {
-	struct drm_msm_submitqueue *args = data;
+	u32 *args = data;
 
-	return msm_submitqueue_remove(file->driver_priv, args->id);
+	return msm_submitqueue_remove(file->driver_priv, *args);
 }
 
 static const struct drm_ioctl_desc msm_ioctls[] = {
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 914cd2f..0e2aec1 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -172,7 +172,6 @@ struct msm_gpu_submitqueue {
 	int faults;
 	struct list_head node;
 	struct kref ref;
-	struct msm_fence_context *fctx;
 };
 
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index 711ea30..3183ef7 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -297,7 +297,7 @@ struct drm_msm_submitqueue {
 #define DRM_IOCTL_MSM_WAIT_FENCE       DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_WAIT_FENCE, struct drm_msm_wait_fence)
 #define DRM_IOCTL_MSM_GEM_MADVISE      DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_MADVISE, struct drm_msm_gem_madvise)
 #define DRM_IOCTL_MSM_SUBMITQUEUE_NEW    DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue)
-#define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, struct drm_msm_submitqueue)
+#define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
 
 #if defined(__cplusplus)
 }
Now that the priority must be set in the submitqueue, we can check at create time that the requested priority is valid.
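From userspace this shows up as an -EINVAL at create time rather than a silent clamp at submit time. A hypothetical sketch, assuming libdrm and the flags/prio/id layout of struct drm_msm_submitqueue from the uapi header:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <xf86drm.h>
#include <drm/msm_drm.h>	/* or wherever the uapi header is installed */

static int create_queue(int drm_fd, uint32_t prio, uint32_t *id)
{
	struct drm_msm_submitqueue req = {
		.flags = 0,
		.prio = prio,	/* must be < gpu->nr_rings or the ioctl fails */
	};

	if (drmIoctl(drm_fd, DRM_IOCTL_MSM_SUBMITQUEUE_NEW, &req)) {
		if (errno == EINVAL)
			fprintf(stderr, "priority %u out of range\n", prio);
		return -errno;
	}

	*id = req.id;	/* kernel fills in the new queue id */
	return 0;
}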
Signed-off-by: Jordan Crouse <jcrouse@codeaurora.org>
---
 drivers/gpu/drm/msm/msm_gem_submit.c  |  8 +++++---
 drivers/gpu/drm/msm/msm_submitqueue.c | 25 +++++++++++++++++++------
 2 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 47a9e87..5b9210b 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -421,7 +421,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 	struct msm_gpu_submitqueue *queue;
 	int out_fence_fd = -1;
 	unsigned i;
-	int ret, ring;
+	int ret;
 
 	if (!gpu)
 		return -ENXIO;
@@ -439,6 +439,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 	if (!queue)
 		return -ENOENT;
 
+	if (queue->prio >= gpu->nr_rings)
+		return -EINVAL;
+
 	if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
 		in_fence = sync_file_get_fence(args->fence_fd);
 
@@ -545,8 +548,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
 
 	submit->nr_cmds = i;
 
-	ring = clamp_t(uint32_t, queue->prio, 0, gpu->nr_rings - 1);
-	submit->ring = gpu->rb[ring];
+	submit->ring = gpu->rb[queue->prio];
 
 	submit->fence = msm_fence_alloc(queue->fctx);
 	if (IS_ERR(submit->fence)) {
diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c
index f23b43e..da4bee8 100644
--- a/drivers/gpu/drm/msm/msm_submitqueue.c
+++ b/drivers/gpu/drm/msm/msm_submitqueue.c
@@ -64,6 +64,7 @@ void msm_submitqueue_close(struct msm_file_private *ctx)
 int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
 		u32 prio, u32 flags, u32 *id)
 {
+	struct msm_drm_private *priv = drm->dev_private;
 	struct msm_gpu_submitqueue *queue;
 	char name[32];
 	int ret = 0;
@@ -77,7 +78,13 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
 
 	kref_init(&queue->ref);
 	queue->flags = flags;
-	queue->prio = prio;
+
+	if (priv->gpu) {
+		if (prio >= priv->gpu->nr_rings)
+			return -EINVAL;
+
+		queue->prio = prio;
+	}
 
 	write_lock(&ctx->queuelock);
 
@@ -114,18 +121,24 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx,
 
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx)
 {
+	struct msm_drm_private *priv = drm->dev_private;
+	int default_prio;
+
 	if (!ctx)
 		return 0;
 
+	/*
+	 * Select priority 2 as the "default priority" unless nr_rings is less
+	 * than 2 and then pick the lowest priority
+	 */
+	default_prio = priv->gpu ?
+		clamp_t(uint32_t, 2, 0, priv->gpu->nr_rings - 1) : 0;
+
 	INIT_LIST_HEAD(&ctx->submitqueues);
 
 	rwlock_init(&ctx->queuelock);
 
-	/*
-	 * Add the "default" submitqueue with id 0
-	 * "low" priority (2) and no flags
-	 */
-	return msm_submitqueue_create(drm, ctx, 2, 0, NULL);
+	return msm_submitqueue_create(drm, ctx, default_prio, 0, NULL);
 }
 
 int msm_submitqueue_remove(struct msm_file_private *ctx, u32 id)