This series is an effort to have the msm drm driver start using the DRM gpu scheduler for its general job scheduling needs. Immediate benefits to the msm drm driver include async fencing and improved driver performance.
Testing is still in progress, but general GPU submissions, preemption, timeout, fault recovery and slumber all work fine with these changes when tested using my test apps. My workspace is based on ~4.18.rc6, so these changes should apply on tip relatively easily.
Please review and share your valuable feedback :-).
Note that I backported a change from Linus's 4.19.rc1 tree to handle jiffies wrapping. If it's not desirable, I will squash the last few changes of the series under drm/msm into one single commit.
Matteo Croce (1): jiffies: add utility function to calculate delta in ms
Sharat Masetty (12): drm/msm: Track GPU fences with idr drm/msm: Change msm_gpu_submit() API drm/msm: Save the ring name in the ring structure drm/msm: Change the name of the fence to hw_fence drm/msm: rearrange submit buffer objects clean up drm/msm: Use kzalloc for submit struct allocation drm/msm: Fix leak in submitqueue create drm/scheduler: set sched->thread to NULL in failure drm/msm: Use the DRM common Scheduler msm/drm: Remove unused code drm/scheduler: Add a timeout_start_notify function op drm/msm: Implement better timeout detection
drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/Makefile | 3 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 3 - drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 29 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 + drivers/gpu/drm/msm/msm_drv.c | 3 +- drivers/gpu/drm/msm/msm_drv.h | 5 +- drivers/gpu/drm/msm/msm_fence.c | 52 ++--- drivers/gpu/drm/msm/msm_fence.h | 5 +- drivers/gpu/drm/msm/msm_gem.c | 44 +--- drivers/gpu/drm/msm/msm_gem.h | 10 +- drivers/gpu/drm/msm/msm_gem_submit.c | 170 +++++++++----- drivers/gpu/drm/msm/msm_gpu.c | 264 ++++------------------ drivers/gpu/drm/msm/msm_gpu.h | 11 +- drivers/gpu/drm/msm/msm_ringbuffer.c | 17 +- drivers/gpu/drm/msm/msm_ringbuffer.h | 7 + drivers/gpu/drm/msm/msm_sched.c | 355 ++++++++++++++++++++++++++++++ drivers/gpu/drm/msm/msm_sched.h | 18 ++ drivers/gpu/drm/msm/msm_submitqueue.c | 16 +- drivers/gpu/drm/scheduler/gpu_scheduler.c | 22 +- include/drm/gpu_scheduler.h | 6 + include/linux/jiffies.h | 5 + 22 files changed, 662 insertions(+), 387 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_sched.c create mode 100644 drivers/gpu/drm/msm/msm_sched.h
-- 1.9.1
Track the GPU fences created at submit time with an idr instead of the ring sequence number. This helps with easily changing the underlying fence to something we don't truly own, like the scheduler fence.
Also move the GPU fence allocation to msm_gpu_submit() and have the function return the fence. This suits well when integrating with the GPU scheduler.
Additionally remove the non-interruptible wait variant from msm_wait_fence() as it is not used.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/msm_drv.c | 3 +-- drivers/gpu/drm/msm/msm_fence.c | 30 ++++++++++++++---------------- drivers/gpu/drm/msm/msm_fence.h | 5 +++-- drivers/gpu/drm/msm/msm_gem.h | 1 + drivers/gpu/drm/msm/msm_gem_submit.c | 26 ++++++++++++++++++-------- drivers/gpu/drm/msm/msm_gpu.c | 10 ++++++++-- drivers/gpu/drm/msm/msm_gpu.h | 4 ++-- drivers/gpu/drm/msm/msm_ringbuffer.c | 5 +++++ drivers/gpu/drm/msm/msm_ringbuffer.h | 1 + 9 files changed, 53 insertions(+), 32 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 021a0b6..8eaa1bd 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -746,8 +746,7 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, if (!queue) return -ENOENT;
- ret = msm_wait_fence(gpu->rb[queue->prio]->fctx, args->fence, &timeout, - true); + ret = msm_wait_fence(gpu->rb[queue->prio], args->fence, &timeout);
msm_submitqueue_put(queue); return ret; diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index 349c12f..0e7912b 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -50,41 +50,39 @@ static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fenc }
/* legacy path for WAIT_FENCE ioctl: */ -int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, - ktime_t *timeout, bool interruptible) +int msm_wait_fence(struct msm_ringbuffer *ring, uint32_t fence_id, + ktime_t *timeout) { + struct dma_fence *fence; int ret;
- if (fence > fctx->last_fence) { - DRM_ERROR("%s: waiting on invalid fence: %u (of %u)\n", - fctx->name, fence, fctx->last_fence); - return -EINVAL; + rcu_read_lock(); + fence = idr_find(&ring->fence_idr, fence_id); + + if (!fence || !dma_fence_get_rcu(fence)) { + rcu_read_unlock(); + return 0; } + rcu_read_unlock();
if (!timeout) { /* no-wait: */ - ret = fence_completed(fctx, fence) ? 0 : -EBUSY; + ret = dma_fence_is_signaled(fence) ? 0 : -EBUSY; } else { unsigned long remaining_jiffies = timeout_to_jiffies(timeout);
- if (interruptible) - ret = wait_event_interruptible_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); - else - ret = wait_event_timeout(fctx->event, - fence_completed(fctx, fence), - remaining_jiffies); + ret = dma_fence_wait_timeout(fence, true, remaining_jiffies);
if (ret == 0) { DBG("timeout waiting for fence: %u (completed: %u)", - fence, fctx->completed_fence); + fence_id, ring->memptrs->fence); ret = -ETIMEDOUT; } else if (ret != -ERESTARTSYS) { ret = 0; } }
+ dma_fence_put(fence); return ret; }
diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h index b9fe059..a8133f7 100644 --- a/drivers/gpu/drm/msm/msm_fence.h +++ b/drivers/gpu/drm/msm/msm_fence.h @@ -19,6 +19,7 @@ #define __MSM_FENCE_H__
#include "msm_drv.h" +#include "msm_ringbuffer.h"
struct msm_fence_context { struct drm_device *dev; @@ -35,8 +36,8 @@ struct msm_fence_context * msm_fence_context_alloc(struct drm_device *dev, const char *name); void msm_fence_context_free(struct msm_fence_context *fctx);
-int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence, - ktime_t *timeout, bool interruptible); +int msm_wait_fence(struct msm_ringbuffer *ring, uint32_t fence_id, + ktime_t *timeout); void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence);
struct dma_fence * msm_fence_alloc(struct msm_fence_context *fctx); diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index c5d9bd3..287f974 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -143,6 +143,7 @@ struct msm_gem_submit { struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ struct dma_fence *fence; + int out_fence_id; struct msm_gpu_submitqueue *queue; struct pid *pid; /* submitting process */ bool valid; /* true if no cmdstream patching needed */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 7bd83e0..00e6347 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -48,6 +48,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->dev = dev; submit->gpu = gpu; submit->fence = NULL; + submit->out_fence_id = -1; submit->pid = get_pid(task_pid(current)); submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; @@ -66,6 +67,8 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
void msm_gem_submit_free(struct msm_gem_submit *submit) { + idr_remove(&submit->ring->fence_idr, submit->out_fence_id); + dma_fence_put(submit->fence); list_del(&submit->node); put_pid(submit->pid); @@ -557,26 +560,33 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
submit->nr_cmds = i;
- submit->fence = msm_fence_alloc(ring->fctx); + msm_gpu_submit(gpu, submit, ctx); if (IS_ERR(submit->fence)) { ret = PTR_ERR(submit->fence); submit->fence = NULL; goto out; }
+ /* + * No protection should be okay here since this is protected by the big + * GPU lock. + */ + submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr, submit->fence, + 0, INT_MAX, GFP_KERNEL); + + if (submit->out_fence_id < 0) { + ret = -ENOMEM; + goto out; + } + + args->fence = submit->out_fence_id; + if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { sync_file = sync_file_create(submit->fence); if (!sync_file) { ret = -ENOMEM; goto out; } - } - - msm_gpu_submit(gpu, submit, ctx); - - args->fence = submit->fence->seqno; - - if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { fd_install(out_fence_fd, sync_file->file); args->fence_fd = out_fence_fd; } diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 1c09acf..eb67172 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -602,8 +602,8 @@ void msm_gpu_retire(struct msm_gpu *gpu) }
/* add bo's to gpu's ring, and kick gpu: */ -void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx) +struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu, + struct msm_gem_submit *submit, struct msm_file_private *ctx) { struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; @@ -612,6 +612,10 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+ submit->fence = msm_fence_alloc(ring->fctx); + if (IS_ERR(submit->fence)) + return submit->fence; + pm_runtime_get_sync(&gpu->pdev->dev);
msm_gpu_hw_init(gpu); @@ -648,6 +652,8 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, priv->lastctx = ctx;
hangcheck_timer_reset(gpu); + + return submit->fence; }
/* diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index b824117..b562b25 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -235,8 +235,8 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
void msm_gpu_retire(struct msm_gpu *gpu); -void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx); +struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu, + struct msm_gem_submit *submit, struct msm_file_private *ctx);
int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 6f5295b..734f2b8 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -59,6 +59,8 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
ring->fctx = msm_fence_context_alloc(gpu->dev, name);
+ idr_init(&ring->fence_idr); + return ring;
fail: @@ -78,5 +80,8 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring) msm_gem_put_vaddr(ring->bo); drm_gem_object_put_unlocked(ring->bo); } + + idr_destroy(&ring->fence_idr); + kfree(ring); } diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index cffce09..b74a0a9 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -40,6 +40,7 @@ struct msm_ringbuffer { struct msm_rbmemptrs *memptrs; uint64_t memptrs_iova; struct msm_fence_context *fctx; + struct idr fence_idr; spinlock_t lock; };
On Mon, Oct 01, 2018 at 06:01:33PM +0530, Sharat Masetty wrote:
Track the GPU fences created at submit time with an idr instead of the ring sequence number. This helps with easily changing the underlying fence to something we don't truly own, like the scheduler fence.
Also move the GPU fence allocation to msm_gpu_submit() and have the function return the fence. This suits well when integrating with the GPU scheduler.
Additionally remove the non-interruptible wait variant from msm_wait_fence() as it is not used.
So basically this is just propping up the msm_wait_fence ioctl a bit more? At what point should we just deprecate that bad boy and move on with our lives?
Signed-off-by: Sharat Masetty smasetty@codeaurora.org
drivers/gpu/drm/msm/msm_drv.c | 3 +-- drivers/gpu/drm/msm/msm_fence.c | 30 ++++++++++++++---------------- drivers/gpu/drm/msm/msm_fence.h | 5 +++-- drivers/gpu/drm/msm/msm_gem.h | 1 + drivers/gpu/drm/msm/msm_gem_submit.c | 26 ++++++++++++++++++-------- drivers/gpu/drm/msm/msm_gpu.c | 10 ++++++++-- drivers/gpu/drm/msm/msm_gpu.h | 4 ++-- drivers/gpu/drm/msm/msm_ringbuffer.c | 5 +++++ drivers/gpu/drm/msm/msm_ringbuffer.h | 1 + 9 files changed, 53 insertions(+), 32 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 021a0b6..8eaa1bd 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -746,8 +746,7 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, if (!queue) return -ENOENT;
- ret = msm_wait_fence(gpu->rb[queue->prio]->fctx, args->fence, &timeout,
true);
ret = msm_wait_fence(gpu->rb[queue->prio], args->fence, &timeout);
msm_submitqueue_put(queue); return ret;
diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index 349c12f..0e7912b 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -50,41 +50,39 @@ static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fenc }
/* legacy path for WAIT_FENCE ioctl: */ -int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence,
ktime_t *timeout, bool interruptible)
+int msm_wait_fence(struct msm_ringbuffer *ring, uint32_t fence_id,
ktime_t *timeout)
{
- struct dma_fence *fence; int ret;
- if (fence > fctx->last_fence) {
DRM_ERROR("%s: waiting on invalid fence: %u (of %u)\n",
fctx->name, fence, fctx->last_fence);
return -EINVAL;
rcu_read_lock();
fence = idr_find(&ring->fence_idr, fence_id);
if (!fence || !dma_fence_get_rcu(fence)) {
rcu_read_unlock();
return 0;
}
rcu_read_unlock();
if (!timeout) { /* no-wait: */
ret = fence_completed(fctx, fence) ? 0 : -EBUSY;
} else { unsigned long remaining_jiffies = timeout_to_jiffies(timeout);ret = dma_fence_is_signaled(fence) ? 0 : -EBUSY;
if (interruptible)
ret = wait_event_interruptible_timeout(fctx->event,
fence_completed(fctx, fence),
remaining_jiffies);
else
ret = wait_event_timeout(fctx->event,
fence_completed(fctx, fence),
remaining_jiffies);
ret = dma_fence_wait_timeout(fence, true, remaining_jiffies);
if (ret == 0) { DBG("timeout waiting for fence: %u (completed: %u)",
fence, fctx->completed_fence);
fence_id, ring->memptrs->fence); ret = -ETIMEDOUT;
} else if (ret != -ERESTARTSYS) { ret = 0; } }
dma_fence_put(fence); return ret;
}
diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h index b9fe059..a8133f7 100644 --- a/drivers/gpu/drm/msm/msm_fence.h +++ b/drivers/gpu/drm/msm/msm_fence.h @@ -19,6 +19,7 @@ #define __MSM_FENCE_H__
#include "msm_drv.h" +#include "msm_ringbuffer.h"
struct msm_fence_context { struct drm_device *dev; @@ -35,8 +36,8 @@ struct msm_fence_context * msm_fence_context_alloc(struct drm_device *dev, const char *name); void msm_fence_context_free(struct msm_fence_context *fctx);
-int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence,
ktime_t *timeout, bool interruptible);
+int msm_wait_fence(struct msm_ringbuffer *ring, uint32_t fence_id,
ktime_t *timeout);
void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence);
struct dma_fence * msm_fence_alloc(struct msm_fence_context *fctx); diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index c5d9bd3..287f974 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -143,6 +143,7 @@ struct msm_gem_submit { struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ struct dma_fence *fence;
- int out_fence_id; struct msm_gpu_submitqueue *queue; struct pid *pid; /* submitting process */ bool valid; /* true if no cmdstream patching needed */
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 7bd83e0..00e6347 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -48,6 +48,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->dev = dev; submit->gpu = gpu; submit->fence = NULL;
- submit->out_fence_id = -1; submit->pid = get_pid(task_pid(current)); submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue;
@@ -66,6 +67,8 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
void msm_gem_submit_free(struct msm_gem_submit *submit) {
- idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
- dma_fence_put(submit->fence); list_del(&submit->node); put_pid(submit->pid);
@@ -557,26 +560,33 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
submit->nr_cmds = i;
- submit->fence = msm_fence_alloc(ring->fctx);
msm_gpu_submit(gpu, submit, ctx); if (IS_ERR(submit->fence)) { ret = PTR_ERR(submit->fence); submit->fence = NULL; goto out; }
/*
* No protection should be okay here since this is protected by the big
* GPU lock.
*/
submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr, submit->fence,
0, INT_MAX, GFP_KERNEL);
if (submit->out_fence_id < 0) {
ret = -ENOMEM;
goto out;
}
args->fence = submit->out_fence_id;
if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { sync_file = sync_file_create(submit->fence); if (!sync_file) { ret = -ENOMEM; goto out; }
- }
- msm_gpu_submit(gpu, submit, ctx);
- args->fence = submit->fence->seqno;
- if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { fd_install(out_fence_fd, sync_file->file); args->fence_fd = out_fence_fd; }
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 1c09acf..eb67172 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -602,8 +602,8 @@ void msm_gpu_retire(struct msm_gpu *gpu) }
/* add bo's to gpu's ring, and kick gpu: */ -void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_file_private *ctx)
+struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu,
struct msm_gem_submit *submit, struct msm_file_private *ctx)
{ struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; @@ -612,6 +612,10 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
WARN_ON(!mutex_is_locked(&dev->struct_mutex));
submit->fence = msm_fence_alloc(ring->fctx);
if (IS_ERR(submit->fence))
return submit->fence;
pm_runtime_get_sync(&gpu->pdev->dev);
msm_gpu_hw_init(gpu);
@@ -648,6 +652,8 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, priv->lastctx = ctx;
hangcheck_timer_reset(gpu);
- return submit->fence;
}
/* diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index b824117..b562b25 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -235,8 +235,8 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
void msm_gpu_retire(struct msm_gpu *gpu); -void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_file_private *ctx);
+struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu,
struct msm_gem_submit *submit, struct msm_file_private *ctx);
int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 6f5295b..734f2b8 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -59,6 +59,8 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
ring->fctx = msm_fence_context_alloc(gpu->dev, name);
- idr_init(&ring->fence_idr);
- return ring;
fail: @@ -78,5 +80,8 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring) msm_gem_put_vaddr(ring->bo); drm_gem_object_put_unlocked(ring->bo); }
- idr_destroy(&ring->fence_idr);
- kfree(ring);
} diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index cffce09..b74a0a9 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -40,6 +40,7 @@ struct msm_ringbuffer { struct msm_rbmemptrs *memptrs; uint64_t memptrs_iova; struct msm_fence_context *fctx;
- struct idr fence_idr; spinlock_t lock;
};
-- 1.9.1
On Mon, Oct 1, 2018 at 3:04 PM Jordan Crouse jcrouse@codeaurora.org wrote:
On Mon, Oct 01, 2018 at 06:01:33PM +0530, Sharat Masetty wrote:
Track the GPU fences created at submit time with an idr instead of the ring sequence number. This helps with easily changing the underlying fence to something we don't truly own, like the scheduler fence.
Also move the GPU fence allocation to msm_gpu_submit() and have the function return the fence. This suits well when integrating with the GPU scheduler.
Additionally remove the non-interruptible wait variant from msm_wait_fence() as it is not used.
So basically this is just propping up the msm_wait_fence ioctl a bit more? At what point should we just deprecate that bad boy and move on with our lives?
As I mentioned on IRC, wait_fence ioctl is still in use, so unfortunately I don't think we can just deprecate it. I guess it is worth some profiling, and if this shows up as enough overhead to care about perhaps we can introduce a submit ioctl flag to disable "old" fences.
Personally, my guestimate about what we want to do for performance from a kernel standpoint is more towards introducing 64b reloc's (rather than using 2x 32b relocs), or possibly skip straight ahead to something like i915's softpin.. there are a couple other things that I'd started on a few months back to reduce userspace draw-overhead that I think I need to pick back up, but reloc overhead is something near the top of the list. (Reloc and submit overhead is partially mitigated with freedreno, since the way the driver is structured it gets pushed off to a background thread, but still I think there is some room for improvement.)
BR, -R
When the scheduler comes into picture, the msm_gpu_submit() will be called from the scheduler worker thread. So save the necessary information into msm job structure for use at the actual GPU submission time. This also simplifies the msm_gpu_submit() API.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/msm_gem.h | 1 + drivers/gpu/drm/msm/msm_gem_submit.c | 8 +++++--- drivers/gpu/drm/msm/msm_gpu.c | 8 ++++---- drivers/gpu/drm/msm/msm_gpu.h | 3 +-- 4 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 287f974..289abe5 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -137,6 +137,7 @@ enum msm_gem_lock { */ struct msm_gem_submit { struct drm_device *dev; + struct msm_file_private *ctx; struct msm_gpu *gpu; struct list_head node; /* node in ring submit list */ struct list_head bo_list; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 00e6347..14906b9 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -32,7 +32,7 @@
static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, struct msm_gpu_submitqueue *queue, - uint32_t nr_bos, uint32_t nr_cmds) + uint32_t nr_bos, uint32_t nr_cmds, struct msm_file_private *ctx) { struct msm_gem_submit *submit; uint64_t sz = sizeof(*submit) + ((u64)nr_bos * sizeof(submit->bos[0])) + @@ -47,6 +47,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
submit->dev = dev; submit->gpu = gpu; + submit->ctx = ctx; submit->fence = NULL; submit->out_fence_id = -1; submit->pid = get_pid(task_pid(current)); @@ -474,7 +475,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, } }
- submit = submit_create(dev, gpu, queue, args->nr_bos, args->nr_cmds); + submit = submit_create(dev, gpu, queue, args->nr_bos, + args->nr_cmds, ctx); if (!submit) { ret = -ENOMEM; goto out_unlock; @@ -560,7 +562,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
submit->nr_cmds = i;
- msm_gpu_submit(gpu, submit, ctx); + msm_gpu_submit(submit); if (IS_ERR(submit->fence)) { ret = PTR_ERR(submit->fence); submit->fence = NULL; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index eb67172..5f9cd85 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -602,9 +602,9 @@ void msm_gpu_retire(struct msm_gpu *gpu) }
/* add bo's to gpu's ring, and kick gpu: */ -struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu, - struct msm_gem_submit *submit, struct msm_file_private *ctx) +struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) { + struct msm_gpu *gpu = submit->gpu; struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; struct msm_ringbuffer *ring = submit->ring; @@ -648,8 +648,8 @@ struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu, msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->fence); }
- gpu->funcs->submit(gpu, submit, ctx); - priv->lastctx = ctx; + gpu->funcs->submit(gpu, submit, submit->ctx); + priv->lastctx = submit->ctx;
hangcheck_timer_reset(gpu);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index b562b25..dd55979 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -235,8 +235,7 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
void msm_gpu_retire(struct msm_gpu *gpu); -struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu, - struct msm_gem_submit *submit, struct msm_file_private *ctx); +struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit);
int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
On Mon, Oct 01, 2018 at 06:01:34PM +0530, Sharat Masetty wrote:
When the scheduler comes into picture, the msm_gpu_submit() will be called from the scheduler worker thread. So save the necessary information into msm job structure for use at the actual GPU submission time. This also simplifies the msm_gpu_submit() API.
When I read this, I immediately thought you were changing the format of the submit ioctl struct, but really you are just changing the parameters of the functions - this isn't an API by definition because it isn't an interface to anything else. Words like API get people all worked up - I would recommend rewording this to be less scary / more accurate.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org
drivers/gpu/drm/msm/msm_gem.h | 1 + drivers/gpu/drm/msm/msm_gem_submit.c | 8 +++++--- drivers/gpu/drm/msm/msm_gpu.c | 8 ++++---- drivers/gpu/drm/msm/msm_gpu.h | 3 +-- 4 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 287f974..289abe5 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -137,6 +137,7 @@ enum msm_gem_lock { */ struct msm_gem_submit { struct drm_device *dev;
- struct msm_file_private *ctx; struct msm_gpu *gpu; struct list_head node; /* node in ring submit list */ struct list_head bo_list;
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 00e6347..14906b9 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -32,7 +32,7 @@
static struct msm_gem_submit *submit_create(struct drm_device *dev, struct msm_gpu *gpu, struct msm_gpu_submitqueue *queue,
uint32_t nr_bos, uint32_t nr_cmds)
uint32_t nr_bos, uint32_t nr_cmds, struct msm_file_private *ctx)
submit_create() is a catch-22. If I saw all that code inline I would probably want it to be a sub function too, but on the other hand we're steadily adding new parameters to it and adding new lines of code that the calling function doesn't need to translate.
{ struct msm_gem_submit *submit; uint64_t sz = sizeof(*submit) + ((u64)nr_bos * sizeof(submit->bos[0])) + @@ -47,6 +47,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
submit->dev = dev; submit->gpu = gpu;
- submit->ctx = ctx; submit->fence = NULL; submit->out_fence_id = -1; submit->pid = get_pid(task_pid(current));
@@ -474,7 +475,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, } }
- submit = submit_create(dev, gpu, queue, args->nr_bos, args->nr_cmds);
- submit = submit_create(dev, gpu, queue, args->nr_bos,
args->nr_cmds, ctx);
Like maybe we can compromise and pass in args directly to submit_create to help offset the new parameters.
if (!submit) { ret = -ENOMEM; goto out_unlock; @@ -560,7 +562,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
submit->nr_cmds = i;
- msm_gpu_submit(gpu, submit, ctx);
- msm_gpu_submit(submit); if (IS_ERR(submit->fence)) { ret = PTR_ERR(submit->fence); submit->fence = NULL;
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index eb67172..5f9cd85 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -602,9 +602,9 @@ void msm_gpu_retire(struct msm_gpu *gpu) }
/* add bo's to gpu's ring, and kick gpu: */ -struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu,
struct msm_gem_submit *submit, struct msm_file_private *ctx)
+struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) {
- struct msm_gpu *gpu = submit->gpu; struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; struct msm_ringbuffer *ring = submit->ring;
@@ -648,8 +648,8 @@ struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu, msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->fence); }
- gpu->funcs->submit(gpu, submit, ctx);
- priv->lastctx = ctx;
gpu->funcs->submit(gpu, submit, submit->ctx);
priv->lastctx = submit->ctx;
hangcheck_timer_reset(gpu);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index b562b25..dd55979 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -235,8 +235,7 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
void msm_gpu_retire(struct msm_gpu *gpu); -struct dma_fence *msm_gpu_submit(struct msm_gpu *gpu,
struct msm_gem_submit *submit, struct msm_file_private *ctx);
+struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit);
int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, -- 1.9.1
The scheduler needs an instance name mostly for debug purposes. Save the name in the ringbuffer instead of a stack variable, so that the name can be shared with the scheduler.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/msm_ringbuffer.c | 5 ++--- drivers/gpu/drm/msm/msm_ringbuffer.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 734f2b8..0889766 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -22,7 +22,6 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, void *memptrs, uint64_t memptrs_iova) { struct msm_ringbuffer *ring; - char name[32]; int ret;
/* We assume everwhere that MSM_GPU_RINGBUFFER_SZ is a power of 2 */ @@ -55,9 +54,9 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, INIT_LIST_HEAD(&ring->submits); spin_lock_init(&ring->lock);
- snprintf(name, sizeof(name), "gpu-ring-%d", ring->id); + snprintf(ring->name, sizeof(ring->name), "msm-gpu-ring-%d", ring->id);
- ring->fctx = msm_fence_context_alloc(gpu->dev, name); + ring->fctx = msm_fence_context_alloc(gpu->dev, ring->name);
idr_init(&ring->fence_idr);
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index b74a0a9..523373b 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -31,6 +31,7 @@ struct msm_rbmemptrs { struct msm_ringbuffer { struct msm_gpu *gpu; int id; + char name[16]; struct drm_gem_object *bo; uint32_t *start, *end, *cur, *next; struct list_head submits;
On Mon, Oct 01, 2018 at 06:01:35PM +0530, Sharat Masetty wrote:
The scheduler needs an instance name mostly for debug purposes. Save the name in the ringbuffer instead of a stack variable, so that the name can be shared with the scheduler.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org
drivers/gpu/drm/msm/msm_ringbuffer.c | 5 ++--- drivers/gpu/drm/msm/msm_ringbuffer.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 734f2b8..0889766 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -22,7 +22,6 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, void *memptrs, uint64_t memptrs_iova) { struct msm_ringbuffer *ring;
char name[32]; int ret;
/* We assume everwhere that MSM_GPU_RINGBUFFER_SZ is a power of 2 */
@@ -55,9 +54,9 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, INIT_LIST_HEAD(&ring->submits); spin_lock_init(&ring->lock);
- snprintf(name, sizeof(name), "gpu-ring-%d", ring->id);
- snprintf(ring->name, sizeof(ring->name), "msm-gpu-ring-%d", ring->id);
Okay I guess, but can't we just generate this on the fly when the scheduler needs it? It's not like the name is random or anything.
- ring->fctx = msm_fence_context_alloc(gpu->dev, name);
ring->fctx = msm_fence_context_alloc(gpu->dev, ring->name);
idr_init(&ring->fence_idr);
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index b74a0a9..523373b 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -31,6 +31,7 @@ struct msm_rbmemptrs { struct msm_ringbuffer { struct msm_gpu *gpu; int id;
- char name[16]; struct drm_gem_object *bo; uint32_t *start, *end, *cur, *next; struct list_head submits;
-- 1.9.1
We are soon going to deal with a lot more fences, so change the generic name 'fence' to 'hw_fence' to denote its association with an actual hardware submission.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/msm_gem.h | 2 +- drivers/gpu/drm/msm/msm_gem_submit.c | 17 ++++++++--------- drivers/gpu/drm/msm/msm_gpu.c | 16 ++++++++-------- 3 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 289abe5..cae3aa5 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -143,7 +143,7 @@ struct msm_gem_submit { struct list_head bo_list; struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ - struct dma_fence *fence; + struct dma_fence *hw_fence; int out_fence_id; struct msm_gpu_submitqueue *queue; struct pid *pid; /* submitting process */ diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 14906b9..fd28ace 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -48,7 +48,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit->dev = dev; submit->gpu = gpu; submit->ctx = ctx; - submit->fence = NULL; + submit->hw_fence = NULL; submit->out_fence_id = -1; submit->pid = get_pid(task_pid(current)); submit->cmd = (void *)&submit->bos[nr_bos]; @@ -70,7 +70,7 @@ void msm_gem_submit_free(struct msm_gem_submit *submit) { idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
- dma_fence_put(submit->fence); + dma_fence_put(submit->hw_fence); list_del(&submit->node); put_pid(submit->pid); msm_submitqueue_put(submit->queue); @@ -563,9 +563,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit->nr_cmds = i;
msm_gpu_submit(submit); - if (IS_ERR(submit->fence)) { - ret = PTR_ERR(submit->fence); - submit->fence = NULL; + if (IS_ERR(submit->hw_fence)) { + ret = PTR_ERR(submit->hw_fence); + submit->hw_fence = NULL; goto out; }
@@ -573,9 +573,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, * No protection should be okay here since this is protected by the big * GPU lock. */ - submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr, submit->fence, - 0, INT_MAX, GFP_KERNEL); - + submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr, + submit->hw_fence, 0, INT_MAX, GFP_KERNEL); if (submit->out_fence_id < 0) { ret = -ENOMEM; goto out; @@ -584,7 +583,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, args->fence = submit->out_fence_id;
if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - sync_file = sync_file_create(submit->fence); + sync_file = sync_file_create(submit->hw_fence); if (!sync_file) { ret = -ENOMEM; goto out; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 5f9cd85..449cc23 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -287,7 +287,7 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring, break;
msm_update_fence(submit->ring->fctx, - submit->fence->seqno); + submit->hw_fence->seqno); } }
@@ -573,7 +573,7 @@ static void retire_submits(struct msm_gpu *gpu) struct msm_ringbuffer *ring = gpu->rb[i];
list_for_each_entry_safe(submit, tmp, &ring->submits, node) { - if (dma_fence_is_signaled(submit->fence)) + if (dma_fence_is_signaled(submit->hw_fence)) retire_submit(gpu, submit); } } @@ -612,9 +612,9 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
WARN_ON(!mutex_is_locked(&dev->struct_mutex));
- submit->fence = msm_fence_alloc(ring->fctx); - if (IS_ERR(submit->fence)) - return submit->fence; + submit->hw_fence = msm_fence_alloc(ring->fctx); + if (IS_ERR(submit->hw_fence)) + return submit->hw_fence;
pm_runtime_get_sync(&gpu->pdev->dev);
@@ -643,9 +643,9 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) submit->gpu->aspace, &iova);
if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE) - msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->fence); + msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->hw_fence); else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ) - msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->fence); + msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->hw_fence); }
gpu->funcs->submit(gpu, submit, submit->ctx); @@ -653,7 +653,7 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
hangcheck_timer_reset(gpu);
- return submit->fence; + return submit->hw_fence; }
/*
The submit path in msm_gpu_submit() takes a reference to the buffer object and the iova. This should not be needed with a little bit of code rearrangement. We still keep the same semantic of a valid GPU submission holding a reference to the base object and the iova until retirement.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/msm_gem_submit.c | 20 ++++++++++++++++++-- drivers/gpu/drm/msm/msm_gpu.c | 8 -------- 2 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index fd28ace..a7c8cbc 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -68,9 +68,21 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
void msm_gem_submit_free(struct msm_gem_submit *submit) { + int i; + idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
dma_fence_put(submit->hw_fence); + + for (i = 0; i < submit->nr_bos; i++) { + struct msm_gem_object *msm_obj = submit->bos[i].obj; + + if (submit->bos[i].flags & BO_PINNED) + msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace); + + drm_gem_object_put(&msm_obj->base); + } + list_del(&submit->node); put_pid(submit->pid); msm_submitqueue_put(submit->queue); @@ -398,9 +410,13 @@ static void submit_cleanup(struct msm_gem_submit *submit)
for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; - submit_unlock_unpin_bo(submit, i, false); + + if (submit->bos[i].flags & BO_LOCKED) { + ww_mutex_unlock(&msm_obj->resv->lock); + submit->bos[i].flags &= ~BO_LOCKED; + } + list_del_init(&msm_obj->submit_entry); - drm_gem_object_unreference(&msm_obj->base); }
ww_acquire_fini(&submit->ticket); diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 449cc23..cd5fe49 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -551,8 +551,6 @@ static void retire_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) struct msm_gem_object *msm_obj = submit->bos[i].obj; /* move to inactive: */ msm_gem_move_to_inactive(&msm_obj->base); - msm_gem_put_iova(&msm_obj->base, gpu->aspace); - drm_gem_object_put(&msm_obj->base); }
pm_runtime_mark_last_busy(&gpu->pdev->dev); @@ -630,18 +628,12 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit)
for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; - uint64_t iova;
/* can't happen yet.. but when we add 2d support we'll have * to deal w/ cross-ring synchronization: */ WARN_ON(is_active(msm_obj) && (msm_obj->gpu != gpu));
- /* submit takes a reference to the bo and iova until retired: */ - drm_gem_object_get(&msm_obj->base); - msm_gem_get_iova(&msm_obj->base, - submit->gpu->aspace, &iova); - if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE) msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->hw_fence); else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ)
This patch changes to kzalloc and avoids setting individual submit struct fields to zero manually.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/msm_gem_submit.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index a7c8cbc..7931c2a 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -41,24 +41,19 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, if (sz > SIZE_MAX) return NULL;
- submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + submit = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!submit) return NULL;
submit->dev = dev; submit->gpu = gpu; submit->ctx = ctx; - submit->hw_fence = NULL; submit->out_fence_id = -1; submit->pid = get_pid(task_pid(current)); submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; submit->ring = gpu->rb[queue->prio];
- /* initially, until copy_from_user() and bo lookup succeeds: */ - submit->nr_bos = 0; - submit->nr_cmds = 0; - INIT_LIST_HEAD(&submit->node); INIT_LIST_HEAD(&submit->bo_list); ww_acquire_init(&submit->ticket, &reservation_ww_class);
On Mon, Oct 01, 2018 at 06:01:38PM +0530, Sharat Masetty wrote:
This patch changes to kzalloc and avoids setting individual submit struct fields to zero manually.
I don't think this one is worth it. There are so many members in submit and so few that get reset to 0 - I don't think the extra cycles are worth it in this fast path.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org
drivers/gpu/drm/msm/msm_gem_submit.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index a7c8cbc..7931c2a 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -41,24 +41,19 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, if (sz > SIZE_MAX) return NULL;
- submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
submit = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!submit) return NULL;
submit->dev = dev; submit->gpu = gpu; submit->ctx = ctx;
submit->hw_fence = NULL; submit->out_fence_id = -1; submit->pid = get_pid(task_pid(current)); submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; submit->ring = gpu->rb[queue->prio];
/* initially, until copy_from_user() and bo lookup succeeds: */
submit->nr_bos = 0;
submit->nr_cmds = 0;
INIT_LIST_HEAD(&submit->node); INIT_LIST_HEAD(&submit->bo_list); ww_acquire_init(&submit->ticket, &reservation_ww_class);
-- 1.9.1
On 10/1/2018 11:43 PM, Jordan Crouse wrote:
On Mon, Oct 01, 2018 at 06:01:38PM +0530, Sharat Masetty wrote:
This patch changes to kzalloc and avoids setting individual submit struct fields to zero manually.
I don't think this one is worth it. There are so many members in submit and so few that get reset to 0 - I don't think the extra cycles are worth it in this fast path.
The patch "drm/msm: Use the DRM common Scheduler" adds a few more fields to the bo struct. If we do not use kzalloc, then I will have to run a loop or at least memset the bos area to 0.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org
drivers/gpu/drm/msm/msm_gem_submit.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index a7c8cbc..7931c2a 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -41,24 +41,19 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, if (sz > SIZE_MAX) return NULL;
- submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
submit = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!submit) return NULL;
submit->dev = dev; submit->gpu = gpu; submit->ctx = ctx;
submit->hw_fence = NULL; submit->out_fence_id = -1; submit->pid = get_pid(task_pid(current)); submit->cmd = (void *)&submit->bos[nr_bos]; submit->queue = queue; submit->ring = gpu->rb[queue->prio];
/* initially, until copy_from_user() and bo lookup succeeds: */
submit->nr_bos = 0;
submit->nr_cmds = 0;
INIT_LIST_HEAD(&submit->node); INIT_LIST_HEAD(&submit->bo_list); ww_acquire_init(&submit->ticket, &reservation_ww_class);
-- 1.9.1
This patch fixes a trivial leak when trying to create a submitqueue.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org Reviewed-by: Jordan Crouse jcrouse@codeaurora.org --- drivers/gpu/drm/msm/msm_submitqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index 5115f75..325da44 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -78,8 +78,10 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, queue->flags = flags;
if (priv->gpu) { - if (prio >= priv->gpu->nr_rings) + if (prio >= priv->gpu->nr_rings) { + kfree(queue); return -EINVAL; + }
queue->prio = prio; }
In cases where the scheduler instance is used as a base object of another vendor driver object, it's not clear if the driver can call sched cleanup on the fail path. Set sched->thread to NULL, so that the vendor driver can safely call drm_sched_fini() during cleanup.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/scheduler/gpu_scheduler.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 44d4807..bf0e0c9 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -760,7 +760,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, long timeout, const char *name) { - int i; + int i, ret; sched->ops = ops; sched->hw_submission_limit = hw_submission; sched->name = name; @@ -779,8 +779,10 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, /* Each scheduler will run on a seperate kernel thread */ sched->thread = kthread_run(drm_sched_main, sched, sched->name); if (IS_ERR(sched->thread)) { + ret = PTR_ERR(sched->thread); + sched->thread = NULL; DRM_ERROR("Failed to create scheduler for %s.\n", name); - return PTR_ERR(sched->thread); + return ret; }
return 0;
This patch hooks up the DRM gpu scheduler to the msm DRM driver. The most noticeable changes to the driver are as follows. The submit path is split into two parts: in the user context the submit (job) is created and added to one of the entity's scheduler run queues. The scheduler then tries to drain the queue by submitting the jobs to the hardware to act upon. The submit job sits on the scheduler queue until all the dependent fences are waited upon successfully.
We have one scheduler instance per ring. The submitqueues will host a scheduler entity object. This entity will be mapped to the scheduler's default runqueue. This should be good for now, but in future it is possible to extend the API to allow for scheduling amongst the submitqueues on the same ring.
With this patch the scope of the struct_mutex lock has been greatly reduced in the submit path; it is now taken only when actually putting the job on the ringbuffer. This should enable increased parallelism in the driver, which should hopefully translate to better overall performance.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/Makefile | 3 +- drivers/gpu/drm/msm/msm_drv.h | 3 +- drivers/gpu/drm/msm/msm_gem.c | 8 +- drivers/gpu/drm/msm/msm_gem.h | 6 + drivers/gpu/drm/msm/msm_gem_submit.c | 138 +++++++++------ drivers/gpu/drm/msm/msm_gpu.c | 72 ++++++-- drivers/gpu/drm/msm/msm_gpu.h | 2 + drivers/gpu/drm/msm/msm_ringbuffer.c | 7 + drivers/gpu/drm/msm/msm_ringbuffer.h | 3 + drivers/gpu/drm/msm/msm_sched.c | 313 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/msm/msm_sched.h | 18 ++ drivers/gpu/drm/msm/msm_submitqueue.c | 18 +- 13 files changed, 507 insertions(+), 85 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_sched.c create mode 100644 drivers/gpu/drm/msm/msm_sched.h
diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig index 38cbde9..0d01810 100644 --- a/drivers/gpu/drm/msm/Kconfig +++ b/drivers/gpu/drm/msm/Kconfig @@ -15,6 +15,7 @@ config DRM_MSM select SND_SOC_HDMI_CODEC if SND_SOC select SYNC_FILE select PM_OPP + select DRM_SCHED default y help DRM/KMS driver for MSM/snapdragon. diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index cd40c05..71ed921 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -60,7 +60,8 @@ msm-y := \ msm_perf.o \ msm_rd.o \ msm_ringbuffer.o \ - msm_submitqueue.o + msm_submitqueue.o \ + msm_sched.o
msm-$(CONFIG_DEBUG_FS) += adreno/a5xx_debugfs.o
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index b2da1fb..e461a9c 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -213,8 +213,7 @@ struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev, int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); int msm_gem_sync_object(struct drm_gem_object *obj, struct msm_fence_context *fctx, bool exclusive); -void msm_gem_move_to_active(struct drm_gem_object *obj, - struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence); +void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu); void msm_gem_move_to_inactive(struct drm_gem_object *obj); int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); int msm_gem_cpu_fini(struct drm_gem_object *obj); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index f583bb4..7a12f30 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -663,16 +663,12 @@ int msm_gem_sync_object(struct drm_gem_object *obj, return 0; }
-void msm_gem_move_to_active(struct drm_gem_object *obj, - struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence) +void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu) { struct msm_gem_object *msm_obj = to_msm_bo(obj); WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); msm_obj->gpu = gpu; - if (exclusive) - reservation_object_add_excl_fence(msm_obj->resv, fence); - else - reservation_object_add_shared_fence(msm_obj->resv, fence); + list_del_init(&msm_obj->mm_list); list_add_tail(&msm_obj->mm_list, &gpu->active_list); } diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index cae3aa5..8c6269f 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -20,6 +20,7 @@
#include <linux/kref.h> #include <linux/reservation.h> +#include <drm/gpu_scheduler.h> #include "msm_drv.h"
/* Additional internal-use only BO flags: */ @@ -136,6 +137,7 @@ enum msm_gem_lock { * lasts for the duration of the submit-ioctl. */ struct msm_gem_submit { + struct drm_sched_job sched_job; struct drm_device *dev; struct msm_file_private *ctx; struct msm_gpu *gpu; @@ -144,6 +146,7 @@ struct msm_gem_submit { struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ struct dma_fence *hw_fence; + struct dma_fence *in_fence, *out_fence; int out_fence_id; struct msm_gpu_submitqueue *queue; struct pid *pid; /* submitting process */ @@ -162,6 +165,9 @@ struct msm_gem_submit { uint32_t flags; struct msm_gem_object *obj; uint64_t iova; + struct dma_fence *excl; + uint32_t nr_shared; + struct dma_fence **shared; } bos[0]; };
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 7931c2a..dff945c 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -65,23 +65,37 @@ void msm_gem_submit_free(struct msm_gem_submit *submit) { int i;
+ mutex_lock(&submit->ring->fence_idr_lock); idr_remove(&submit->ring->fence_idr, submit->out_fence_id); - - dma_fence_put(submit->hw_fence); + mutex_unlock(&submit->ring->fence_idr_lock);
for (i = 0; i < submit->nr_bos; i++) { + int j; + struct msm_gem_object *msm_obj = submit->bos[i].obj;
if (submit->bos[i].flags & BO_PINNED) msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace);
- drm_gem_object_put(&msm_obj->base); + drm_gem_object_put_unlocked(&msm_obj->base); + + /* + * While we are at it, clear the saved exclusive and shared + * fences if any + */ + dma_fence_put(submit->bos[i].excl); + + for (j = 0; j < submit->bos[i].nr_shared; j++) + dma_fence_put(submit->bos[i].shared[j]); + + kfree(submit->bos[i].shared); }
- list_del(&submit->node); put_pid(submit->pid); msm_submitqueue_put(submit->queue);
+ dma_fence_put(submit->in_fence); + dma_fence_put(submit->out_fence); kfree(submit); }
@@ -238,6 +252,27 @@ static int submit_lock_objects(struct msm_gem_submit *submit) return ret; }
+static void submit_attach_fences_and_unlock(struct msm_gem_submit *submit) +{ + int i; + + for (i = 0; i < submit->nr_bos; i++) { + struct msm_gem_object *msm_obj = submit->bos[i].obj; + + if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE) + reservation_object_add_excl_fence(msm_obj->resv, + submit->out_fence); + else + reservation_object_add_shared_fence(msm_obj->resv, + submit->out_fence); + + if (submit->bos[i].flags & BO_LOCKED) { + ww_mutex_unlock(&msm_obj->resv->lock); + submit->bos[i].flags &= ~BO_LOCKED; + } + } +} + static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit) { int i, ret = 0; @@ -260,10 +295,24 @@ static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit) if (no_implicit) continue;
- ret = msm_gem_sync_object(&msm_obj->base, submit->ring->fctx, - write); - if (ret) - break; + if (write) { + /* + * Save the per buffer shared and exclusive fences for + * the scheduler thread to wait on. + * + * Note: The memory allocated for the storing the + * shared fences will be released when the scheduler + * is done waiting on all the saved fences. + */ + ret = reservation_object_get_fences_rcu(msm_obj->resv, + &submit->bos[i].excl, + &submit->bos[i].nr_shared, + &submit->bos[i].shared); + if (ret) + return ret; + } else + submit->bos[i].excl = + reservation_object_get_excl_rcu(msm_obj->resv); }
return ret; @@ -425,7 +474,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_file_private *ctx = file->driver_priv; struct msm_gem_submit *submit; struct msm_gpu *gpu = priv->gpu; - struct dma_fence *in_fence = NULL; struct sync_file *sync_file = NULL; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; @@ -457,32 +505,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
ring = gpu->rb[queue->prio];
- if (args->flags & MSM_SUBMIT_FENCE_FD_IN) { - in_fence = sync_file_get_fence(args->fence_fd); - - if (!in_fence) - return -EINVAL; - - /* - * Wait if the fence is from a foreign context, or if the fence - * array contains any fence from a foreign context. - */ - if (!dma_fence_match_context(in_fence, ring->fctx->context)) { - ret = dma_fence_wait(in_fence, true); - if (ret) - return ret; - } - } - - ret = mutex_lock_interruptible(&dev->struct_mutex); - if (ret) - return ret; - if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { out_fence_fd = get_unused_fd_flags(O_CLOEXEC); if (out_fence_fd < 0) { ret = out_fence_fd; - goto out_unlock; + goto out_err; } }
@@ -490,7 +517,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, args->nr_cmds, ctx); if (!submit) { ret = -ENOMEM; - goto out_unlock; + goto out_err; + } + + if (args->flags & MSM_SUBMIT_FENCE_FD_IN) { + submit->in_fence = sync_file_get_fence(args->fence_fd); + + if (!submit->in_fence) { + ret = -EINVAL; + goto out; + } }
if (args->flags & MSM_SUBMIT_SUDO) @@ -573,28 +609,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
submit->nr_cmds = i;
- msm_gpu_submit(submit); - if (IS_ERR(submit->hw_fence)) { - ret = PTR_ERR(submit->hw_fence); - submit->hw_fence = NULL; + ret = msm_sched_job_init(&submit->sched_job); + if (ret) goto out; - }
- /* - * No protection should be okay here since this is protected by the big - * GPU lock. - */ - submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr, - submit->hw_fence, 0, INT_MAX, GFP_KERNEL); - if (submit->out_fence_id < 0) { - ret = -ENOMEM; - goto out; - } + submit_attach_fences_and_unlock(submit);
args->fence = submit->out_fence_id;
if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - sync_file = sync_file_create(submit->hw_fence); + sync_file = sync_file_create(submit->out_fence); if (!sync_file) { ret = -ENOMEM; goto out; @@ -604,14 +628,22 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, }
out: - if (in_fence) - dma_fence_put(in_fence); + /* + * Clean up the submit bits and pieces which are not needed beyond this + * function context + */ submit_cleanup(submit); - if (ret) + + if (!ret) + /* + * If we reached here, then all is well. Push the job to the + * scheduler + */ + msm_sched_push_job(submit); + else msm_gem_submit_free(submit); -out_unlock: +out_err: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd); - mutex_unlock(&dev->struct_mutex); return ret; } diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index cd5fe49..481a55c 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -295,12 +295,17 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring, find_submit(struct msm_ringbuffer *ring, uint32_t fence) { struct msm_gem_submit *submit; + unsigned long flags;
- WARN_ON(!mutex_is_locked(&ring->gpu->dev->struct_mutex)); + spin_lock_irqsave(&ring->lock, flags);
list_for_each_entry(submit, &ring->submits, node) - if (submit->seqno == fence) + if (submit->seqno == fence) { + spin_unlock_irqrestore(&ring->lock, flags); return submit; + } + + spin_unlock_irqrestore(&ring->lock, flags);
return NULL; } @@ -316,6 +321,12 @@ static void recover_worker(struct work_struct *work) struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu); int i;
+ submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1); + return msm_sched_gpu_recovery(gpu, submit); + + /* + * The unused code below will be removed in a subsequent patch + */ mutex_lock(&dev->struct_mutex);
dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name); @@ -591,11 +602,36 @@ static void retire_worker(struct work_struct *work) mutex_unlock(&dev->struct_mutex); }
-/* call from irq handler to schedule work to retire bo's */ +static void signal_hw_fences(struct msm_ringbuffer *ring) +{ + unsigned long flags; + struct msm_gem_submit *submit, *tmp; + + spin_lock_irqsave(&ring->lock, flags); + + list_for_each_entry_safe(submit, tmp, &ring->submits, node) { + if (submit->seqno > ring->memptrs->fence) + break; + + list_del(&submit->node); + + dma_fence_signal(submit->hw_fence); + } + + spin_unlock_irqrestore(&ring->lock, flags); +} + +/* + * Called from the IRQ context to signal hardware fences for the completed + * submits + */ void msm_gpu_retire(struct msm_gpu *gpu) { - struct msm_drm_private *priv = gpu->dev->dev_private; - queue_work(priv->wq, &gpu->retire_work); + int i; + + for (i = 0; i < gpu->nr_rings; i++) + signal_hw_fences(gpu->rb[i]); + update_sw_cntrs(gpu); }
@@ -606,25 +642,28 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; struct msm_ringbuffer *ring = submit->ring; + unsigned long flags; int i;
- WARN_ON(!mutex_is_locked(&dev->struct_mutex)); - submit->hw_fence = msm_fence_alloc(ring->fctx); if (IS_ERR(submit->hw_fence)) return submit->hw_fence;
- pm_runtime_get_sync(&gpu->pdev->dev); - - msm_gpu_hw_init(gpu); + spin_lock_irqsave(&ring->lock, flags); + list_add_tail(&submit->node, &ring->submits); + spin_unlock_irqrestore(&ring->lock, flags);
submit->seqno = ++ring->seqno;
- list_add_tail(&submit->node, &ring->submits); + update_sw_cntrs(gpu);
- msm_rd_dump_submit(priv->rd, submit, NULL); + mutex_lock(&dev->struct_mutex);
- update_sw_cntrs(gpu); + pm_runtime_get_sync(&gpu->pdev->dev); + + msm_gpu_hw_init(gpu); + + msm_rd_dump_submit(priv->rd, submit, NULL);
for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj; @@ -634,16 +673,13 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) */ WARN_ON(is_active(msm_obj) && (msm_obj->gpu != gpu));
- if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE) - msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->hw_fence); - else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ) - msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->hw_fence); + msm_gem_move_to_active(&msm_obj->base, gpu); }
gpu->funcs->submit(gpu, submit, submit->ctx); priv->lastctx = submit->ctx;
- hangcheck_timer_reset(gpu); + mutex_unlock(&dev->struct_mutex);
return submit->hw_fence; } diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index dd55979..3bd1920 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -173,6 +173,8 @@ struct msm_gpu_submitqueue { int faults; struct list_head node; struct kref ref; + struct msm_gpu *gpu; + struct drm_sched_entity sched_entity; };
static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data) diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 0889766..ef2bf10 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -56,9 +56,14 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
snprintf(ring->name, sizeof(ring->name), "msm-gpu-ring-%d", ring->id);
+ if (msm_sched_init(&ring->sched, ring->name) != 0) { + ret = -EINVAL; + goto fail; + } ring->fctx = msm_fence_context_alloc(gpu->dev, ring->name);
idr_init(&ring->fence_idr); + mutex_init(&ring->fence_idr_lock);
return ring;
@@ -74,6 +79,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
msm_fence_context_free(ring->fctx);
+ msm_sched_cleanup(&ring->sched); if (ring->bo) { msm_gem_put_iova(ring->bo, ring->gpu->aspace); msm_gem_put_vaddr(ring->bo); @@ -81,6 +87,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring) }
idr_destroy(&ring->fence_idr); + mutex_destroy(&ring->fence_idr_lock);
kfree(ring); } diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index 523373b..10ae4a8 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -19,6 +19,7 @@ #define __MSM_RINGBUFFER_H__
#include "msm_drv.h" +#include "msm_sched.h"
#define rbmemptr(ring, member) \ ((ring)->memptrs_iova + offsetof(struct msm_rbmemptrs, member)) @@ -42,7 +43,9 @@ struct msm_ringbuffer { uint64_t memptrs_iova; struct msm_fence_context *fctx; struct idr fence_idr; + struct mutex fence_idr_lock; spinlock_t lock; + struct drm_gpu_scheduler sched; };
struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, diff --git a/drivers/gpu/drm/msm/msm_sched.c b/drivers/gpu/drm/msm/msm_sched.c new file mode 100644 index 0000000..8b805ce --- /dev/null +++ b/drivers/gpu/drm/msm/msm_sched.c @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */ + +#include "msm_gpu.h" +#include "msm_gem.h" +#include "msm_sched.h" + +#include <linux/string_helpers.h> +#include <linux/kthread.h> + +struct msm_gem_submit *to_msm_gem_submit(struct drm_sched_job *sched_job) +{ + return container_of(sched_job, struct msm_gem_submit, sched_job); +} + +/* + * Go through the bo's one by one and return the previously saved shared and + * exclusive fences. If the scheduler gets the fence, then it takes care of + * releasing the reference on the fence. + */ +static struct dma_fence *msm_sched_dependency(struct drm_sched_job *sched_job, + struct drm_sched_entity *entity) +{ + struct msm_gem_submit *submit = to_msm_gem_submit(sched_job); + struct dma_fence *fence; + int i; + + if (submit->in_fence) { + fence = submit->in_fence; + submit->in_fence = NULL; + + if (!dma_fence_is_signaled(fence)) + return fence; + + dma_fence_put(fence); + } + + /* Return the previously saved shared and exclusive fences if any */ + for (i = 0; i < submit->nr_bos; i++) { + int j; + + if (submit->bos[i].excl) { + fence = submit->bos[i].excl; + submit->bos[i].excl = NULL; + + if (!dma_fence_is_signaled(fence)) + return fence; + + dma_fence_put(fence); + } + + for (j = 0; j < submit->bos[i].nr_shared; j++) { + if (!submit->bos[i].shared[j]) + continue; + + fence = submit->bos[i].shared[j]; + submit->bos[i].shared[j] = NULL; + + if (!dma_fence_is_signaled(fence)) + return fence; + + dma_fence_put(fence); + } + + kfree(submit->bos[i].shared); + submit->bos[i].nr_shared = 0; + submit->bos[i].shared = NULL; + } + + return NULL; +} + +static struct dma_fence *msm_sched_run_job(struct drm_sched_job 
*sched_job) +{ + struct msm_gem_submit *submit = to_msm_gem_submit(sched_job); + + return !sched_job->s_fence->finished.error ? + msm_gpu_submit(submit) : NULL; +} + +static void dump_bad_submit(struct msm_gem_submit *submit) +{ + struct msm_gpu *gpu = submit->gpu; + struct drm_device *dev = gpu->dev; + struct msm_drm_private *priv = dev->dev_private; + struct task_struct *task; + char task_name[32] = {0}; + + rcu_read_lock(); + task = pid_task(submit->pid, PIDTYPE_PID); + if (task) { + char *cmd; + + cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL); + + dev_err(&gpu->pdev->dev, "%s: offending task: %s (%s)\n", + gpu->name, task->comm, cmd); + + snprintf(task_name, sizeof(task_name), + "offending task: %s (%s)", task->comm, cmd); + + kfree(cmd); + } + rcu_read_unlock(); + + mutex_lock(&gpu->dev->struct_mutex); + msm_rd_dump_submit(priv->hangrd, submit, task_name); + mutex_unlock(&gpu->dev->struct_mutex); +} + +void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit) +{ + int i; + static atomic_t gpu_recovery; + + /* Check if a GPU recovery is already in progress */ + if (!(atomic_cmpxchg(&gpu_recovery, 0, 1) == 0)) + return; + + /* + * Pause the schedulers so we don't get new requests while the recovery + * is in progress + */ + for (i = 0; i < gpu->nr_rings; i++) + kthread_park(gpu->rb[i]->sched.thread); + + /* + * Disable interrupts so we don't get interrupted while the recovery is + * in progress + */ + disable_irq(gpu->irq); + + dev_err(&gpu->pdev->dev, "%s: hangcheck recover!\n", gpu->name); + + if (submit) + dump_bad_submit(submit); + + /* + * For each ring, we do the following + * a) Inform the scheduler to drop the hardware fence for the + * submissions on its mirror list + * b) The submit(job) list on the ring is not useful anymore + * as we are going for a gpu reset, so we empty the submit list + */ + for (i = 0; i < gpu->nr_rings; i++) { + struct msm_gem_submit *job, *tmp; + unsigned long flags; + struct msm_ringbuffer *ring = 
gpu->rb[i]; + + /* a) Release the hardware fences */ + drm_sched_hw_job_reset(&ring->sched, + (submit && submit->ring == ring) ? + &submit->sched_job : NULL); + + /* b) Free up the ring submit list */ + spin_lock_irqsave(&ring->lock, flags); + + list_for_each_entry_safe(job, tmp, &ring->submits, node) + list_del(&job->node); + + spin_unlock_irqrestore(&ring->lock, flags); + } + + /* Power cycle and reset the GPU back to init state */ + mutex_lock(&gpu->dev->struct_mutex); + + pm_runtime_get_sync(&gpu->pdev->dev); + gpu->funcs->recover(gpu); + pm_runtime_put_sync(&gpu->pdev->dev); + + mutex_unlock(&gpu->dev->struct_mutex); + + /* Re-enable the interrupts once the gpu reset is complete */ + enable_irq(gpu->irq); + + /* + * The GPU is hopefully back in good shape, now request the schedulers + * to replay its mirror list, starting with the scheduler on the highest + * priority ring + */ + for (i = 0; i < gpu->nr_rings; i++) { + drm_sched_job_recovery(&gpu->rb[i]->sched); + kthread_unpark(gpu->rb[i]->sched.thread); + } + + atomic_set(&gpu_recovery, 0); +} + +static void msm_sched_timedout_job(struct drm_sched_job *bad_job) +{ + struct msm_gem_submit *submit = to_msm_gem_submit(bad_job); + struct msm_gpu *gpu = submit->gpu; + struct msm_ringbuffer *ring = submit->ring; + + /* + * If this submission completed in the mean time, then the timeout is + * spurious + */ + if (submit->seqno <= submit->ring->memptrs->fence) + return; + + dev_err(&gpu->pdev->dev, "%s: hangcheck detected gpu lockup rb %d!\n", + gpu->name, ring->id); + dev_err(&gpu->pdev->dev, "%s: completed fence: %u\n", + gpu->name, ring->memptrs->fence); + dev_err(&gpu->pdev->dev, "%s: submitted fence: %u\n", + gpu->name, ring->seqno); + + msm_sched_gpu_recovery(gpu, submit); +} + +static void msm_sched_free_job(struct drm_sched_job *sched_job) +{ + struct msm_gem_submit *submit = to_msm_gem_submit(sched_job); + struct msm_gpu *gpu = submit->gpu; + int i; + + mutex_lock(&gpu->dev->struct_mutex); + + for (i = 0; i 
< submit->nr_bos; i++) { + struct msm_gem_object *msm_obj = submit->bos[i].obj; + + msm_gem_move_to_inactive(&msm_obj->base); + } + + mutex_unlock(&gpu->dev->struct_mutex); + + pm_runtime_mark_last_busy(&gpu->pdev->dev); + pm_runtime_put_autosuspend(&gpu->pdev->dev); + + msm_gem_submit_free(submit); +} + +static const struct drm_sched_backend_ops msm_sched_ops = { + .dependency = msm_sched_dependency, + .run_job = msm_sched_run_job, + .timedout_job = msm_sched_timedout_job, + .free_job = msm_sched_free_job, +}; + +int msm_sched_job_init(struct drm_sched_job *sched_job) +{ + int ret; + struct msm_gem_submit *submit = to_msm_gem_submit(sched_job); + struct msm_ringbuffer *ring = submit->ring; + + ret = drm_sched_job_init(sched_job, &ring->sched, + &submit->queue->sched_entity, submit->ctx); + if (ret) + return ret; + + submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished); + + mutex_lock(&ring->fence_idr_lock); + submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr, + submit->out_fence, 0, INT_MAX, + GFP_KERNEL); + mutex_unlock(&ring->fence_idr_lock); + + if (submit->out_fence_id < 0) { + /* + * TODO: The scheduler's finished fence leaks here since the + * job will not be pushed to the queue. Need to update scheduler + * to fix this cleanly(?) + */ + dma_fence_put(submit->out_fence); + submit->out_fence = NULL; + return -ENOMEM; + } + + return 0; +} + +void msm_sched_push_job(struct msm_gem_submit *submit) +{ + return drm_sched_entity_push_job(&submit->sched_job, + &submit->queue->sched_entity); +} + +int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name) +{ + return drm_sched_init(sched, &msm_sched_ops, 4, 0, + msecs_to_jiffies(500), name); +} + +void msm_sched_cleanup(struct drm_gpu_scheduler *sched) +{ + drm_sched_fini(sched); +} + +/* + * Create a new entity on the schedulers normal priority runqueue. 
For now we + * choose to always use the normal run queue priority, but in future its + * possible with some extension to the msm drm interface, to create the + * submitqueue(entities) of different priorities on the same ring, thereby + * allowing to priortize and schedule submitqueues belonging to the same ring + */ +int msm_sched_entity_init(struct msm_gpu *gpu, + struct drm_sched_entity *sched_entity, int prio) +{ + struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched; + + return drm_sched_entity_init(sched, sched_entity, + &sched->sched_rq[DRM_SCHED_PRIORITY_NORMAL], NULL); +} + +void msm_sched_entity_cleanup(struct msm_gpu *gpu, + struct drm_sched_entity *sched_entity, int prio) +{ + struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched; + + drm_sched_entity_fini(sched, sched_entity); +} diff --git a/drivers/gpu/drm/msm/msm_sched.h b/drivers/gpu/drm/msm/msm_sched.h new file mode 100644 index 0000000..6ab2728 --- /dev/null +++ b/drivers/gpu/drm/msm/msm_sched.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, The Linux Foundation. All rights reserved. 
*/ + +#ifndef __MSM_SCHED_H__ +#define __MSM_SCHED_H__ + +#include <drm/gpu_scheduler.h> + +int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name); +void msm_sched_cleanup(struct drm_gpu_scheduler *sched); +int msm_sched_entity_init(struct msm_gpu *gpu, struct drm_sched_entity *queue, + int prio); +void msm_sched_entity_cleanup(struct msm_gpu *gpu, + struct drm_sched_entity *queue, int prio); +int msm_sched_job_init(struct drm_sched_job *sched_job); +void msm_sched_push_job(struct msm_gem_submit *submit); +void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit); +#endif /* __MSM_SCHED_H__ */ diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index 325da44..b6eb13e 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -19,6 +19,7 @@ void msm_submitqueue_destroy(struct kref *kref) struct msm_gpu_submitqueue *queue = container_of(kref, struct msm_gpu_submitqueue, ref);
+ msm_sched_entity_cleanup(queue->gpu, &queue->sched_entity, queue->prio); kfree(queue); }
@@ -65,6 +66,7 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, { struct msm_drm_private *priv = drm->dev_private; struct msm_gpu_submitqueue *queue; + struct msm_gpu *gpu = priv->gpu;
if (!ctx) return -ENODEV; @@ -77,13 +79,16 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, kref_init(&queue->ref); queue->flags = flags;
- if (priv->gpu) { - if (prio >= priv->gpu->nr_rings) { - kfree(queue); - return -EINVAL; - } + if (gpu) { + if (prio >= gpu->nr_rings) + goto fail; + + if (msm_sched_entity_init(priv->gpu, &queue->sched_entity, + prio)) + goto fail;
queue->prio = prio; + queue->gpu = gpu; }
write_lock(&ctx->queuelock); @@ -98,6 +103,9 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, write_unlock(&ctx->queuelock);
return 0; +fail: + kfree(queue); + return -EINVAL; }
int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx)
On Mon, Oct 01, 2018 at 06:01:41PM +0530, Sharat Masetty wrote:
This patch hooks up the DRM gpu scheduler to the msm DRM driver. The most noticeable changes to the driver are as follows. The submit path is split into two parts, in the user context the submit(job) is created and added to one of the entity's scheduler run queues. The scheduler then tries to drain the queue by submitting the jobs to the hardware to act upon. The submit job sits on the scheduler queue until all the dependent fences are waited upon successfully.
We have one scheduler instance per ring. The submitqueues will host a scheduler entity object. This entity will be mapped to the scheduler's default runqueue. This should be good for now, but in future it is possible to extend the API to allow for scheduling amongst the submitqueues on the same ring.
With this patch the role of the struct_mutex lock has been greatly reduced in scope in the submit path, most notably when actually putting the job on the ringbuffer. This should enable increased parallelism in the driver, which should hopefully translate to better overall performance.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org
drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/Makefile | 3 +- drivers/gpu/drm/msm/msm_drv.h | 3 +- drivers/gpu/drm/msm/msm_gem.c | 8 +- drivers/gpu/drm/msm/msm_gem.h | 6 + drivers/gpu/drm/msm/msm_gem_submit.c | 138 +++++++++------ drivers/gpu/drm/msm/msm_gpu.c | 72 ++++++-- drivers/gpu/drm/msm/msm_gpu.h | 2 + drivers/gpu/drm/msm/msm_ringbuffer.c | 7 + drivers/gpu/drm/msm/msm_ringbuffer.h | 3 + drivers/gpu/drm/msm/msm_sched.c | 313 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/msm/msm_sched.h | 18 ++ drivers/gpu/drm/msm/msm_submitqueue.c | 18 +- 13 files changed, 507 insertions(+), 85 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_sched.c create mode 100644 drivers/gpu/drm/msm/msm_sched.h
diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig index 38cbde9..0d01810 100644 --- a/drivers/gpu/drm/msm/Kconfig +++ b/drivers/gpu/drm/msm/Kconfig @@ -15,6 +15,7 @@ config DRM_MSM select SND_SOC_HDMI_CODEC if SND_SOC select SYNC_FILE select PM_OPP
- select DRM_SCHED default y help DRM/KMS driver for MSM/snapdragon.
diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index cd40c05..71ed921 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -60,7 +60,8 @@ msm-y := \ msm_perf.o \ msm_rd.o \ msm_ringbuffer.o \
- msm_submitqueue.o
- msm_submitqueue.o \
- msm_sched.o
msm-$(CONFIG_DEBUG_FS) += adreno/a5xx_debugfs.o
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index b2da1fb..e461a9c 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -213,8 +213,7 @@ struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev, int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); int msm_gem_sync_object(struct drm_gem_object *obj, struct msm_fence_context *fctx, bool exclusive); -void msm_gem_move_to_active(struct drm_gem_object *obj,
struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence);
+void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu); void msm_gem_move_to_inactive(struct drm_gem_object *obj); int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); int msm_gem_cpu_fini(struct drm_gem_object *obj); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index f583bb4..7a12f30 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -663,16 +663,12 @@ int msm_gem_sync_object(struct drm_gem_object *obj, return 0; }
-void msm_gem_move_to_active(struct drm_gem_object *obj,
struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence)
+void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu) { struct msm_gem_object *msm_obj = to_msm_bo(obj); WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); msm_obj->gpu = gpu;
- if (exclusive)
reservation_object_add_excl_fence(msm_obj->resv, fence);
- else
reservation_object_add_shared_fence(msm_obj->resv, fence);
- list_del_init(&msm_obj->mm_list); list_add_tail(&msm_obj->mm_list, &gpu->active_list);
} diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index cae3aa5..8c6269f 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -20,6 +20,7 @@
#include <linux/kref.h> #include <linux/reservation.h> +#include <drm/gpu_scheduler.h> #include "msm_drv.h"
/* Additional internal-use only BO flags: */ @@ -136,6 +137,7 @@ enum msm_gem_lock {
- lasts for the duration of the submit-ioctl.
*/ struct msm_gem_submit {
- struct drm_sched_job sched_job; struct drm_device *dev; struct msm_file_private *ctx; struct msm_gpu *gpu;
@@ -144,6 +146,7 @@ struct msm_gem_submit { struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ struct dma_fence *hw_fence;
- struct dma_fence *in_fence, *out_fence; int out_fence_id; struct msm_gpu_submitqueue *queue; struct pid *pid; /* submitting process */
@@ -162,6 +165,9 @@ struct msm_gem_submit { uint32_t flags; struct msm_gem_object *obj; uint64_t iova;
struct dma_fence *excl;
uint32_t nr_shared;
} bos[0];struct dma_fence **shared;
};
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 7931c2a..dff945c 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -65,23 +65,37 @@ void msm_gem_submit_free(struct msm_gem_submit *submit) { int i;
- mutex_lock(&submit->ring->fence_idr_lock); idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
- dma_fence_put(submit->hw_fence);
- mutex_unlock(&submit->ring->fence_idr_lock);
Why are we using a mutex for this guy - this seems like a job for a spin if I ever saw one. Maybe even a rw spin depending on how often that idr gets queried.
for (i = 0; i < submit->nr_bos; i++) {
int j;
struct msm_gem_object *msm_obj = submit->bos[i].obj;
if (submit->bos[i].flags & BO_PINNED) msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace);
drm_gem_object_put(&msm_obj->base);
drm_gem_object_put_unlocked(&msm_obj->base);
/*
* While we are at it, clear the saved exclusive and shared
* fences if any
*/
dma_fence_put(submit->bos[i].excl);
for (j = 0; j < submit->bos[i].nr_shared; j++)
dma_fence_put(submit->bos[i].shared[j]);
}kfree(submit->bos[i].shared);
- list_del(&submit->node); put_pid(submit->pid); msm_submitqueue_put(submit->queue);
- dma_fence_put(submit->in_fence);
- dma_fence_put(submit->out_fence); kfree(submit);
}
@@ -238,6 +252,27 @@ static int submit_lock_objects(struct msm_gem_submit *submit) return ret; }
+static void submit_attach_fences_and_unlock(struct msm_gem_submit *submit) +{
- int i;
- for (i = 0; i < submit->nr_bos; i++) {
struct msm_gem_object *msm_obj = submit->bos[i].obj;
if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
reservation_object_add_excl_fence(msm_obj->resv,
submit->out_fence);
else
reservation_object_add_shared_fence(msm_obj->resv,
submit->out_fence);
if (submit->bos[i].flags & BO_LOCKED) {
ww_mutex_unlock(&msm_obj->resv->lock);
submit->bos[i].flags &= ~BO_LOCKED;
}
- }
+}
static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit) { int i, ret = 0; @@ -260,10 +295,24 @@ static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit) if (no_implicit) continue;
ret = msm_gem_sync_object(&msm_obj->base, submit->ring->fctx,
write);
if (ret)
break;
if (write) {
/*
* Save the per buffer shared and exclusive fences for
* the scheduler thread to wait on.
*
* Note: The memory allocated for the storing the
* shared fences will be released when the scheduler
* is done waiting on all the saved fences.
*/
ret = reservation_object_get_fences_rcu(msm_obj->resv,
&submit->bos[i].excl,
&submit->bos[i].nr_shared,
&submit->bos[i].shared);
if (ret)
return ret;
I think this can just be a return ret;
} else
No need for the else
submit->bos[i].excl =
reservation_object_get_excl_rcu(msm_obj->resv);
}
return ret;
This can be a return 0;
@@ -425,7 +474,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_file_private *ctx = file->driver_priv; struct msm_gem_submit *submit; struct msm_gpu *gpu = priv->gpu;
- struct dma_fence *in_fence = NULL; struct sync_file *sync_file = NULL; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring;
@@ -457,32 +505,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
ring = gpu->rb[queue->prio];
- if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
in_fence = sync_file_get_fence(args->fence_fd);
if (!in_fence)
return -EINVAL;
/*
* Wait if the fence is from a foreign context, or if the fence
* array contains any fence from a foreign context.
*/
if (!dma_fence_match_context(in_fence, ring->fctx->context)) {
ret = dma_fence_wait(in_fence, true);
if (ret)
return ret;
}
- }
- ret = mutex_lock_interruptible(&dev->struct_mutex);
- if (ret)
return ret;
- if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { out_fence_fd = get_unused_fd_flags(O_CLOEXEC); if (out_fence_fd < 0) { ret = out_fence_fd;
goto out_unlock;
} }goto out_err;
@@ -490,7 +517,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, args->nr_cmds, ctx); if (!submit) { ret = -ENOMEM;
goto out_unlock;
goto out_err;
}
if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
submit->in_fence = sync_file_get_fence(args->fence_fd);
if (!submit->in_fence) {
ret = -EINVAL;
goto out;
}
}
if (args->flags & MSM_SUBMIT_SUDO)
@@ -573,28 +609,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
submit->nr_cmds = i;
- msm_gpu_submit(submit);
- if (IS_ERR(submit->hw_fence)) {
ret = PTR_ERR(submit->hw_fence);
submit->hw_fence = NULL;
- ret = msm_sched_job_init(&submit->sched_job);
- if (ret) goto out;
}
/*
* No protection should be okay here since this is protected by the big
* GPU lock.
*/
submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
submit->hw_fence, 0, INT_MAX, GFP_KERNEL);
if (submit->out_fence_id < 0) {
ret = -ENOMEM;
goto out;
}
submit_attach_fences_and_unlock(submit);
args->fence = submit->out_fence_id;
if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
sync_file = sync_file_create(submit->hw_fence);
if (!sync_file) { ret = -ENOMEM; goto out;sync_file = sync_file_create(submit->out_fence);
@@ -604,14 +628,22 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, }
out:
- if (in_fence)
dma_fence_put(in_fence);
- /*
* Clean up the submit bits and pieces which are not needed beyond this
* function context
submit_cleanup(submit);*/
- if (ret)
- if (!ret)
/*
* If we reached here, then all is well. Push the job to the
* scheduler
*/
msm_sched_push_job(submit);
- else msm_gem_submit_free(submit);
-out_unlock: +out_err: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd);
- mutex_unlock(&dev->struct_mutex); return ret;
} diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index cd5fe49..481a55c 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -295,12 +295,17 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring, find_submit(struct msm_ringbuffer *ring, uint32_t fence) { struct msm_gem_submit *submit;
- unsigned long flags;
- WARN_ON(!mutex_is_locked(&ring->gpu->dev->struct_mutex));
spin_lock_irqsave(&ring->lock, flags);
list_for_each_entry(submit, &ring->submits, node)
if (submit->seqno == fence)
if (submit->seqno == fence) {
spin_unlock_irqrestore(&ring->lock, flags); return submit;
}
spin_unlock_irqrestore(&ring->lock, flags);
return NULL;
} @@ -316,6 +321,12 @@ static void recover_worker(struct work_struct *work) struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu); int i;
- submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1);
- return msm_sched_gpu_recovery(gpu, submit);
- /*
* The unused code below will be removed in a subsequent patch
*/
Why not now?
mutex_lock(&dev->struct_mutex);
dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name); @@ -591,11 +602,36 @@ static void retire_worker(struct work_struct *work) mutex_unlock(&dev->struct_mutex); }
-/* call from irq handler to schedule work to retire bo's */ +static void signal_hw_fences(struct msm_ringbuffer *ring) +{
- unsigned long flags;
- struct msm_gem_submit *submit, *tmp;
- spin_lock_irqsave(&ring->lock, flags);
- list_for_each_entry_safe(submit, tmp, &ring->submits, node) {
if (submit->seqno > ring->memptrs->fence)
break;
list_del(&submit->node);
dma_fence_signal(submit->hw_fence);
- }
- spin_unlock_irqrestore(&ring->lock, flags);
+}
+/*
- Called from the IRQ context to signal hardware fences for the completed
- submits
- */
void msm_gpu_retire(struct msm_gpu *gpu) {
- struct msm_drm_private *priv = gpu->dev->dev_private;
- queue_work(priv->wq, &gpu->retire_work);
- int i;
- for (i = 0; i < gpu->nr_rings; i++)
signal_hw_fences(gpu->rb[i]);
- update_sw_cntrs(gpu);
}
@@ -606,25 +642,28 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; struct msm_ringbuffer *ring = submit->ring;
- unsigned long flags; int i;
WARN_ON(!mutex_is_locked(&dev->struct_mutex));
submit->hw_fence = msm_fence_alloc(ring->fctx); if (IS_ERR(submit->hw_fence)) return submit->hw_fence;
pm_runtime_get_sync(&gpu->pdev->dev);
msm_gpu_hw_init(gpu);
- spin_lock_irqsave(&ring->lock, flags);
- list_add_tail(&submit->node, &ring->submits);
- spin_unlock_irqrestore(&ring->lock, flags); submit->seqno = ++ring->seqno;
- list_add_tail(&submit->node, &ring->submits);
- update_sw_cntrs(gpu);
I'm not sure if this needs the hardware to be up (it does check msm_gpu_active), but I don't think we should reorganize the order of these functions unless we need to.
- msm_rd_dump_submit(priv->rd, submit, NULL);
- mutex_lock(&dev->struct_mutex);
- update_sw_cntrs(gpu);
pm_runtime_get_sync(&gpu->pdev->dev);
msm_gpu_hw_init(gpu);
msm_rd_dump_submit(priv->rd, submit, NULL);
for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj;
@@ -634,16 +673,13 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) */ WARN_ON(is_active(msm_obj) && (msm_obj->gpu != gpu));
if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->hw_fence);
else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ)
msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->hw_fence);
msm_gem_move_to_active(&msm_obj->base, gpu);
}
gpu->funcs->submit(gpu, submit, submit->ctx); priv->lastctx = submit->ctx;
- hangcheck_timer_reset(gpu);
mutex_unlock(&dev->struct_mutex);
return submit->hw_fence;
} diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index dd55979..3bd1920 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -173,6 +173,8 @@ struct msm_gpu_submitqueue { int faults; struct list_head node; struct kref ref;
- struct msm_gpu *gpu;
- struct drm_sched_entity sched_entity;
};
static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data) diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 0889766..ef2bf10 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -56,9 +56,14 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
snprintf(ring->name, sizeof(ring->name), "msm-gpu-ring-%d", ring->id);
if (msm_sched_init(&ring->sched, ring->name) != 0) {
ret = -EINVAL;
goto fail;
} ring->fctx = msm_fence_context_alloc(gpu->dev, ring->name);
idr_init(&ring->fence_idr);
mutex_init(&ring->fence_idr_lock);
return ring;
@@ -74,6 +79,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
msm_fence_context_free(ring->fctx);
- msm_sched_cleanup(&ring->sched); if (ring->bo) { msm_gem_put_iova(ring->bo, ring->gpu->aspace); msm_gem_put_vaddr(ring->bo);
@@ -81,6 +87,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring) }
idr_destroy(&ring->fence_idr);
mutex_destroy(&ring->fence_idr_lock);
kfree(ring);
} diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index 523373b..10ae4a8 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -19,6 +19,7 @@ #define __MSM_RINGBUFFER_H__
#include "msm_drv.h" +#include "msm_sched.h"
#define rbmemptr(ring, member) \ ((ring)->memptrs_iova + offsetof(struct msm_rbmemptrs, member)) @@ -42,7 +43,9 @@ struct msm_ringbuffer { uint64_t memptrs_iova; struct msm_fence_context *fctx; struct idr fence_idr;
- struct mutex fence_idr_lock; spinlock_t lock;
- struct drm_gpu_scheduler sched;
};
struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, diff --git a/drivers/gpu/drm/msm/msm_sched.c b/drivers/gpu/drm/msm/msm_sched.c new file mode 100644 index 0000000..8b805ce --- /dev/null +++ b/drivers/gpu/drm/msm/msm_sched.c @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0 */
// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
+#include "msm_gpu.h" +#include "msm_gem.h" +#include "msm_sched.h"
+#include <linux/string_helpers.h> +#include <linux/kthread.h>
+struct msm_gem_submit *to_msm_gem_submit(struct drm_sched_job *sched_job) +{
- return container_of(sched_job, struct msm_gem_submit, sched_job);
+}
+/*
- Go through the bo's one by one and return the previously saved shared and
bo's -> bos
- exclusive fences. If the scheduler gets the fence, then it takes care of
- releasing the reference on the fence.
- */
+static struct dma_fence *msm_sched_dependency(struct drm_sched_job *sched_job,
struct drm_sched_entity *entity)
+{
- struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
- struct dma_fence *fence;
- int i;
- if (submit->in_fence) {
fence = submit->in_fence;
submit->in_fence = NULL;
if (!dma_fence_is_signaled(fence))
return fence;
dma_fence_put(fence);
- }
There might be a chance to consolidate some code here
static struct dma_fence *_get_fence(struct dma_fence **) { struct dma_fence *fence = *in; *in = NULL;
if (fence && !dma_fence_is_signaled(fence)) return fence;
dma_fence_put(fence); return NULL; }
fence = _fence(&submit->in_fence); if (fence) return fence;
- /* Return the previously saved shared and exclusive fences if any */
- for (i = 0; i < submit->nr_bos; i++) {
int j;
if (submit->bos[i].excl) {
fence = submit->bos[i].excl;
submit->bos[i].excl = NULL;
if (!dma_fence_is_signaled(fence))
return fence;
dma_fence_put(fence);
}
fence = _get_fence(&submit->bos[i].excl); if (fence) return fence;
for (j = 0; j < submit->bos[i].nr_shared; j++) {
if (!submit->bos[i].shared[j])
continue;
fence = submit->bos[i].shared[j];
submit->bos[i].shared[j] = NULL;
if (!dma_fence_is_signaled(fence))
return fence;
dma_fence_put(fence);
}
fence = _get_fence(&submit->bos[i].shared); if (fence) return fence;
kfree(submit->bos[i].shared);
submit->bos[i].nr_shared = 0;
submit->bos[i].shared = NULL;
- }
- return NULL;
+}
This is an interesting function - in order to avoid wasting cycles it needs to be ordered so that the fences most likely to still be unsignaled are checked first. If that is the case, I think that in_fence might be the least likely to still be pending, so you should check it last.
+static struct dma_fence *msm_sched_run_job(struct drm_sched_job *sched_job) +{
- struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
- return !sched_job->s_fence->finished.error ?
msm_gpu_submit(submit) : NULL;
+}
+static void dump_bad_submit(struct msm_gem_submit *submit) +{
- struct msm_gpu *gpu = submit->gpu;
- struct drm_device *dev = gpu->dev;
- struct msm_drm_private *priv = dev->dev_private;
- struct task_struct *task;
- char task_name[32] = {0};
- rcu_read_lock();
- task = pid_task(submit->pid, PIDTYPE_PID);
- if (task) {
char *cmd;
cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL);
This should be GFP_ATOMIC because we are in the rcu_read_lock(). There might be something else wrong with it too - I know we have this problem in the current kernel and I'm not sure if it is avoidable without losing the task name.
dev_err(&gpu->pdev->dev, "%s: offending task: %s (%s)\n",
gpu->name, task->comm, cmd);
snprintf(task_name, sizeof(task_name),
"offending task: %s (%s)", task->comm, cmd);
kfree(cmd);
- }
- rcu_read_unlock();
- mutex_lock(&gpu->dev->struct_mutex);
- msm_rd_dump_submit(priv->hangrd, submit, task_name);
- mutex_unlock(&gpu->dev->struct_mutex);
+}
+void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit) +{
- int i;
- static atomic_t gpu_recovery;
- /* Check if a GPU recovery is already in progress */
- if (!(atomic_cmpxchg(&gpu_recovery, 0, 1) == 0))
return;
- /*
* Pause the schedulers so we don't get new requests while the recovery
* is in progress
*/
- for (i = 0; i < gpu->nr_rings; i++)
kthread_park(gpu->rb[i]->sched.thread);
- /*
* Disable interrupts so we don't get interrupted while the recovery is
* in progress
*/
- disable_irq(gpu->irq);
- dev_err(&gpu->pdev->dev, "%s: hangcheck recover!\n", gpu->name);
I don't know if we still need this line? Recovery seems to be standard operating procedure these days.
- if (submit)
dump_bad_submit(submit);
- /*
* For each ring, we do the following
* a) Inform the scheduler to drop the hardware fence for the
* submissions on its mirror list
* b) The submit(job) list on the ring is not useful anymore
* as we are going for a gpu reset, so we empty the submit list
*/
- for (i = 0; i < gpu->nr_rings; i++) {
struct msm_gem_submit *job, *tmp;
unsigned long flags;
struct msm_ringbuffer *ring = gpu->rb[i];
/* a) Release the hardware fences */
drm_sched_hw_job_reset(&ring->sched,
(submit && submit->ring == ring) ?
&submit->sched_job : NULL);
/* b) Free up the ring submit list */
spin_lock_irqsave(&ring->lock, flags);
list_for_each_entry_safe(job, tmp, &ring->submits, node)
list_del(&job->node);
spin_unlock_irqrestore(&ring->lock, flags);
- }
- /* Power cycle and reset the GPU back to init state */
- mutex_lock(&gpu->dev->struct_mutex);
- pm_runtime_get_sync(&gpu->pdev->dev);
- gpu->funcs->recover(gpu);
- pm_runtime_put_sync(&gpu->pdev->dev);
- mutex_unlock(&gpu->dev->struct_mutex);
- /* Re-enable the interrupts once the gpu reset is complete */
- enable_irq(gpu->irq);
- /*
* The GPU is hopefully back in good shape, now request the schedulers
* to replay its mirror list, starting with the scheduler on the highest
* priority ring
*/
- for (i = 0; i < gpu->nr_rings; i++) {
drm_sched_job_recovery(&gpu->rb[i]->sched);
kthread_unpark(gpu->rb[i]->sched.thread);
- }
- atomic_set(&gpu_recovery, 0);
I think we need a smp_wmb() here to make sure everybody else sees the update.
+}
+static void msm_sched_timedout_job(struct drm_sched_job *bad_job) +{
- struct msm_gem_submit *submit = to_msm_gem_submit(bad_job);
- struct msm_gpu *gpu = submit->gpu;
- struct msm_ringbuffer *ring = submit->ring;
- /*
* If this submission completed in the mean time, then the timeout is
* spurious
*/
- if (submit->seqno <= submit->ring->memptrs->fence)
return;
- dev_err(&gpu->pdev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
gpu->name, ring->id);
- dev_err(&gpu->pdev->dev, "%s: completed fence: %u\n",
gpu->name, ring->memptrs->fence);
- dev_err(&gpu->pdev->dev, "%s: submitted fence: %u\n",
gpu->name, ring->seqno);
- msm_sched_gpu_recovery(gpu, submit);
+}
+static void msm_sched_free_job(struct drm_sched_job *sched_job) +{
- struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
- struct msm_gpu *gpu = submit->gpu;
- int i;
- mutex_lock(&gpu->dev->struct_mutex);
- for (i = 0; i < submit->nr_bos; i++) {
struct msm_gem_object *msm_obj = submit->bos[i].obj;
msm_gem_move_to_inactive(&msm_obj->base);
- }
- mutex_unlock(&gpu->dev->struct_mutex);
- pm_runtime_mark_last_busy(&gpu->pdev->dev);
- pm_runtime_put_autosuspend(&gpu->pdev->dev);
- msm_gem_submit_free(submit);
+}
+static const struct drm_sched_backend_ops msm_sched_ops = {
- .dependency = msm_sched_dependency,
- .run_job = msm_sched_run_job,
- .timedout_job = msm_sched_timedout_job,
- .free_job = msm_sched_free_job,
+};
+int msm_sched_job_init(struct drm_sched_job *sched_job) +{
- int ret;
- struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
- struct msm_ringbuffer *ring = submit->ring;
- ret = drm_sched_job_init(sched_job, &ring->sched,
&submit->queue->sched_entity, submit->ctx);
- if (ret)
return ret;
- submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished);
- mutex_lock(&ring->fence_idr_lock);
- submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
submit->out_fence, 0, INT_MAX,
GFP_KERNEL);
- mutex_unlock(&ring->fence_idr_lock);
- if (submit->out_fence_id < 0) {
/*
* TODO: The scheduler's finished fence leaks here since the
* job will not be pushed to the queue. Need to update scheduler
* to fix this cleanly(?)
*/
How would you propose to fix this?
dma_fence_put(submit->out_fence);
submit->out_fence = NULL;
return -ENOMEM;
- }
- return 0;
+}
+void msm_sched_push_job(struct msm_gem_submit *submit) +{
- return drm_sched_entity_push_job(&submit->sched_job,
&submit->queue->sched_entity);
+}
+int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name) +{
- return drm_sched_init(sched, &msm_sched_ops, 4, 0,
msecs_to_jiffies(500), name);
Okay, I see where the ring->name comes in. I don't like it but at least it is a relatively low cost option if you share them with the fence names. Still... sigh.
+}
+void msm_sched_cleanup(struct drm_gpu_scheduler *sched) +{
- drm_sched_fini(sched);
+}
I don't think this function is needed - I think we're smart enough to call drm_sched_fini directly.
+/*
- Create a new entity on the scheduler's normal priority runqueue. For now we
- choose to always use the normal run queue priority, but in future it is
- possible, with some extension to the msm drm interface, to create the
- submitqueue(entities) of different priorities on the same ring, thereby
- allowing us to prioritize and schedule submitqueues belonging to the same ring
- */
+int msm_sched_entity_init(struct msm_gpu *gpu,
struct drm_sched_entity *sched_entity, int prio)
+{
- struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
- return drm_sched_entity_init(sched, sched_entity,
&sched->sched_rq[DRM_SCHED_PRIORITY_NORMAL], NULL);
+}
+void msm_sched_entity_cleanup(struct msm_gpu *gpu,
struct drm_sched_entity *sched_entity, int prio)
+{
- struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
- drm_sched_entity_fini(sched, sched_entity);
+} diff --git a/drivers/gpu/drm/msm/msm_sched.h b/drivers/gpu/drm/msm/msm_sched.h new file mode 100644 index 0000000..6ab2728 --- /dev/null +++ b/drivers/gpu/drm/msm/msm_sched.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */
// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
+#ifndef __MSM_SCHED_H__ +#define __MSM_SCHED_H__
+#include <drm/gpu_scheduler.h>
+int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name); +void msm_sched_cleanup(struct drm_gpu_scheduler *sched); +int msm_sched_entity_init(struct msm_gpu *gpu, struct drm_sched_entity *queue,
int prio);
+void msm_sched_entity_cleanup(struct msm_gpu *gpu,
struct drm_sched_entity *queue, int prio);
+int msm_sched_job_init(struct drm_sched_job *sched_job); +void msm_sched_push_job(struct msm_gem_submit *submit); +void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit); +#endif /* __MSM_SCHED_H__ */ diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index 325da44..b6eb13e 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -19,6 +19,7 @@ void msm_submitqueue_destroy(struct kref *kref) struct msm_gpu_submitqueue *queue = container_of(kref, struct msm_gpu_submitqueue, ref);
- msm_sched_entity_cleanup(queue->gpu, &queue->sched_entity, queue->prio); kfree(queue);
}
@@ -65,6 +66,7 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, { struct msm_drm_private *priv = drm->dev_private; struct msm_gpu_submitqueue *queue;
struct msm_gpu *gpu = priv->gpu;
if (!ctx) return -ENODEV;
@@ -77,13 +79,16 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, kref_init(&queue->ref); queue->flags = flags;
- if (priv->gpu) {
if (prio >= priv->gpu->nr_rings) {
kfree(queue);
return -EINVAL;
}
if (gpu) {
if (prio >= gpu->nr_rings)
goto fail;
if (msm_sched_entity_init(priv->gpu, &queue->sched_entity,
prio))
goto fail;
queue->prio = prio;
queue->gpu = gpu;
}
write_lock(&ctx->queuelock);
@@ -98,6 +103,9 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, write_unlock(&ctx->queuelock);
return 0; +fail:
- kfree(queue);
- return -EINVAL;
}
int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx)
1.9.1
Thanks for the review comments, Jordan. I tried to answer a few queries — please check.
On 10/2/2018 12:32 AM, Jordan Crouse wrote:
On Mon, Oct 01, 2018 at 06:01:41PM +0530, Sharat Masetty wrote:
This patch hooks up the DRM gpu scheduler to the msm DRM driver. The most noticeable changes to the driver are as follows. The submit path is split into two parts: in the user context the submit (job) is created and added to one of the entity's scheduler run queues. The scheduler then tries to drain the queue by submitting the jobs to the hardware to act upon. The submit job sits on the scheduler queue until all the dependent fences are waited upon successfully.
We have one scheduler instance per ring. The submitqueues will host a scheduler entity object. This entity will be mapped to the scheduler's default runqueue. This should be good for now, but in future it is possible to extend the API to allow for scheduling amongst the submitqueues on the same ring.
With this patch the role of the struct_mutex lock has been greatly reduced in scope in the submit path; it is now needed mainly when actually putting the job on the ringbuffer. This should enable increased parallelism in the driver, which should hopefully translate to better overall performance.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org
drivers/gpu/drm/msm/Kconfig | 1 + drivers/gpu/drm/msm/Makefile | 3 +- drivers/gpu/drm/msm/msm_drv.h | 3 +- drivers/gpu/drm/msm/msm_gem.c | 8 +- drivers/gpu/drm/msm/msm_gem.h | 6 + drivers/gpu/drm/msm/msm_gem_submit.c | 138 +++++++++------ drivers/gpu/drm/msm/msm_gpu.c | 72 ++++++-- drivers/gpu/drm/msm/msm_gpu.h | 2 + drivers/gpu/drm/msm/msm_ringbuffer.c | 7 + drivers/gpu/drm/msm/msm_ringbuffer.h | 3 + drivers/gpu/drm/msm/msm_sched.c | 313 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/msm/msm_sched.h | 18 ++ drivers/gpu/drm/msm/msm_submitqueue.c | 18 +- 13 files changed, 507 insertions(+), 85 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_sched.c create mode 100644 drivers/gpu/drm/msm/msm_sched.h
diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig index 38cbde9..0d01810 100644 --- a/drivers/gpu/drm/msm/Kconfig +++ b/drivers/gpu/drm/msm/Kconfig @@ -15,6 +15,7 @@ config DRM_MSM select SND_SOC_HDMI_CODEC if SND_SOC select SYNC_FILE select PM_OPP
- select DRM_SCHED default y help DRM/KMS driver for MSM/snapdragon.
diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index cd40c05..71ed921 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -60,7 +60,8 @@ msm-y := \ msm_perf.o \ msm_rd.o \ msm_ringbuffer.o \
- msm_submitqueue.o
msm_submitqueue.o \
msm_sched.o
msm-$(CONFIG_DEBUG_FS) += adreno/a5xx_debugfs.o
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index b2da1fb..e461a9c 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -213,8 +213,7 @@ struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev, int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); int msm_gem_sync_object(struct drm_gem_object *obj, struct msm_fence_context *fctx, bool exclusive); -void msm_gem_move_to_active(struct drm_gem_object *obj,
struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence);
+void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu); void msm_gem_move_to_inactive(struct drm_gem_object *obj); int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); int msm_gem_cpu_fini(struct drm_gem_object *obj); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index f583bb4..7a12f30 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -663,16 +663,12 @@ int msm_gem_sync_object(struct drm_gem_object *obj, return 0; }
-void msm_gem_move_to_active(struct drm_gem_object *obj,
struct msm_gpu *gpu, bool exclusive, struct dma_fence *fence)
+void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu) { struct msm_gem_object *msm_obj = to_msm_bo(obj); WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); msm_obj->gpu = gpu;
- if (exclusive)
reservation_object_add_excl_fence(msm_obj->resv, fence);
- else
reservation_object_add_shared_fence(msm_obj->resv, fence);
- list_del_init(&msm_obj->mm_list); list_add_tail(&msm_obj->mm_list, &gpu->active_list); }
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index cae3aa5..8c6269f 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -20,6 +20,7 @@
#include <linux/kref.h> #include <linux/reservation.h> +#include <drm/gpu_scheduler.h> #include "msm_drv.h"
/* Additional internal-use only BO flags: */ @@ -136,6 +137,7 @@ enum msm_gem_lock {
- lasts for the duration of the submit-ioctl.
*/ struct msm_gem_submit {
- struct drm_sched_job sched_job; struct drm_device *dev; struct msm_file_private *ctx; struct msm_gpu *gpu;
@@ -144,6 +146,7 @@ struct msm_gem_submit { struct ww_acquire_ctx ticket; uint32_t seqno; /* Sequence number of the submit on the ring */ struct dma_fence *hw_fence;
- struct dma_fence *in_fence, *out_fence; int out_fence_id; struct msm_gpu_submitqueue *queue; struct pid *pid; /* submitting process */
@@ -162,6 +165,9 @@ struct msm_gem_submit { uint32_t flags; struct msm_gem_object *obj; uint64_t iova;
struct dma_fence *excl;
uint32_t nr_shared;
} bos[0]; };struct dma_fence **shared;
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 7931c2a..dff945c 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -65,23 +65,37 @@ void msm_gem_submit_free(struct msm_gem_submit *submit) { int i;
- mutex_lock(&submit->ring->fence_idr_lock); idr_remove(&submit->ring->fence_idr, submit->out_fence_id);
- dma_fence_put(submit->hw_fence);
- mutex_unlock(&submit->ring->fence_idr_lock);
Why are we using a mutex for this guy - this seems like a job for a spin if I ever saw one. Maybe even a rw spin depending on how often that idr gets queried.
for (i = 0; i < submit->nr_bos; i++) {
int j;
struct msm_gem_object *msm_obj = submit->bos[i].obj;
if (submit->bos[i].flags & BO_PINNED) msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace);
drm_gem_object_put(&msm_obj->base);
drm_gem_object_put_unlocked(&msm_obj->base);
/*
* While we are at it, clear the saved exclusive and shared
* fences if any
*/
dma_fence_put(submit->bos[i].excl);
for (j = 0; j < submit->bos[i].nr_shared; j++)
dma_fence_put(submit->bos[i].shared[j]);
}kfree(submit->bos[i].shared);
- list_del(&submit->node); put_pid(submit->pid); msm_submitqueue_put(submit->queue);
- dma_fence_put(submit->in_fence);
- dma_fence_put(submit->out_fence); kfree(submit); }
@@ -238,6 +252,27 @@ static int submit_lock_objects(struct msm_gem_submit *submit) return ret; }
+static void submit_attach_fences_and_unlock(struct msm_gem_submit *submit) +{
- int i;
- for (i = 0; i < submit->nr_bos; i++) {
struct msm_gem_object *msm_obj = submit->bos[i].obj;
if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
reservation_object_add_excl_fence(msm_obj->resv,
submit->out_fence);
else
reservation_object_add_shared_fence(msm_obj->resv,
submit->out_fence);
if (submit->bos[i].flags & BO_LOCKED) {
ww_mutex_unlock(&msm_obj->resv->lock);
submit->bos[i].flags &= ~BO_LOCKED;
}
- }
+}
- static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit) { int i, ret = 0;
@@ -260,10 +295,24 @@ static int submit_fence_sync(struct msm_gem_submit *submit, bool no_implicit) if (no_implicit) continue;
ret = msm_gem_sync_object(&msm_obj->base, submit->ring->fctx,
write);
if (ret)
break;
if (write) {
/*
* Save the per buffer shared and exclusive fences for
* the scheduler thread to wait on.
*
* Note: The memory allocated for the storing the
* shared fences will be released when the scheduler
* is done waiting on all the saved fences.
*/
ret = reservation_object_get_fences_rcu(msm_obj->resv,
&submit->bos[i].excl,
&submit->bos[i].nr_shared,
&submit->bos[i].shared);
if (ret)
return ret;
I think this can just be a return ret;
} else
No need for the else
submit->bos[i].excl =
reservation_object_get_excl_rcu(msm_obj->resv);
}
return ret;
This can be a return 0;
@@ -425,7 +474,6 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_file_private *ctx = file->driver_priv; struct msm_gem_submit *submit; struct msm_gpu *gpu = priv->gpu;
- struct dma_fence *in_fence = NULL; struct sync_file *sync_file = NULL; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring;
@@ -457,32 +505,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
ring = gpu->rb[queue->prio];
- if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
in_fence = sync_file_get_fence(args->fence_fd);
if (!in_fence)
return -EINVAL;
/*
* Wait if the fence is from a foreign context, or if the fence
* array contains any fence from a foreign context.
*/
if (!dma_fence_match_context(in_fence, ring->fctx->context)) {
ret = dma_fence_wait(in_fence, true);
if (ret)
return ret;
}
- }
- ret = mutex_lock_interruptible(&dev->struct_mutex);
- if (ret)
return ret;
- if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { out_fence_fd = get_unused_fd_flags(O_CLOEXEC); if (out_fence_fd < 0) { ret = out_fence_fd;
goto out_unlock;
} }goto out_err;
@@ -490,7 +517,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, args->nr_cmds, ctx); if (!submit) { ret = -ENOMEM;
goto out_unlock;
goto out_err;
}
if (args->flags & MSM_SUBMIT_FENCE_FD_IN) {
submit->in_fence = sync_file_get_fence(args->fence_fd);
if (!submit->in_fence) {
ret = -EINVAL;
goto out;
}
}
if (args->flags & MSM_SUBMIT_SUDO)
@@ -573,28 +609,16 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
submit->nr_cmds = i;
- msm_gpu_submit(submit);
- if (IS_ERR(submit->hw_fence)) {
ret = PTR_ERR(submit->hw_fence);
submit->hw_fence = NULL;
- ret = msm_sched_job_init(&submit->sched_job);
- if (ret) goto out;
}
/*
* No protection should be okay here since this is protected by the big
* GPU lock.
*/
submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
submit->hw_fence, 0, INT_MAX, GFP_KERNEL);
if (submit->out_fence_id < 0) {
ret = -ENOMEM;
goto out;
}
submit_attach_fences_and_unlock(submit);
args->fence = submit->out_fence_id;
if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) {
sync_file = sync_file_create(submit->hw_fence);
if (!sync_file) { ret = -ENOMEM; goto out;sync_file = sync_file_create(submit->out_fence);
@@ -604,14 +628,22 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, }
out:
- if (in_fence)
dma_fence_put(in_fence);
- /*
* Clean up the submit bits and pieces which are not needed beyond this
* function context
submit_cleanup(submit);*/
- if (ret)
- if (!ret)
/*
* If we reached here, then all is well. Push the job to the
* scheduler
*/
msm_sched_push_job(submit);
- else msm_gem_submit_free(submit);
-out_unlock: +out_err: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd);
- mutex_unlock(&dev->struct_mutex); return ret; }
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index cd5fe49..481a55c 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -295,12 +295,17 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring, find_submit(struct msm_ringbuffer *ring, uint32_t fence) { struct msm_gem_submit *submit;
- unsigned long flags;
- WARN_ON(!mutex_is_locked(&ring->gpu->dev->struct_mutex));
spin_lock_irqsave(&ring->lock, flags);
list_for_each_entry(submit, &ring->submits, node)
if (submit->seqno == fence)
if (submit->seqno == fence) {
spin_unlock_irqrestore(&ring->lock, flags); return submit;
}
spin_unlock_irqrestore(&ring->lock, flags);
return NULL; }
@@ -316,6 +321,12 @@ static void recover_worker(struct work_struct *work) struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu); int i;
- submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1);
- return msm_sched_gpu_recovery(gpu, submit);
- /*
* The unused code below will be removed in a subsequent patch
*/
Why not now?
I removed all the dead code as part of a separate patch "msm/drm: Remove unused code". I can either remove this comment from here or squash the other commit with this one.
mutex_lock(&dev->struct_mutex);
dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name); @@ -591,11 +602,36 @@ static void retire_worker(struct work_struct *work) mutex_unlock(&dev->struct_mutex); }
-/* call from irq handler to schedule work to retire bo's */ +static void signal_hw_fences(struct msm_ringbuffer *ring) +{
- unsigned long flags;
- struct msm_gem_submit *submit, *tmp;
- spin_lock_irqsave(&ring->lock, flags);
- list_for_each_entry_safe(submit, tmp, &ring->submits, node) {
if (submit->seqno > ring->memptrs->fence)
break;
list_del(&submit->node);
dma_fence_signal(submit->hw_fence);
- }
- spin_unlock_irqrestore(&ring->lock, flags);
+}
+/*
- Called from the IRQ context to signal hardware fences for the completed
- submits
- */ void msm_gpu_retire(struct msm_gpu *gpu) {
- struct msm_drm_private *priv = gpu->dev->dev_private;
- queue_work(priv->wq, &gpu->retire_work);
- int i;
- for (i = 0; i < gpu->nr_rings; i++)
signal_hw_fences(gpu->rb[i]);
- update_sw_cntrs(gpu); }
@@ -606,25 +642,28 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; struct msm_ringbuffer *ring = submit->ring;
- unsigned long flags; int i;
WARN_ON(!mutex_is_locked(&dev->struct_mutex));
submit->hw_fence = msm_fence_alloc(ring->fctx); if (IS_ERR(submit->hw_fence)) return submit->hw_fence;
pm_runtime_get_sync(&gpu->pdev->dev);
msm_gpu_hw_init(gpu);
- spin_lock_irqsave(&ring->lock, flags);
- list_add_tail(&submit->node, &ring->submits);
- spin_unlock_irqrestore(&ring->lock, flags); submit->seqno = ++ring->seqno;
- list_add_tail(&submit->node, &ring->submits);
- update_sw_cntrs(gpu);
I'm not sure if this needs the hardware to be up (it does check msm_gpu_active), but I don't think we should reorganize the order of these functions unless we need to.
Sure, I will check this out. The intent was to have only the bare essentials under the struct_mutex
- msm_rd_dump_submit(priv->rd, submit, NULL);
- mutex_lock(&dev->struct_mutex);
- update_sw_cntrs(gpu);
pm_runtime_get_sync(&gpu->pdev->dev);
msm_gpu_hw_init(gpu);
msm_rd_dump_submit(priv->rd, submit, NULL);
for (i = 0; i < submit->nr_bos; i++) { struct msm_gem_object *msm_obj = submit->bos[i].obj;
@@ -634,16 +673,13 @@ struct dma_fence *msm_gpu_submit(struct msm_gem_submit *submit) */ WARN_ON(is_active(msm_obj) && (msm_obj->gpu != gpu));
if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->hw_fence);
else if (submit->bos[i].flags & MSM_SUBMIT_BO_READ)
msm_gem_move_to_active(&msm_obj->base, gpu, false, submit->hw_fence);
msm_gem_move_to_active(&msm_obj->base, gpu);
}
gpu->funcs->submit(gpu, submit, submit->ctx); priv->lastctx = submit->ctx;
- hangcheck_timer_reset(gpu);
mutex_unlock(&dev->struct_mutex);
return submit->hw_fence; }
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index dd55979..3bd1920 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -173,6 +173,8 @@ struct msm_gpu_submitqueue { int faults; struct list_head node; struct kref ref;
struct msm_gpu *gpu;
struct drm_sched_entity sched_entity; };
static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 0889766..ef2bf10 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -56,9 +56,14 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
snprintf(ring->name, sizeof(ring->name), "msm-gpu-ring-%d", ring->id);
if (msm_sched_init(&ring->sched, ring->name) != 0) {
ret = -EINVAL;
goto fail;
} ring->fctx = msm_fence_context_alloc(gpu->dev, ring->name);
idr_init(&ring->fence_idr);
mutex_init(&ring->fence_idr_lock);
return ring;
@@ -74,6 +79,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
msm_fence_context_free(ring->fctx);
- msm_sched_cleanup(&ring->sched); if (ring->bo) { msm_gem_put_iova(ring->bo, ring->gpu->aspace); msm_gem_put_vaddr(ring->bo);
@@ -81,6 +87,7 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring) }
idr_destroy(&ring->fence_idr);
mutex_destroy(&ring->fence_idr_lock);
kfree(ring); }
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index 523373b..10ae4a8 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -19,6 +19,7 @@ #define __MSM_RINGBUFFER_H__
#include "msm_drv.h" +#include "msm_sched.h"
#define rbmemptr(ring, member) \ ((ring)->memptrs_iova + offsetof(struct msm_rbmemptrs, member)) @@ -42,7 +43,9 @@ struct msm_ringbuffer { uint64_t memptrs_iova; struct msm_fence_context *fctx; struct idr fence_idr;
struct mutex fence_idr_lock; spinlock_t lock;
struct drm_gpu_scheduler sched; };
struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
diff --git a/drivers/gpu/drm/msm/msm_sched.c b/drivers/gpu/drm/msm/msm_sched.c new file mode 100644 index 0000000..8b805ce --- /dev/null +++ b/drivers/gpu/drm/msm/msm_sched.c @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0 */
// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
+#include "msm_gpu.h" +#include "msm_gem.h" +#include "msm_sched.h"
+#include <linux/string_helpers.h> +#include <linux/kthread.h>
+struct msm_gem_submit *to_msm_gem_submit(struct drm_sched_job *sched_job) +{
- return container_of(sched_job, struct msm_gem_submit, sched_job);
+}
+/*
- Go through the bo's one by one and return the previously saved shared and
bo's -> bos
- exclusive fences. If the scheduler gets the fence, then it takes care of
- releasing the reference on the fence.
- */
+static struct dma_fence *msm_sched_dependency(struct drm_sched_job *sched_job,
struct drm_sched_entity *entity)
+{
- struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
- struct dma_fence *fence;
- int i;
- if (submit->in_fence) {
fence = submit->in_fence;
submit->in_fence = NULL;
if (!dma_fence_is_signaled(fence))
return fence;
dma_fence_put(fence);
- }
There might be a chance to consolidate some code here
static struct dma_fence *_get_fence(struct dma_fence **) { struct dma_fence *fence = *in; *in = NULL;
if (fence && !dma_fence_is_signaled(fence)) return fence;
dma_fence_put(fence); return NULL; }
fence = _fence(&submit->in_fence); if (fence) return fence;
- /* Return the previously saved shared and exclusive fences if any */
- for (i = 0; i < submit->nr_bos; i++) {
int j;
if (submit->bos[i].excl) {
fence = submit->bos[i].excl;
submit->bos[i].excl = NULL;
if (!dma_fence_is_signaled(fence))
return fence;
dma_fence_put(fence);
}
fence = _get_fence(&submit->bos[i].excl); if (fence) return fence;
for (j = 0; j < submit->bos[i].nr_shared; j++) {
if (!submit->bos[i].shared[j])
continue;
fence = submit->bos[i].shared[j];
submit->bos[i].shared[j] = NULL;
if (!dma_fence_is_signaled(fence))
return fence;
dma_fence_put(fence);
}
fence = _get_fence(&submit->bos[i].shared); if (fence) return fence;
kfree(submit->bos[i].shared);
submit->bos[i].nr_shared = 0;
submit->bos[i].shared = NULL;
- }
- return NULL;
+}
This is an interesting function - in order to avoid wasting cycles it needs to be ordered so that the most likely fences happen first. If that is the case, I think that in_fence might be the least likely so you should check it last.
Looked at this again. In addition to your suggestion, I think this function would benefit greatly from adding a couple of iterators, one for the bos and the other for the shared fences; that way it doesn't have to restart checking from scratch every time. The iterators would have to be part of the submit structure.
+static struct dma_fence *msm_sched_run_job(struct drm_sched_job *sched_job) +{
- struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
- return !sched_job->s_fence->finished.error ?
msm_gpu_submit(submit) : NULL;
+}
+static void dump_bad_submit(struct msm_gem_submit *submit) +{
- struct msm_gpu *gpu = submit->gpu;
- struct drm_device *dev = gpu->dev;
- struct msm_drm_private *priv = dev->dev_private;
- struct task_struct *task;
- char task_name[32] = {0};
- rcu_read_lock();
- task = pid_task(submit->pid, PIDTYPE_PID);
- if (task) {
char *cmd;
cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL);
This should be GFP_ATOMIC because we are in the rcu_read_lock(). There might be something else wrong with it too - I know we have this problem in the current kernel and I'm not sure if it is avoidable without losing the task name.
dev_err(&gpu->pdev->dev, "%s: offending task: %s (%s)\n",
gpu->name, task->comm, cmd);
snprintf(task_name, sizeof(task_name),
"offending task: %s (%s)", task->comm, cmd);
kfree(cmd);
- }
- rcu_read_unlock();
- mutex_lock(&gpu->dev->struct_mutex);
- msm_rd_dump_submit(priv->hangrd, submit, task_name);
- mutex_unlock(&gpu->dev->struct_mutex);
+}
+void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit) +{
- int i;
- static atomic_t gpu_recovery;
- /* Check if a GPU recovery is already in progress */
- if (!(atomic_cmpxchg(&gpu_recovery, 0, 1) == 0))
return;
- /*
* Pause the schedulers so we don't get new requests while the recovery
* is in progress
*/
- for (i = 0; i < gpu->nr_rings; i++)
kthread_park(gpu->rb[i]->sched.thread);
- /*
* Disable interrupts so we don't get interrupted while the recovery is
* in progress
*/
- disable_irq(gpu->irq);
- dev_err(&gpu->pdev->dev, "%s: hangcheck recover!\n", gpu->name);
I don't know if we still need this line? Recovery seems to be standard operating procedure these days.
- if (submit)
dump_bad_submit(submit);
- /*
* For each ring, we do the following
* a) Inform the scheduler to drop the hardware fence for the
* submissions on its mirror list
* b) The submit(job) list on the ring is not useful anymore
* as we are going for a gpu reset, so we empty the submit list
*/
- for (i = 0; i < gpu->nr_rings; i++) {
struct msm_gem_submit *job, *tmp;
unsigned long flags;
struct msm_ringbuffer *ring = gpu->rb[i];
/* a) Release the hardware fences */
drm_sched_hw_job_reset(&ring->sched,
(submit && submit->ring == ring) ?
&submit->sched_job : NULL);
/* b) Free up the ring submit list */
spin_lock_irqsave(&ring->lock, flags);
list_for_each_entry_safe(job, tmp, &ring->submits, node)
list_del(&job->node);
spin_unlock_irqrestore(&ring->lock, flags);
- }
- /* Power cycle and reset the GPU back to init state */
- mutex_lock(&gpu->dev->struct_mutex);
- pm_runtime_get_sync(&gpu->pdev->dev);
- gpu->funcs->recover(gpu);
- pm_runtime_put_sync(&gpu->pdev->dev);
- mutex_unlock(&gpu->dev->struct_mutex);
- /* Re-enable the interrupts once the gpu reset is complete */
- enable_irq(gpu->irq);
- /*
* The GPU is hopefully back in good shape, now request the schedulers
* to replay its mirror list, starting with the scheduler on the highest
* priority ring
*/
- for (i = 0; i < gpu->nr_rings; i++) {
drm_sched_job_recovery(&gpu->rb[i]->sched);
kthread_unpark(gpu->rb[i]->sched.thread);
- }
- atomic_set(&gpu_recovery, 0);
I think we need a smp_wmb() here to make sure everybody else sees the update.
+}
+static void msm_sched_timedout_job(struct drm_sched_job *bad_job) +{
- struct msm_gem_submit *submit = to_msm_gem_submit(bad_job);
- struct msm_gpu *gpu = submit->gpu;
- struct msm_ringbuffer *ring = submit->ring;
- /*
* If this submission completed in the mean time, then the timeout is
* spurious
*/
- if (submit->seqno <= submit->ring->memptrs->fence)
return;
- dev_err(&gpu->pdev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
gpu->name, ring->id);
- dev_err(&gpu->pdev->dev, "%s: completed fence: %u\n",
gpu->name, ring->memptrs->fence);
- dev_err(&gpu->pdev->dev, "%s: submitted fence: %u\n",
gpu->name, ring->seqno);
- msm_sched_gpu_recovery(gpu, submit);
+}
+static void msm_sched_free_job(struct drm_sched_job *sched_job) +{
- struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
- struct msm_gpu *gpu = submit->gpu;
- int i;
- mutex_lock(&gpu->dev->struct_mutex);
- for (i = 0; i < submit->nr_bos; i++) {
struct msm_gem_object *msm_obj = submit->bos[i].obj;
msm_gem_move_to_inactive(&msm_obj->base);
- }
- mutex_unlock(&gpu->dev->struct_mutex);
- pm_runtime_mark_last_busy(&gpu->pdev->dev);
- pm_runtime_put_autosuspend(&gpu->pdev->dev);
- msm_gem_submit_free(submit);
+}
+static const struct drm_sched_backend_ops msm_sched_ops = {
- .dependency = msm_sched_dependency,
- .run_job = msm_sched_run_job,
- .timedout_job = msm_sched_timedout_job,
- .free_job = msm_sched_free_job,
+};
+int msm_sched_job_init(struct drm_sched_job *sched_job) +{
- int ret;
- struct msm_gem_submit *submit = to_msm_gem_submit(sched_job);
- struct msm_ringbuffer *ring = submit->ring;
- ret = drm_sched_job_init(sched_job, &ring->sched,
&submit->queue->sched_entity, submit->ctx);
- if (ret)
return ret;
- submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished);
- mutex_lock(&ring->fence_idr_lock);
- submit->out_fence_id = idr_alloc_cyclic(&ring->fence_idr,
submit->out_fence, 0, INT_MAX,
GFP_KERNEL);
- mutex_unlock(&ring->fence_idr_lock);
- if (submit->out_fence_id < 0) {
/*
* TODO: The scheduler's finished fence leaks here since the
* job will not be pushed to the queue. Need to update scheduler
* to fix this cleanly(?)
*/
How would you propose to fix this?
The problem arises because the scheduler code provided an API to create a sched_job, but no API to clean it up; instead, it took care of cleaning up the sched_job implicitly for us.
I was thinking of this change... a) Add a drm_sched_job_cleanup() API which will drop the last reference count on the finished fence (dma_fence_put(finished_fence)) as part of this function. b) Call this new function from this failure case here as well as from msm_sched_free_job(). c) Remove dma_fence_put(finished_fence) from drm_sched_job_finish(), since it is now moved to (b).
This I think should handle both cases. (a) and (c) require changes in the scheduler and will impact other drivers too, so I did not try it out as part of this patch series.
I will make another patch and share it.
dma_fence_put(submit->out_fence);
submit->out_fence = NULL;
return -ENOMEM;
- }
- return 0;
+}
+void msm_sched_push_job(struct msm_gem_submit *submit) +{
- return drm_sched_entity_push_job(&submit->sched_job,
&submit->queue->sched_entity);
+}
+int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name) +{
- return drm_sched_init(sched, &msm_sched_ops, 4, 0,
msecs_to_jiffies(500), name);
Okay, I see where the ring->name comes in. I don't like it but at least it is a relatively low cost option if you share them with the fence names. Still... sigh.
+}
+void msm_sched_cleanup(struct drm_gpu_scheduler *sched) +{
- drm_sched_fini(sched);
+}
I don't think this function is needed - I think we're smart enough to call drm_sched_fini directly.
+/*
- Create a new entity on the schedulers normal priority runqueue. For now we
- choose to always use the normal run queue priority, but in the future it's
- possible with some extension to the msm drm interface, to create the
- submitqueue(entities) of different priorities on the same ring, thereby
- allowing us to prioritize and schedule submitqueues belonging to the same ring
- */
+int msm_sched_entity_init(struct msm_gpu *gpu,
struct drm_sched_entity *sched_entity, int prio)
+{
- struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
- return drm_sched_entity_init(sched, sched_entity,
&sched->sched_rq[DRM_SCHED_PRIORITY_NORMAL], NULL);
+}
+void msm_sched_entity_cleanup(struct msm_gpu *gpu,
struct drm_sched_entity *sched_entity, int prio)
+{
- struct drm_gpu_scheduler *sched = &gpu->rb[prio]->sched;
- drm_sched_entity_fini(sched, sched_entity);
+} diff --git a/drivers/gpu/drm/msm/msm_sched.h b/drivers/gpu/drm/msm/msm_sched.h new file mode 100644 index 0000000..6ab2728 --- /dev/null +++ b/drivers/gpu/drm/msm/msm_sched.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */
// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, The Linux Foundation. All rights reserved. */
+#ifndef __MSM_SCHED_H__ +#define __MSM_SCHED_H__
+#include <drm/gpu_scheduler.h>
+int msm_sched_init(struct drm_gpu_scheduler *sched, const char *name); +void msm_sched_cleanup(struct drm_gpu_scheduler *sched); +int msm_sched_entity_init(struct msm_gpu *gpu, struct drm_sched_entity *queue,
int prio);
+void msm_sched_entity_cleanup(struct msm_gpu *gpu,
struct drm_sched_entity *queue, int prio);
+int msm_sched_job_init(struct drm_sched_job *sched_job); +void msm_sched_push_job(struct msm_gem_submit *submit); +void msm_sched_gpu_recovery(struct msm_gpu *gpu, struct msm_gem_submit *submit); +#endif /* __MSM_SCHED_H__ */ diff --git a/drivers/gpu/drm/msm/msm_submitqueue.c b/drivers/gpu/drm/msm/msm_submitqueue.c index 325da44..b6eb13e 100644 --- a/drivers/gpu/drm/msm/msm_submitqueue.c +++ b/drivers/gpu/drm/msm/msm_submitqueue.c @@ -19,6 +19,7 @@ void msm_submitqueue_destroy(struct kref *kref) struct msm_gpu_submitqueue *queue = container_of(kref, struct msm_gpu_submitqueue, ref);
- msm_sched_entity_cleanup(queue->gpu, &queue->sched_entity, queue->prio); kfree(queue); }
@@ -65,6 +66,7 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, { struct msm_drm_private *priv = drm->dev_private; struct msm_gpu_submitqueue *queue;
struct msm_gpu *gpu = priv->gpu;
if (!ctx) return -ENODEV;
@@ -77,13 +79,16 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, kref_init(&queue->ref); queue->flags = flags;
- if (priv->gpu) {
if (prio >= priv->gpu->nr_rings) {
kfree(queue);
return -EINVAL;
}
if (gpu) {
if (prio >= gpu->nr_rings)
goto fail;
if (msm_sched_entity_init(priv->gpu, &queue->sched_entity,
prio))
goto fail;
queue->prio = prio;
queue->gpu = gpu;
}
write_lock(&ctx->queuelock);
@@ -98,6 +103,9 @@ int msm_submitqueue_create(struct drm_device *drm, struct msm_file_private *ctx, write_unlock(&ctx->queuelock);
return 0; +fail:
kfree(queue);
return -EINVAL; }
int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx)
-- 1.9.1
With the introduction of the scheduler, most of the code related to timeout detection, recovery and submission retirement are no longer needed in the msm driver. This patch simply removes the no longer used code.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 3 - drivers/gpu/drm/msm/msm_drv.h | 2 - drivers/gpu/drm/msm/msm_fence.c | 22 ---- drivers/gpu/drm/msm/msm_gem.c | 36 ------ drivers/gpu/drm/msm/msm_gpu.c | 204 ---------------------------------- drivers/gpu/drm/msm/msm_gpu.h | 6 - 6 files changed, 273 deletions(-)
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index d39400e..6f5a4c5 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1034,9 +1034,6 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu) gpu_read64(gpu, REG_A5XX_CP_IB2_BASE, REG_A5XX_CP_IB2_BASE_HI), gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ));
- /* Turn off the hangcheck timer to keep it from bothering us */ - del_timer(&gpu->hangcheck_timer); - queue_work(priv->wq, &gpu->recover_work); }
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index e461a9c..9004738 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -211,8 +211,6 @@ struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev, void *msm_gem_get_vaddr_active(struct drm_gem_object *obj); void msm_gem_put_vaddr(struct drm_gem_object *obj); int msm_gem_madvise(struct drm_gem_object *obj, unsigned madv); -int msm_gem_sync_object(struct drm_gem_object *obj, - struct msm_fence_context *fctx, bool exclusive); void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu); void msm_gem_move_to_inactive(struct drm_gem_object *obj); int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index 0e7912b..d5bba25 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -44,11 +44,6 @@ void msm_fence_context_free(struct msm_fence_context *fctx) kfree(fctx); }
-static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fence) -{ - return (int32_t)(fctx->completed_fence - fence) >= 0; -} - /* legacy path for WAIT_FENCE ioctl: */ int msm_wait_fence(struct msm_ringbuffer *ring, uint32_t fence_id, ktime_t *timeout) @@ -86,16 +81,6 @@ int msm_wait_fence(struct msm_ringbuffer *ring, uint32_t fence_id, return ret; }
-/* called from workqueue */ -void msm_update_fence(struct msm_fence_context *fctx, uint32_t fence) -{ - spin_lock(&fctx->spinlock); - fctx->completed_fence = max(fence, fctx->completed_fence); - spin_unlock(&fctx->spinlock); - - wake_up_all(&fctx->event); -} - struct msm_fence { struct dma_fence base; struct msm_fence_context *fctx; @@ -122,17 +107,10 @@ static bool msm_fence_enable_signaling(struct dma_fence *fence) return true; }
-static bool msm_fence_signaled(struct dma_fence *fence) -{ - struct msm_fence *f = to_msm_fence(fence); - return fence_completed(f->fctx, f->base.seqno); -} - static const struct dma_fence_ops msm_fence_ops = { .get_driver_name = msm_fence_get_driver_name, .get_timeline_name = msm_fence_get_timeline_name, .enable_signaling = msm_fence_enable_signaling, - .signaled = msm_fence_signaled, .wait = dma_fence_default_wait, .release = dma_fence_free, }; diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 7a12f30..e916c00 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -627,42 +627,6 @@ void msm_gem_vunmap(struct drm_gem_object *obj, enum msm_gem_lock subclass) mutex_unlock(&msm_obj->lock); }
-/* must be called before _move_to_active().. */ -int msm_gem_sync_object(struct drm_gem_object *obj, - struct msm_fence_context *fctx, bool exclusive) -{ - struct msm_gem_object *msm_obj = to_msm_bo(obj); - struct reservation_object_list *fobj; - struct dma_fence *fence; - int i, ret; - - fobj = reservation_object_get_list(msm_obj->resv); - if (!fobj || (fobj->shared_count == 0)) { - fence = reservation_object_get_excl(msm_obj->resv); - /* don't need to wait on our own fences, since ring is fifo */ - if (fence && (fence->context != fctx->context)) { - ret = dma_fence_wait(fence, true); - if (ret) - return ret; - } - } - - if (!exclusive || !fobj) - return 0; - - for (i = 0; i < fobj->shared_count; i++) { - fence = rcu_dereference_protected(fobj->shared[i], - reservation_object_held(msm_obj->resv)); - if (fence->context != fctx->context) { - ret = dma_fence_wait(fence, true); - if (ret) - return ret; - } - } - - return 0; -} - void msm_gem_move_to_active(struct drm_gem_object *obj, struct msm_gpu *gpu) { struct msm_gem_object *msm_obj = to_msm_bo(obj); diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 481a55c..1cc8745 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -20,7 +20,6 @@ #include "msm_mmu.h" #include "msm_fence.h"
-#include <linux/string_helpers.h> #include <linux/pm_opp.h> #include <linux/devfreq.h>
@@ -273,24 +272,6 @@ int msm_gpu_hw_init(struct msm_gpu *gpu) return ret; }
-/* - * Hangcheck detection for locked gpu: - */ - -static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring, - uint32_t fence) -{ - struct msm_gem_submit *submit; - - list_for_each_entry(submit, &ring->submits, node) { - if (submit->seqno > fence) - break; - - msm_update_fence(submit->ring->fctx, - submit->hw_fence->seqno); - } -} - static struct msm_gem_submit * find_submit(struct msm_ringbuffer *ring, uint32_t fence) { @@ -310,146 +291,14 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring, return NULL; }
-static void retire_submits(struct msm_gpu *gpu); - static void recover_worker(struct work_struct *work) { struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work); - struct drm_device *dev = gpu->dev; - struct msm_drm_private *priv = dev->dev_private; struct msm_gem_submit *submit; struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu); - int i;
submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1); return msm_sched_gpu_recovery(gpu, submit); - - /* - * The unused code below will be removed in a subsequent patch - */ - mutex_lock(&dev->struct_mutex); - - dev_err(dev->dev, "%s: hangcheck recover!\n", gpu->name); - - submit = find_submit(cur_ring, cur_ring->memptrs->fence + 1); - if (submit) { - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(submit->pid, PIDTYPE_PID); - if (task) { - char *cmd; - - /* - * So slightly annoying, in other paths like - * mmap'ing gem buffers, mmap_sem is acquired - * before struct_mutex, which means we can't - * hold struct_mutex across the call to - * get_cmdline(). But submits are retired - * from the same in-order workqueue, so we can - * safely drop the lock here without worrying - * about the submit going away. - */ - mutex_unlock(&dev->struct_mutex); - cmd = kstrdup_quotable_cmdline(task, GFP_KERNEL); - mutex_lock(&dev->struct_mutex); - - dev_err(dev->dev, "%s: offending task: %s (%s)\n", - gpu->name, task->comm, cmd); - - msm_rd_dump_submit(priv->hangrd, submit, - "offending task: %s (%s)", task->comm, cmd); - - kfree(cmd); - } else { - msm_rd_dump_submit(priv->hangrd, submit, NULL); - } - rcu_read_unlock(); - } - - - /* - * Update all the rings with the latest and greatest fence.. this - * needs to happen after msm_rd_dump_submit() to ensure that the - * bo's referenced by the offending submit are still around. - */ - for (i = 0; i < gpu->nr_rings; i++) { - struct msm_ringbuffer *ring = gpu->rb[i]; - - uint32_t fence = ring->memptrs->fence; - - /* - * For the current (faulting?) 
ring/submit advance the fence by - * one more to clear the faulting submit - */ - if (ring == cur_ring) - fence++; - - update_fences(gpu, ring, fence); - } - - if (msm_gpu_active(gpu)) { - /* retire completed submits, plus the one that hung: */ - retire_submits(gpu); - - pm_runtime_get_sync(&gpu->pdev->dev); - gpu->funcs->recover(gpu); - pm_runtime_put_sync(&gpu->pdev->dev); - - /* - * Replay all remaining submits starting with highest priority - * ring - */ - for (i = 0; i < gpu->nr_rings; i++) { - struct msm_ringbuffer *ring = gpu->rb[i]; - - list_for_each_entry(submit, &ring->submits, node) - gpu->funcs->submit(gpu, submit, NULL); - } - } - - mutex_unlock(&dev->struct_mutex); - - msm_gpu_retire(gpu); -} - -static void hangcheck_timer_reset(struct msm_gpu *gpu) -{ - DBG("%s", gpu->name); - mod_timer(&gpu->hangcheck_timer, - round_jiffies_up(jiffies + DRM_MSM_HANGCHECK_JIFFIES)); -} - -static void hangcheck_handler(struct timer_list *t) -{ - struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer); - struct drm_device *dev = gpu->dev; - struct msm_drm_private *priv = dev->dev_private; - struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu); - uint32_t fence = ring->memptrs->fence; - - if (fence != ring->hangcheck_fence) { - /* some progress has been made.. ya! */ - ring->hangcheck_fence = fence; - } else if (fence < ring->seqno) { - /* no progress and not done.. hung! */ - ring->hangcheck_fence = fence; - dev_err(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n", - gpu->name, ring->id); - dev_err(dev->dev, "%s: completed fence: %u\n", - gpu->name, fence); - dev_err(dev->dev, "%s: submitted fence: %u\n", - gpu->name, ring->seqno); - - queue_work(priv->wq, &gpu->recover_work); - } - - /* if still more pending work, reset the hangcheck timer: */ - if (ring->seqno > ring->hangcheck_fence) - hangcheck_timer_reset(gpu); - - /* workaround for missing irq: */ - queue_work(priv->wq, &gpu->retire_work); }
/* @@ -553,55 +402,6 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, /* * Cmdstream submission/retirement: */ - -static void retire_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) -{ - int i; - - for (i = 0; i < submit->nr_bos; i++) { - struct msm_gem_object *msm_obj = submit->bos[i].obj; - /* move to inactive: */ - msm_gem_move_to_inactive(&msm_obj->base); - } - - pm_runtime_mark_last_busy(&gpu->pdev->dev); - pm_runtime_put_autosuspend(&gpu->pdev->dev); - msm_gem_submit_free(submit); -} - -static void retire_submits(struct msm_gpu *gpu) -{ - struct drm_device *dev = gpu->dev; - struct msm_gem_submit *submit, *tmp; - int i; - - WARN_ON(!mutex_is_locked(&dev->struct_mutex)); - - /* Retire the commits starting with highest priority */ - for (i = 0; i < gpu->nr_rings; i++) { - struct msm_ringbuffer *ring = gpu->rb[i]; - - list_for_each_entry_safe(submit, tmp, &ring->submits, node) { - if (dma_fence_is_signaled(submit->hw_fence)) - retire_submit(gpu, submit); - } - } -} - -static void retire_worker(struct work_struct *work) -{ - struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work); - struct drm_device *dev = gpu->dev; - int i; - - for (i = 0; i < gpu->nr_rings; i++) - update_fences(gpu, gpu->rb[i], gpu->rb[i]->memptrs->fence); - - mutex_lock(&dev->struct_mutex); - retire_submits(gpu); - mutex_unlock(&dev->struct_mutex); -} - static void signal_hw_fences(struct msm_ringbuffer *ring) { unsigned long flags; @@ -791,12 +591,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, gpu->name = name;
INIT_LIST_HEAD(&gpu->active_list); - INIT_WORK(&gpu->retire_work, retire_worker); INIT_WORK(&gpu->recover_work, recover_worker);
- - timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0); - spin_lock_init(&gpu->perf_lock);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 3bd1920..6296758 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -98,9 +98,6 @@ struct msm_gpu { /* does gpu need hw_init? */ bool needs_hw_init;
- /* worker for handling active-list retiring: */ - struct work_struct retire_work; - void __iomem *mmio; int irq;
@@ -117,9 +114,6 @@ struct msm_gpu { */ #define DRM_MSM_INACTIVE_PERIOD 66 /* in ms (roughly four frames) */
-#define DRM_MSM_HANGCHECK_PERIOD 500 /* in ms */ -#define DRM_MSM_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_MSM_HANGCHECK_PERIOD) - struct timer_list hangcheck_timer; struct work_struct recover_work;
struct drm_gem_object *memptrs_bo;
Add an optional backend function op which will let the scheduler clients know when the timeout work got scheduled for a job. This will help drivers with multiple schedulers (one per ring) measure time spent on the ring accurately, eventually helping with better timeout detection.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/scheduler/gpu_scheduler.c | 16 +++++++++++++--- include/drm/gpu_scheduler.h | 6 ++++++ 2 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index bf0e0c9..f5534ff 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -69,6 +69,16 @@ static void drm_sched_rq_remove_entity(struct drm_sched_rq *rq, spin_unlock(&rq->lock); }
+static void drm_sched_queue_delayed_work(struct drm_sched_job *s_job) +{ + struct drm_gpu_scheduler *sched = s_job->sched; + + schedule_delayed_work(&s_job->work_tdr, sched->timeout); + + if (sched->ops->timeout_start_notify) + sched->ops->timeout_start_notify(s_job); +} + /** * Select an entity which could provide a job to run * @@ -467,7 +477,7 @@ static void drm_sched_job_finish(struct work_struct *work) struct drm_sched_job, node);
if (next) - schedule_delayed_work(&next->work_tdr, sched->timeout); + drm_sched_queue_delayed_work(next); } spin_unlock(&sched->job_list_lock); dma_fence_put(&s_job->s_fence->finished); @@ -494,7 +504,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job) if (sched->timeout != MAX_SCHEDULE_TIMEOUT && list_first_entry_or_null(&sched->ring_mirror_list, struct drm_sched_job, node) == s_job) - schedule_delayed_work(&s_job->work_tdr, sched->timeout); + drm_sched_queue_delayed_work(s_job); spin_unlock(&sched->job_list_lock); }
@@ -560,7 +570,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched) s_job = list_first_entry_or_null(&sched->ring_mirror_list, struct drm_sched_job, node); if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT) - schedule_delayed_work(&s_job->work_tdr, sched->timeout); + drm_sched_queue_delayed_work(s_job);
list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { struct drm_sched_fence *s_fence = s_job->s_fence; diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index dec6558..5c59c38 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -157,6 +157,12 @@ struct drm_sched_backend_ops { * it's time to clean it up. */ void (*free_job)(struct drm_sched_job *sched_job); + + /* + * (Optional) Called to let the driver know that a timeout detection + * timer has been started for this job. + */ + void (*timeout_start_notify)(struct drm_sched_job *sched_job); };
/**
From: Matteo Croce mcroce@redhat.com
add jiffies_delta_to_msecs() helper func to calculate the delta between two times, returning 0 if the delta is negative.
Suggested-by: Eric Dumazet eric.dumazet@gmail.com Signed-off-by: Matteo Croce mcroce@redhat.com Reviewed-by: Eric Dumazet edumazet@google.com Acked-by: Simon Horman horms@verge.net.au Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org --- include/linux/jiffies.h | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index a27cf66..fa92824 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -447,6 +447,11 @@ static inline clock_t jiffies_delta_to_clock_t(long delta) return jiffies_to_clock_t(max(0L, delta)); }
+static inline unsigned int jiffies_delta_to_msecs(long delta) +{ + return jiffies_to_msecs(max(0L, delta)); +} + extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x);
The base scheduler patch has a barebones timeout implementation; it does not account for issues like starvation on lower priority rings. This patch enables more accurate measurement of the time spent on each ringbuffer, thereby giving us a better timeout detection mechanism.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org --- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 29 +++++++++++++++++++++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_ringbuffer.h | 2 ++ drivers/gpu/drm/msm/msm_sched.c | 42 +++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c index 6a3c560..8bf81c1c 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -165,6 +165,33 @@ void a5xx_preempt_trigger(struct msm_gpu *gpu) gpu_write(gpu, REG_A5XX_CP_CONTEXT_SWITCH_CNTL, 1); }
+static void update_ring_timestamps(struct msm_ringbuffer *prev_ring, + struct msm_ringbuffer *cur_ring) +{ + unsigned long flags; + + /* + * For the outgoing ring(prev_ring), capture the last sample of time + * spent on this ring and add it to the ring's total active_time. + */ + spin_lock_irqsave(&prev_ring->lock, flags); + + prev_ring->active_time += jiffies_delta_to_msecs(jiffies - + prev_ring->last_ts); + + spin_unlock_irqrestore(&prev_ring->lock, flags); + + /* + * For the incoming ring(cur_ring), save the new current timestamp to + * restart active time measurement + */ + spin_lock_irqsave(&cur_ring->lock, flags); + + cur_ring->last_ts = jiffies_to_msecs(jiffies); + + spin_unlock_irqrestore(&cur_ring->lock, flags); +} + void a5xx_preempt_irq(struct msm_gpu *gpu) { uint32_t status; @@ -194,6 +221,8 @@ void a5xx_preempt_irq(struct msm_gpu *gpu) return; }
+ update_ring_timestamps(a5xx_gpu->cur_ring, a5xx_gpu->next_ring); + a5xx_gpu->cur_ring = a5xx_gpu->next_ring; a5xx_gpu->next_ring = NULL;
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 17d0506..f8b5f4a 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -212,6 +212,9 @@ int adreno_hw_init(struct msm_gpu *gpu) /* reset completed fence seqno: */ ring->memptrs->fence = ring->seqno; ring->memptrs->rptr = 0; + + ring->last_ts = 0; + ring->active_time = 0; }
/* diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index 10ae4a8..27e0ab2 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -46,6 +46,8 @@ struct msm_ringbuffer { struct mutex fence_idr_lock; spinlock_t lock; struct drm_gpu_scheduler sched; + u32 last_ts; + u32 active_time; };
struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id, diff --git a/drivers/gpu/drm/msm/msm_sched.c b/drivers/gpu/drm/msm/msm_sched.c index 8b805ce..70b7713 100644 --- a/drivers/gpu/drm/msm/msm_sched.c +++ b/drivers/gpu/drm/msm/msm_sched.c @@ -191,6 +191,9 @@ static void msm_sched_timedout_job(struct drm_sched_job *bad_job) struct msm_gem_submit *submit = to_msm_gem_submit(bad_job); struct msm_gpu *gpu = submit->gpu; struct msm_ringbuffer *ring = submit->ring; + struct drm_gpu_scheduler *sched = &ring->sched; + unsigned long flags; + u32 total_time = 0;
/* * If this submission completed in the mean time, then the timeout is @@ -199,6 +202,23 @@ static void msm_sched_timedout_job(struct drm_sched_job *bad_job) if (submit->seqno <= submit->ring->memptrs->fence) return;
+ spin_lock_irqsave(&ring->lock, flags); + + total_time = ring->active_time; + + /* Measure the last sample only if this is the active ring */ + if (ring == gpu->funcs->active_ring(gpu)) + total_time += jiffies_delta_to_msecs(jiffies - ring->last_ts); + + spin_unlock_irqrestore(&ring->lock, flags); + + if (total_time < sched->timeout) { + schedule_delayed_work(&bad_job->work_tdr, + msecs_to_jiffies(sched->timeout - total_time)); + return; + } + + /* Timeout occurred, go for a recovery */ dev_err(&gpu->pdev->dev, "%s: hangcheck detected gpu lockup rb %d!\n", gpu->name, ring->id); dev_err(&gpu->pdev->dev, "%s: completed fence: %u\n", @@ -231,11 +251,33 @@ static void msm_sched_free_job(struct drm_sched_job *sched_job) msm_gem_submit_free(submit); }
+static void msm_sched_timeout_start(struct drm_sched_job *sched_job) +{ + struct msm_gem_submit *submit = to_msm_gem_submit(sched_job); + struct msm_gpu *gpu = submit->gpu; + struct msm_ringbuffer *ring = submit->ring; + unsigned long flags; + + spin_lock_irqsave(&ring->lock, flags); + + ring->active_time = 0; + + /* + * Save the initial timestamp only if this ring is active. For other + * rings the initial timestamp is captured at preemption switch-in + */ + if (ring == gpu->funcs->active_ring(gpu)) + ring->last_ts = jiffies_to_msecs(jiffies); + + spin_unlock_irqrestore(&ring->lock, flags); +} + static const struct drm_sched_backend_ops msm_sched_ops = { .dependency = msm_sched_dependency, .run_job = msm_sched_run_job, .timedout_job = msm_sched_timedout_job, .free_job = msm_sched_free_job, + .timeout_start_notify = msm_sched_timeout_start, };
int msm_sched_job_init(struct drm_sched_job *sched_job)
dri-devel@lists.freedesktop.org