The READ/WRITE flags are particularly useful if we want to avoid serialization of jobs that read from the same BO but never write to it. The NO_IMPLICIT_FENCE might be useful when the user knows the BO is shared but jobs are using different portions of the buffer.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_drv.c | 72 +++++++++-- drivers/gpu/drm/panfrost/panfrost_job.c | 164 +++++++++++++++++++----- drivers/gpu/drm/panfrost/panfrost_job.h | 11 +- include/uapi/drm/panfrost_drm.h | 41 ++++++ 4 files changed, 247 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index d74442d71048..08082fd557c3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -119,20 +119,76 @@ panfrost_lookup_bos(struct drm_device *dev, struct drm_panfrost_submit *args, struct panfrost_job *job) { - job->bo_count = args->bo_handle_count; + struct drm_panfrost_submit_bo *bo_descs = NULL; + u32 *handles = NULL; + u32 i, bo_count; + int ret = 0;
- if (!job->bo_count) + bo_count = args->bo_desc_count ? + args->bo_desc_count : args->bo_handle_count; + if (!bo_count) return 0;
- job->implicit_fences = kvmalloc_array(job->bo_count, - sizeof(struct dma_fence *), + job->bos = kvmalloc_array(bo_count, sizeof(*job->bos), GFP_KERNEL | __GFP_ZERO); - if (!job->implicit_fences) + if (!job->bos) return -ENOMEM;
- return drm_gem_objects_lookup(file_priv, - (void __user *)(uintptr_t)args->bo_handles, - job->bo_count, &job->bos); + job->bo_count = bo_count; + bo_descs = kvmalloc_array(bo_count, sizeof(*bo_descs), + GFP_KERNEL | __GFP_ZERO); + if (!bo_descs) { + ret = -ENOMEM; + goto out; + } + + if (!args->bo_desc_count) { + handles = kvmalloc_array(bo_count, sizeof(*handles), + GFP_KERNEL); + if (!handles) { + ret =-ENOMEM; + goto out; + } + + if (copy_from_user(handles, + (void __user *)(uintptr_t)args->bo_handles, + job->bo_count * sizeof(*handles))) { + ret = -EFAULT; + goto out; + } + + for (i = 0; i < job->bo_count; i++) { + bo_descs[i].handle = handles[i]; + bo_descs[i].flags = PANFROST_SUBMIT_BO_WRITE | + PANFROST_SUBMIT_BO_READ; + } + } else if (copy_from_user(bo_descs, + (void __user *)(uintptr_t)args->bo_descs, + job->bo_count * sizeof(*bo_descs))) { + ret = -EFAULT; + goto out; + } + + for (i = 0; i < job->bo_count; i++) { + if ((bo_descs[i].flags & ~PANFROST_SUBMIT_BO_VALID_FLAGS) || + !(bo_descs[i].flags & PANFROST_SUBMIT_BO_RW)) { + ret = -EINVAL; + goto out; + } + + job->bos[i].flags = bo_descs[i].flags; + job->bos[i].obj = drm_gem_object_lookup(file_priv, + bo_descs[i].handle); + if (!job->bos[i].obj) { + ret = -ENOENT; + goto out; + } + } + +out: + kvfree(handles); + kvfree(bo_descs); + return ret; }
/** diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 05c85f45a0de..e4b74fde9339 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -193,24 +193,116 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js) pm_runtime_put_autosuspend(pfdev->dev); }
-static void panfrost_acquire_object_fences(struct drm_gem_object **bos, - int bo_count, - struct dma_fence **implicit_fences) +static int panfrost_acquire_object_fences(struct panfrost_job *job) { - int i; + int i, ret;
- for (i = 0; i < bo_count; i++) - implicit_fences[i] = dma_resv_get_excl_rcu(bos[i]->resv); + for (i = 0; i < job->bo_count; i++) { + struct panfrost_job_bo_desc *bo = &job->bos[i]; + struct dma_resv *robj = bo->obj->resv; + + if (!(job->bos[i].flags & PANFROST_SUBMIT_BO_WRITE)) { + ret = dma_resv_reserve_shared(robj, 1); + if (ret) + return ret; + } + + if (bo->flags & PANFROST_SUBMIT_BO_NO_IMPLICIT_FENCE) + continue; + + if (bo->flags & PANFROST_SUBMIT_BO_WRITE) { + ret = dma_resv_get_fences_rcu(robj, &bo->excl, + &bo->shared_count, + &bo->shared); + if (ret) + return ret; + } else { + bo->excl = dma_resv_get_excl_rcu(robj); + } + } + + return 0; }
-static void panfrost_attach_object_fences(struct drm_gem_object **bos, - int bo_count, - struct dma_fence *fence) +static void panfrost_attach_object_fences(struct panfrost_job *job) { int i;
- for (i = 0; i < bo_count; i++) - dma_resv_add_excl_fence(bos[i]->resv, fence); + for (i = 0; i < job->bo_count; i++) { + struct drm_gem_object *obj = job->bos[i].obj; + + if (job->bos[i].flags & PANFROST_SUBMIT_BO_WRITE) + dma_resv_add_excl_fence(obj->resv, + job->render_done_fence); + else + dma_resv_add_shared_fence(obj->resv, + job->render_done_fence); + } +} + +static int panfrost_job_lock_bos(struct panfrost_job *job, + struct ww_acquire_ctx *acquire_ctx) +{ + int contended = -1; + int i, ret; + + ww_acquire_init(acquire_ctx, &reservation_ww_class); + +retry: + if (contended != -1) { + struct drm_gem_object *obj = job->bos[contended].obj; + + ret = ww_mutex_lock_slow_interruptible(&obj->resv->lock, + acquire_ctx); + if (ret) { + ww_acquire_done(acquire_ctx); + return ret; + } + } + + for (i = 0; i < job->bo_count; i++) { + if (i == contended) + continue; + + ret = ww_mutex_lock_interruptible(&job->bos[i].obj->resv->lock, + acquire_ctx); + if (ret) { + int j; + + for (j = 0; j < i; j++) + ww_mutex_unlock(&job->bos[j].obj->resv->lock); + + if (contended != -1 && contended >= i) { + struct drm_gem_object *contended_obj; + + contended_obj = job->bos[contended].obj; + ww_mutex_unlock(&contended_obj->resv->lock); + } + + if (ret == -EDEADLK) { + contended = i; + goto retry; + } + + ww_acquire_done(acquire_ctx); + return ret; + } + } + + ww_acquire_done(acquire_ctx); + + return 0; +} + +static void panfrost_job_unlock_bos(struct panfrost_job *job, + struct ww_acquire_ctx *acquire_ctx) +{ + int i; + + for (i = 0; i < job->bo_count; i++) + ww_mutex_unlock(&job->bos[i].obj->resv->lock); + + ww_acquire_fini(acquire_ctx); }
int panfrost_job_push(struct panfrost_job *job) @@ -223,8 +315,7 @@ int panfrost_job_push(struct panfrost_job *job)
mutex_lock(&pfdev->sched_lock);
- ret = drm_gem_lock_reservations(job->bos, job->bo_count, - &acquire_ctx); + ret = panfrost_job_lock_bos(job, &acquire_ctx); if (ret) { mutex_unlock(&pfdev->sched_lock); return ret; @@ -240,18 +331,16 @@ int panfrost_job_push(struct panfrost_job *job)
kref_get(&job->refcount); /* put by scheduler job completion */
- panfrost_acquire_object_fences(job->bos, job->bo_count, - job->implicit_fences); + panfrost_acquire_object_fences(job);
drm_sched_entity_push_job(&job->base, entity);
mutex_unlock(&pfdev->sched_lock);
- panfrost_attach_object_fences(job->bos, job->bo_count, - job->render_done_fence); + panfrost_attach_object_fences(job);
unlock: - drm_gem_unlock_reservations(job->bos, job->bo_count, &acquire_ctx); + panfrost_job_unlock_bos(job, &acquire_ctx);
return ret; } @@ -267,20 +356,22 @@ static void panfrost_job_cleanup(struct kref *ref) dma_fence_put(job->in_fences[i]); kvfree(job->in_fences); } - if (job->implicit_fences) { - for (i = 0; i < job->bo_count; i++) - dma_fence_put(job->implicit_fences[i]); - kvfree(job->implicit_fences); + + for (i = 0; i < job->bo_count; i++) { + unsigned int j; + + dma_fence_put(job->bos[i].excl); + for (j = 0; j < job->bos[i].shared_count; j++) + dma_fence_put(job->bos[i].shared[j]); } + dma_fence_put(job->done_fence); dma_fence_put(job->render_done_fence);
- if (job->bos) { - for (i = 0; i < job->bo_count; i++) - drm_gem_object_put_unlocked(job->bos[i]); - kvfree(job->bos); - } + for (i = 0; i < job->bo_count; i++) + drm_gem_object_put_unlocked(job->bos[i].obj);
+ kvfree(job->bos); kfree(job); }
@@ -314,11 +405,22 @@ static struct dma_fence *panfrost_job_dependency(struct drm_sched_job *sched_job } }
- /* Implicit fences, max. one per BO */ + /* Implicit fences */ for (i = 0; i < job->bo_count; i++) { - if (job->implicit_fences[i]) { - fence = job->implicit_fences[i]; - job->implicit_fences[i] = NULL; + unsigned int j; + + if (job->bos[i].excl) { + fence = job->bos[i].excl; + job->bos[i].excl = NULL; + return fence; + } + + for (j = 0; j < job->bos[i].shared_count; j++) { + if (!job->bos[i].shared[j]) + continue; + + fence = job->bos[i].shared[j]; + job->bos[i].shared[j] = NULL; return fence; } } diff --git a/drivers/gpu/drm/panfrost/panfrost_job.h b/drivers/gpu/drm/panfrost/panfrost_job.h index 62454128a792..53440640a6e3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.h +++ b/drivers/gpu/drm/panfrost/panfrost_job.h @@ -11,6 +11,14 @@ struct panfrost_device; struct panfrost_gem_object; struct panfrost_file_priv;
+struct panfrost_job_bo_desc { + struct drm_gem_object *obj; + struct dma_fence *excl; + struct dma_fence **shared; + unsigned int shared_count; + u32 flags; +}; + struct panfrost_job { struct drm_sched_job base;
@@ -31,8 +39,7 @@ struct panfrost_job { __u32 flush_id;
/* Exclusive fences we have taken from the BOs to wait for */ - struct dma_fence **implicit_fences; - struct drm_gem_object **bos; + struct panfrost_job_bo_desc *bos; u32 bo_count;
/* Fence to be signaled by drm-sched once its done with the job */ diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index ec19db1eead8..029c6ae1b1f0 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -40,6 +40,28 @@ extern "C" { #define DRM_IOCTL_PANFROST_PERFCNT_DUMP DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_PERFCNT_DUMP, struct drm_panfrost_perfcnt_dump)
#define PANFROST_JD_REQ_FS (1 << 0) + +#define PANFROST_SUBMIT_BO_READ (1 << 0) +#define PANFROST_SUBMIT_BO_WRITE (1 << 1) +#define PANFROST_SUBMIT_BO_RW (PANFROST_SUBMIT_BO_READ | \ + PANFROST_SUBMIT_BO_WRITE) +#define PANFROST_SUBMIT_BO_NO_IMPLICIT_FENCE (1 << 2) +#define PANFROST_SUBMIT_BO_VALID_FLAGS \ + (PANFROST_SUBMIT_BO_RW | PANFROST_SUBMIT_BO_NO_IMPLICIT_FENCE) + +/** + * struct drm_panfrost_submit_bo - BO descriptor passed to the submit ioctl. + * + * Useful to give detailed information about BOs used by a given job. + * + * @handle: the GEM handle + * @flags: a combination of PANFROST_SUBMIT_BO_* + */ +struct drm_panfrost_submit_bo { + __u32 handle; + __u32 flags; +}; + /** * struct drm_panfrost_submit - ioctl argument for submitting commands to the 3D * engine. @@ -68,6 +90,25 @@ struct drm_panfrost_submit {
/** A combination of PANFROST_JD_REQ_* */ __u32 requirements; + + /** + * Pointer to a u32 array of &drm_panfrost_submit_bo_desc objects. This + * field is meant to replace &drm_panfrost_submit.bo_handles which did + * not provide enough information to relax synchronization between + * jobs that only read the BO they share. When both + * &drm_panfrost_submit.bo_handles and &drm_panfrost_submit.bo_descs + * are provided, drm_panfrost_submit.bo_handles is ignored. + */ + __u64 bo_descs; + + /** + * Number of BO descriptors passed in (size is that times + * sizeof(drm_panfrost_submit_bo_desc)). + */ + __u32 bo_desc_count; + + /** Padding, must be 0. */ + __u32 pad; };
/**
So we can choose to wait for all BO users, or just for writers.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_drv.c | 8 ++++++-- include/uapi/drm/panfrost_drm.h | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 08082fd557c3..6a94aea2147f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -322,16 +322,20 @@ panfrost_ioctl_wait_bo(struct drm_device *dev, void *data, struct drm_panfrost_wait_bo *args = data; struct drm_gem_object *gem_obj; unsigned long timeout = drm_timeout_abs_to_jiffies(args->timeout_ns); + bool wait_all = !(args->flags & PANFROST_WAIT_BO_WRITERS);
if (args->pad) return -EINVAL;
+ if (args->flags & ~PANFROST_WAIT_BO_WRITERS) + return -EINVAL; + gem_obj = drm_gem_object_lookup(file_priv, args->handle); if (!gem_obj) return -ENOENT;
- ret = dma_resv_wait_timeout_rcu(gem_obj->resv, true, - true, timeout); + ret = dma_resv_wait_timeout_rcu(gem_obj->resv, wait_all, + true, timeout); if (!ret) ret = timeout ? -ETIMEDOUT : -EBUSY;
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index 029c6ae1b1f0..ac4facbcee47 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -111,6 +111,9 @@ struct drm_panfrost_submit { __u32 pad; };
+#define PANFROST_WAIT_ALL_BO_USERS (0 << 0) +#define PANFROST_WAIT_BO_WRITERS (1 << 0) + /** * struct drm_panfrost_wait_bo - ioctl argument for waiting for * completion of the last DRM_PANFROST_SUBMIT on a BO. @@ -123,6 +126,7 @@ struct drm_panfrost_wait_bo { __u32 handle; __u32 pad; __s64 timeout_ns; /* absolute */ + __u64 flags; };
#define PANFROST_BO_NOEXEC 1
On 13/09/2019 12:17, Boris Brezillon wrote:
So we can choose to wait for all BO users, or just for writers.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com
Looks good to me:
Reviewed-by: Steven Price steven.price@arm.com
Although I don't know if the term "writers" should be in the API or if "exclusive" is the preferred term?
Steve
drivers/gpu/drm/panfrost/panfrost_drv.c | 8 ++++++-- include/uapi/drm/panfrost_drm.h | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 08082fd557c3..6a94aea2147f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -322,16 +322,20 @@ panfrost_ioctl_wait_bo(struct drm_device *dev, void *data, struct drm_panfrost_wait_bo *args = data; struct drm_gem_object *gem_obj; unsigned long timeout = drm_timeout_abs_to_jiffies(args->timeout_ns);
bool wait_all = !(args->flags & PANFROST_WAIT_BO_WRITERS);
if (args->pad) return -EINVAL;
if (args->flags & ~PANFROST_WAIT_BO_WRITERS)
return -EINVAL;
gem_obj = drm_gem_object_lookup(file_priv, args->handle); if (!gem_obj) return -ENOENT;
- ret = dma_resv_wait_timeout_rcu(gem_obj->resv, true,
true, timeout);
- ret = dma_resv_wait_timeout_rcu(gem_obj->resv, wait_all,
true, timeout);
if (!ret) ret = timeout ? -ETIMEDOUT : -EBUSY;
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index 029c6ae1b1f0..ac4facbcee47 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -111,6 +111,9 @@ struct drm_panfrost_submit { __u32 pad; };
+#define PANFROST_WAIT_ALL_BO_USERS (0 << 0) +#define PANFROST_WAIT_BO_WRITERS (1 << 0)
/**
- struct drm_panfrost_wait_bo - ioctl argument for waiting for
- completion of the last DRM_PANFROST_SUBMIT on a BO.
@@ -123,6 +126,7 @@ struct drm_panfrost_wait_bo { __u32 handle; __u32 pad; __s64 timeout_ns; /* absolute */
- __u64 flags;
};
#define PANFROST_BO_NOEXEC 1
On Fri, 13 Sep 2019 14:46:46 +0100 Steven Price steven.price@arm.com wrote:
On 13/09/2019 12:17, Boris Brezillon wrote:
So we can choose to wait for all BO users, or just for writers.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com
Looks good to me:
Reviewed-by: Steven Price steven.price@arm.com
Although I don't know if the term "writers" should be in the API or if "exclusive" is the preferred term?
I'll go for PANFROST_WAIT_BO_EXCLUSIVE then.
Thanks,
Boris
Steve
drivers/gpu/drm/panfrost/panfrost_drv.c | 8 ++++++-- include/uapi/drm/panfrost_drm.h | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 08082fd557c3..6a94aea2147f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -322,16 +322,20 @@ panfrost_ioctl_wait_bo(struct drm_device *dev, void *data, struct drm_panfrost_wait_bo *args = data; struct drm_gem_object *gem_obj; unsigned long timeout = drm_timeout_abs_to_jiffies(args->timeout_ns);
bool wait_all = !(args->flags & PANFROST_WAIT_BO_WRITERS);
if (args->pad) return -EINVAL;
if (args->flags & ~PANFROST_WAIT_BO_WRITERS)
return -EINVAL;
gem_obj = drm_gem_object_lookup(file_priv, args->handle); if (!gem_obj) return -ENOENT;
- ret = dma_resv_wait_timeout_rcu(gem_obj->resv, true,
true, timeout);
- ret = dma_resv_wait_timeout_rcu(gem_obj->resv, wait_all,
true, timeout);
if (!ret) ret = timeout ? -ETIMEDOUT : -EBUSY;
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index 029c6ae1b1f0..ac4facbcee47 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -111,6 +111,9 @@ struct drm_panfrost_submit { __u32 pad; };
+#define PANFROST_WAIT_ALL_BO_USERS (0 << 0) +#define PANFROST_WAIT_BO_WRITERS (1 << 0)
/**
- struct drm_panfrost_wait_bo - ioctl argument for waiting for
- completion of the last DRM_PANFROST_SUBMIT on a BO.
@@ -123,6 +126,7 @@ struct drm_panfrost_wait_bo { __u32 handle; __u32 pad; __s64 timeout_ns; /* absolute */
- __u64 flags;
};
#define PANFROST_BO_NOEXEC 1
On 13/09/2019 12:17, Boris Brezillon wrote:
The READ/WRITE flags are particularly useful if we want to avoid serialization of jobs that read from the same BO but never write to it. The NO_IMPLICIT_FENCE might be useful when the user knows the BO is shared but jobs are using different portions of the buffer.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com
Good feature - we could do with an (easy) way of the user driver detecting this - so it might be worth bumping the driver version for this?
Some more comments below.
drivers/gpu/drm/panfrost/panfrost_drv.c | 72 +++++++++-- drivers/gpu/drm/panfrost/panfrost_job.c | 164 +++++++++++++++++++----- drivers/gpu/drm/panfrost/panfrost_job.h | 11 +- include/uapi/drm/panfrost_drm.h | 41 ++++++ 4 files changed, 247 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index d74442d71048..08082fd557c3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -119,20 +119,76 @@ panfrost_lookup_bos(struct drm_device *dev, struct drm_panfrost_submit *args, struct panfrost_job *job) {
- job->bo_count = args->bo_handle_count;
- struct drm_panfrost_submit_bo *bo_descs = NULL;
- u32 *handles = NULL;
- u32 i, bo_count;
- int ret = 0;
- if (!job->bo_count)
- bo_count = args->bo_desc_count ?
args->bo_desc_count : args->bo_handle_count;
- if (!bo_count) return 0;
- job->implicit_fences = kvmalloc_array(job->bo_count,
sizeof(struct dma_fence *),
- job->bos = kvmalloc_array(bo_count, sizeof(*job->bos), GFP_KERNEL | __GFP_ZERO);
- if (!job->implicit_fences)
- if (!job->bos) return -ENOMEM;
- return drm_gem_objects_lookup(file_priv,
(void __user *)(uintptr_t)args->bo_handles,
job->bo_count, &job->bos);
- job->bo_count = bo_count;
- bo_descs = kvmalloc_array(bo_count, sizeof(*bo_descs),
GFP_KERNEL | __GFP_ZERO);
- if (!bo_descs) {
ret = -ENOMEM;
goto out;
This can be just "return -ENOMEM" - both handles and bo_descs will be NULL.
- }
- if (!args->bo_desc_count) {
handles = kvmalloc_array(bo_count, sizeof(*handles),
GFP_KERNEL);
if (!handles) {
ret =-ENOMEM;
goto out;
}
if (copy_from_user(handles,
(void __user *)(uintptr_t)args->bo_handles,
job->bo_count * sizeof(*handles))) {
ret = -EFAULT;
goto out;
}
for (i = 0; i < job->bo_count; i++) {
bo_descs[i].handle = handles[i];
bo_descs[i].flags = PANFROST_SUBMIT_BO_WRITE |
PANFROST_SUBMIT_BO_READ;
You can use PANFROST_SUBMIT_BO_RW here.
}
- } else if (copy_from_user(bo_descs,
(void __user *)(uintptr_t)args->bo_descs,
job->bo_count * sizeof(*bo_descs))) {
ret = -EFAULT;
goto out;
- }
- for (i = 0; i < job->bo_count; i++) {
if ((bo_descs[i].flags & ~PANFROST_SUBMIT_BO_VALID_FLAGS) ||
!(bo_descs[i].flags & PANFROST_SUBMIT_BO_RW)) {
ret = -EINVAL;
goto out;
}
job->bos[i].flags = bo_descs[i].flags;
job->bos[i].obj = drm_gem_object_lookup(file_priv,
bo_descs[i].handle);
if (!job->bos[i].obj) {
ret = -ENOENT;
goto out;
}
- }
+out:
- kvfree(handles);
- kvfree(bo_descs);
- return ret;
}
/** diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 05c85f45a0de..e4b74fde9339 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -193,24 +193,116 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js) pm_runtime_put_autosuspend(pfdev->dev); }
-static void panfrost_acquire_object_fences(struct drm_gem_object **bos,
int bo_count,
struct dma_fence **implicit_fences)
+static int panfrost_acquire_object_fences(struct panfrost_job *job) {
- int i;
- int i, ret;
- for (i = 0; i < bo_count; i++)
implicit_fences[i] = dma_resv_get_excl_rcu(bos[i]->resv);
- for (i = 0; i < job->bo_count; i++) {
struct panfrost_job_bo_desc *bo = &job->bos[i];
struct dma_resv *robj = bo->obj->resv;
if (!(job->bos[i].flags & PANFROST_SUBMIT_BO_WRITE)) {
ret = dma_resv_reserve_shared(robj, 1);
if (ret)
return ret;
}
if (bo->flags & PANFROST_SUBMIT_BO_NO_IMPLICIT_FENCE)
continue;
if (bo->flags & PANFROST_SUBMIT_BO_WRITE) {
ret = dma_resv_get_fences_rcu(robj, &bo->excl,
&bo->shared_count,
&bo->shared);
if (ret)
return ret;
} else {
bo->excl = dma_resv_get_excl_rcu(robj);
}
The implementation of NO_IMPLICIT_FENCE seems a bit strange to me: READ | NO_IMPLICIT_FENCE still reserves space for a shared fence. I don't understand why.
- }
- return 0;
}
-static void panfrost_attach_object_fences(struct drm_gem_object **bos,
int bo_count,
struct dma_fence *fence)
+static void panfrost_attach_object_fences(struct panfrost_job *job) { int i;
- for (i = 0; i < bo_count; i++)
dma_resv_add_excl_fence(bos[i]->resv, fence);
- for (i = 0; i < job->bo_count; i++) {
struct drm_gem_object *obj = job->bos[i].obj;
if (job->bos[i].flags & PANFROST_SUBMIT_BO_WRITE)
dma_resv_add_excl_fence(obj->resv,
job->render_done_fence);
else
dma_resv_add_shared_fence(obj->resv,
job->render_done_fence);
- }
+}
+static int panfrost_job_lock_bos(struct panfrost_job *job,
struct ww_acquire_ctx *acquire_ctx)
+{
- int contended = -1;
- int i, ret;
- ww_acquire_init(acquire_ctx, &reservation_ww_class);
+retry:
- if (contended != -1) {
struct drm_gem_object *obj = job->bos[contended].obj;
ret = ww_mutex_lock_slow_interruptible(&obj->resv->lock,
acquire_ctx);
dma_resv_lock_slow_interruptible()?
if (ret) {
ww_acquire_done(acquire_ctx);
return ret;
}
- }
- for (i = 0; i < job->bo_count; i++) {
if (i == contended)
continue;
ret = ww_mutex_lock_interruptible(&job->bos[i].obj->resv->lock,
acquire_ctx);
dma_resv_lock_interruptible()?
if (ret) {
int j;
for (j = 0; j < i; j++)
ww_mutex_unlock(&job->bos[j].obj->resv->lock);
if (contended != -1 && contended >= i) {
struct drm_gem_object *contended_obj;
contended_obj = job->bos[contended].obj;
ww_mutex_unlock(&contended_obj->resv->lock);
}
if (ret == -EDEADLK) {
contended = i;
goto retry;
}
ww_acquire_done(acquire_ctx);
return ret;
}
- }
- ww_acquire_done(acquire_ctx);
- return 0;
+}
This looks like a copy of drm_gem_lock_reservations(). The only reason for it as far as I can see is because we now have an array of struct panfrost_job_bo_desc rather than a direct array of struct drm_gem_object. I'm not sure having everything neatly in one structure is worth this cost?
+static void panfrost_job_unlock_bos(struct panfrost_job *job,
struct ww_acquire_ctx *acquire_ctx)
+{
- int i;
- for (i = 0; i < job->bo_count; i++)
ww_mutex_unlock(&job->bos[i].obj->resv->lock);
- ww_acquire_fini(acquire_ctx);
}> int panfrost_job_push(struct panfrost_job *job) @@ -223,8 +315,7 @@ int panfrost_job_push(struct panfrost_job *job)
mutex_lock(&pfdev->sched_lock);
- ret = drm_gem_lock_reservations(job->bos, job->bo_count,
&acquire_ctx);
- ret = panfrost_job_lock_bos(job, &acquire_ctx); if (ret) { mutex_unlock(&pfdev->sched_lock); return ret;
@@ -240,18 +331,16 @@ int panfrost_job_push(struct panfrost_job *job)
kref_get(&job->refcount); /* put by scheduler job completion */
- panfrost_acquire_object_fences(job->bos, job->bo_count,
job->implicit_fences);
panfrost_acquire_object_fences(job);
drm_sched_entity_push_job(&job->base, entity);
mutex_unlock(&pfdev->sched_lock);
- panfrost_attach_object_fences(job->bos, job->bo_count,
job->render_done_fence);
- panfrost_attach_object_fences(job);
unlock:
- drm_gem_unlock_reservations(job->bos, job->bo_count, &acquire_ctx);
panfrost_job_unlock_bos(job, &acquire_ctx);
return ret;
} @@ -267,20 +356,22 @@ static void panfrost_job_cleanup(struct kref *ref) dma_fence_put(job->in_fences[i]); kvfree(job->in_fences); }
- if (job->implicit_fences) {
for (i = 0; i < job->bo_count; i++)
dma_fence_put(job->implicit_fences[i]);
kvfree(job->implicit_fences);
- for (i = 0; i < job->bo_count; i++) {
unsigned int j;
dma_fence_put(job->bos[i].excl);
for (j = 0; j < job->bos[i].shared_count; j++)
dma_fence_put(job->bos[i].shared[j]); }
- dma_fence_put(job->done_fence); dma_fence_put(job->render_done_fence);
- if (job->bos) {
for (i = 0; i < job->bo_count; i++)
drm_gem_object_put_unlocked(job->bos[i]);
kvfree(job->bos);
- }
for (i = 0; i < job->bo_count; i++)
drm_gem_object_put_unlocked(job->bos[i].obj);
kvfree(job->bos); kfree(job);
}
@@ -314,11 +405,22 @@ static struct dma_fence *panfrost_job_dependency(struct drm_sched_job *sched_job } }
- /* Implicit fences, max. one per BO */
- /* Implicit fences */ for (i = 0; i < job->bo_count; i++) {
if (job->implicit_fences[i]) {
fence = job->implicit_fences[i];
job->implicit_fences[i] = NULL;
unsigned int j;
if (job->bos[i].excl) {
fence = job->bos[i].excl;
job->bos[i].excl = NULL;
return fence;
}
for (j = 0; j < job->bos[i].shared_count; j++) {
if (!job->bos[i].shared[j])
continue;
fence = job->bos[i].shared[j];
job->bos[i].shared[j] = NULL; return fence; } }
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.h b/drivers/gpu/drm/panfrost/panfrost_job.h index 62454128a792..53440640a6e3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.h +++ b/drivers/gpu/drm/panfrost/panfrost_job.h @@ -11,6 +11,14 @@ struct panfrost_device; struct panfrost_gem_object; struct panfrost_file_priv;
+struct panfrost_job_bo_desc {
- struct drm_gem_object *obj;
- struct dma_fence *excl;
- struct dma_fence **shared;
- unsigned int shared_count;
- u32 flags;
+};
struct panfrost_job { struct drm_sched_job base;
@@ -31,8 +39,7 @@ struct panfrost_job { __u32 flush_id;
/* Exclusive fences we have taken from the BOs to wait for */
Comment needs updating
- struct dma_fence **implicit_fences;
- struct drm_gem_object **bos;
struct panfrost_job_bo_desc *bos; u32 bo_count;
/* Fence to be signaled by drm-sched once its done with the job */
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index ec19db1eead8..029c6ae1b1f0 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -40,6 +40,28 @@ extern "C" { #define DRM_IOCTL_PANFROST_PERFCNT_DUMP DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_PERFCNT_DUMP, struct drm_panfrost_perfcnt_dump)
#define PANFROST_JD_REQ_FS (1 << 0)
+#define PANFROST_SUBMIT_BO_READ (1 << 0) +#define PANFROST_SUBMIT_BO_WRITE (1 << 1) +#define PANFROST_SUBMIT_BO_RW (PANFROST_SUBMIT_BO_READ | \
PANFROST_SUBMIT_BO_WRITE)
+#define PANFROST_SUBMIT_BO_NO_IMPLICIT_FENCE (1 << 2) +#define PANFROST_SUBMIT_BO_VALID_FLAGS \
- (PANFROST_SUBMIT_BO_RW | PANFROST_SUBMIT_BO_NO_IMPLICIT_FENCE)
+/**
- struct drm_panfrost_submit_bo - BO descriptor passed to the submit ioctl.
- Useful to give detailed information about BOs used by a given job.
- @handle: the GEM handle
- @flags: a combination of PANFROST_SUBMIT_BO_*
- */
+struct drm_panfrost_submit_bo {
- __u32 handle;
- __u32 flags;
+};
/**
- struct drm_panfrost_submit - ioctl argument for submitting commands to the 3D
- engine.
@@ -68,6 +90,25 @@ struct drm_panfrost_submit {
/** A combination of PANFROST_JD_REQ_* */ __u32 requirements;
- /**
* Pointer to a u32 array of &drm_panfrost_submit_bo_desc objects. This
* field is meant to replace &drm_panfrost_submit.bo_handles which did
* not provide enough information to relax synchronization between
* jobs that only read the BO they share. When both
* &drm_panfrost_submit.bo_handles and &drm_panfrost_submit.bo_descs
* are provided, drm_panfrost_submit.bo_handles is ignored.
*/
- __u64 bo_descs;
- /**
* Number of BO descriptors passed in (size is that times
* sizeof(drm_panfrost_submit_bo_desc)).
*/
- __u32 bo_desc_count;
We don't really need another count field. bo_handle_count could be re-used. Indeed this could even be handled with just a flags field with a new flag specifying that bo_handles no longer points to handles but to bo_desc objects instead.
Steve
- /** Padding, must be 0. */
- __u32 pad;
};
/**
- /**
* Pointer to a u32 array of &drm_panfrost_submit_bo_desc objects. This
* field is meant to replace &drm_panfrost_submit.bo_handles which did
* not provide enough information to relax synchronization between
* jobs that only read the BO they share. When both
* &drm_panfrost_submit.bo_handles and &drm_panfrost_submit.bo_descs
* are provided, drm_panfrost_submit.bo_handles is ignored.
*/
- __u64 bo_descs;
- /**
* Number of BO descriptors passed in (size is that times
* sizeof(drm_panfrost_submit_bo_desc)).
*/
- __u32 bo_desc_count;
We don't really need another count field. bo_handle_count could be re-used. Indeed this could even be handled with just a flags field with a new flag specifying that bo_handles no longer points to handles but to bo_desc objects instead.
This seems like the cleanest approach (also bumping the ABI version).
On Fri, 13 Sep 2019 14:44:14 +0100 Steven Price steven.price@arm.com wrote:
On 13/09/2019 12:17, Boris Brezillon wrote:
The READ/WRITE flags are particularly useful if we want to avoid serialization of jobs that read from the same BO but never write to it. The NO_IMPLICIT_FENCE might be useful when the user knows the BO is shared but jobs are using different portions of the buffer.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com
Good feature - we could do with an (easy) way of the user driver detecting this - so it might be worth bumping the driver version for this?
I was trying to support this feature without adding a new feature flag or bumping the minor version, but I guess there's no good reason to do that.
Some more comments below.
drivers/gpu/drm/panfrost/panfrost_drv.c | 72 +++++++++-- drivers/gpu/drm/panfrost/panfrost_job.c | 164 +++++++++++++++++++----- drivers/gpu/drm/panfrost/panfrost_job.h | 11 +- include/uapi/drm/panfrost_drm.h | 41 ++++++ 4 files changed, 247 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index d74442d71048..08082fd557c3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -119,20 +119,76 @@ panfrost_lookup_bos(struct drm_device *dev, struct drm_panfrost_submit *args, struct panfrost_job *job) {
- job->bo_count = args->bo_handle_count;
- struct drm_panfrost_submit_bo *bo_descs = NULL;
- u32 *handles = NULL;
- u32 i, bo_count;
- int ret = 0;
- if (!job->bo_count)
- bo_count = args->bo_desc_count ?
args->bo_desc_count : args->bo_handle_count;
- if (!bo_count) return 0;
- job->implicit_fences = kvmalloc_array(job->bo_count,
sizeof(struct dma_fence *),
- job->bos = kvmalloc_array(bo_count, sizeof(*job->bos), GFP_KERNEL | __GFP_ZERO);
- if (!job->implicit_fences)
- if (!job->bos) return -ENOMEM;
- return drm_gem_objects_lookup(file_priv,
(void __user *)(uintptr_t)args->bo_handles,
job->bo_count, &job->bos);
- job->bo_count = bo_count;
- bo_descs = kvmalloc_array(bo_count, sizeof(*bo_descs),
GFP_KERNEL | __GFP_ZERO);
- if (!bo_descs) {
ret = -ENOMEM;
goto out;
This can be just "return -ENOMEM" - both handles and bo_descs will be NULL.
Will fix that.
- }
- if (!args->bo_desc_count) {
handles = kvmalloc_array(bo_count, sizeof(*handles),
GFP_KERNEL);
if (!handles) {
ret =-ENOMEM;
goto out;
}
if (copy_from_user(handles,
(void __user *)(uintptr_t)args->bo_handles,
job->bo_count * sizeof(*handles))) {
ret = -EFAULT;
goto out;
}
for (i = 0; i < job->bo_count; i++) {
bo_descs[i].handle = handles[i];
bo_descs[i].flags = PANFROST_SUBMIT_BO_WRITE |
PANFROST_SUBMIT_BO_READ;
You can use PANFROST_SUBMIT_BO_RW here.
That as well.
}
- } else if (copy_from_user(bo_descs,
(void __user *)(uintptr_t)args->bo_descs,
job->bo_count * sizeof(*bo_descs))) {
ret = -EFAULT;
goto out;
- }
- for (i = 0; i < job->bo_count; i++) {
if ((bo_descs[i].flags & ~PANFROST_SUBMIT_BO_VALID_FLAGS) ||
!(bo_descs[i].flags & PANFROST_SUBMIT_BO_RW)) {
ret = -EINVAL;
goto out;
}
job->bos[i].flags = bo_descs[i].flags;
job->bos[i].obj = drm_gem_object_lookup(file_priv,
bo_descs[i].handle);
if (!job->bos[i].obj) {
ret = -ENOENT;
goto out;
}
- }
+out:
- kvfree(handles);
- kvfree(bo_descs);
- return ret;
}
/** diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 05c85f45a0de..e4b74fde9339 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -193,24 +193,116 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js) pm_runtime_put_autosuspend(pfdev->dev); }
-static void panfrost_acquire_object_fences(struct drm_gem_object **bos,
int bo_count,
struct dma_fence **implicit_fences)
+static int panfrost_acquire_object_fences(struct panfrost_job *job) {
- int i;
- int i, ret;
- for (i = 0; i < bo_count; i++)
implicit_fences[i] = dma_resv_get_excl_rcu(bos[i]->resv);
- for (i = 0; i < job->bo_count; i++) {
struct panfrost_job_bo_desc *bo = &job->bos[i];
struct dma_resv *robj = bo->obj->resv;
if (!(job->bos[i].flags & PANFROST_SUBMIT_BO_WRITE)) {
ret = dma_resv_reserve_shared(robj, 1);
if (ret)
return ret;
}
if (bo->flags & PANFROST_SUBMIT_BO_NO_IMPLICIT_FENCE)
continue;
if (bo->flags & PANFROST_SUBMIT_BO_WRITE) {
ret = dma_resv_get_fences_rcu(robj, &bo->excl,
&bo->shared_count,
&bo->shared);
if (ret)
return ret;
} else {
bo->excl = dma_resv_get_excl_rcu(robj);
}
The implementation of NO_IMPLICIT_FENCE seems a bit strange to me: READ | NO_IMPLICIT_FENCE still reserves space for a shared fence. I don't understand why.
The NO_IMPLICIT_FENCE flag is telling the core we don't want to wait on the fence installed by other jobs on this BO, but other might want to wait on our render-done fence (when CPU accesses are required, or simply because other jobs didn't pass the NO_IMPLICIT fence flag).
- }
- return 0;
}
-static void panfrost_attach_object_fences(struct drm_gem_object **bos,
int bo_count,
struct dma_fence *fence)
+static void panfrost_attach_object_fences(struct panfrost_job *job) { int i;
- for (i = 0; i < bo_count; i++)
dma_resv_add_excl_fence(bos[i]->resv, fence);
- for (i = 0; i < job->bo_count; i++) {
struct drm_gem_object *obj = job->bos[i].obj;
if (job->bos[i].flags & PANFROST_SUBMIT_BO_WRITE)
dma_resv_add_excl_fence(obj->resv,
job->render_done_fence);
else
dma_resv_add_shared_fence(obj->resv,
job->render_done_fence);
- }
+}
+static int panfrost_job_lock_bos(struct panfrost_job *job,
struct ww_acquire_ctx *acquire_ctx)
+{
- int contended = -1;
- int i, ret;
- ww_acquire_init(acquire_ctx, &reservation_ww_class);
+retry:
- if (contended != -1) {
struct drm_gem_object *obj = job->bos[contended].obj;
ret = ww_mutex_lock_slow_interruptible(&obj->resv->lock,
acquire_ctx);
dma_resv_lock_slot_interruptible()?
Yep (I started this on an older kernel version where dma_resv_ helpers didn't exist and apparently forgot to convert a few things when rebasing).
if (ret) {
ww_acquire_done(acquire_ctx);
return ret;
}
- }
- for (i = 0; i < job->bo_count; i++) {
if (i == contended)
continue;
ret = ww_mutex_lock_interruptible(&job->bos[i].obj->resv->lock,
acquire_ctx);
dma_resv_lock_interruptible()?
Ditto
if (ret) {
int j;
for (j = 0; j < i; j++)
ww_mutex_unlock(&job->bos[j].obj->resv->lock);
if (contended != -1 && contended >= i) {
struct drm_gem_object *contended_obj;
contended_obj = job->bos[contended].obj;
ww_mutex_unlock(&contended_obj->resv->lock);
}
if (ret == -EDEADLK) {
contended = i;
goto retry;
}
ww_acquire_done(acquire_ctx);
return ret;
}
- }
- ww_acquire_done(acquire_ctx);
- return 0;
+}
This looks like a copy of drm_gem_lock_reservations(). The only reason for it as far as I can see is because we now have an array of struct panfrost_job_bo_desc rather than a direct array of struct drm_gem_object. I'm not sure having everything neatly in one structure is worth this cost?
I really like the idea of having all BO-related fields put in a separate structure, but okay.
/**
- struct drm_panfrost_submit - ioctl argument for submitting commands to the 3D
- engine.
@@ -68,6 +90,25 @@ struct drm_panfrost_submit {
/** A combination of PANFROST_JD_REQ_* */ __u32 requirements;
- /**
* Pointer to a u32 array of &drm_panfrost_submit_bo_desc objects. This
* field is meant to replace &drm_panfrost_submit.bo_handles which did
* not provide enough information to relax synchronization between
* jobs that only read the BO they share. When both
* &drm_panfrost_submit.bo_handles and &drm_panfrost_submit.bo_descs
* are provided, drm_panfrost_submit.bo_handles is ignored.
*/
- __u64 bo_descs;
- /**
* Number of BO descriptors passed in (size is that times
* sizeof(drm_panfrost_submit_bo_desc)).
*/
- __u32 bo_desc_count;
We don't really need another count field. bo_handle_count could be re-used. Indeed this could even be handled with just a flags field with a new flag specifying that bo_handles no longer points to handles but to bo_desc objects instead.
As said above, I was trying to avoid bumping the minor version or adding a feature flag, hence the decision to add 2 separate fields for the BO desc array. I agree, it's probably not the best solution, so I'll add a new _REQ_ flag and bump the minor version as you suggest.
On Fri, Sep 13, 2019 at 8:44 AM Steven Price steven.price@arm.com wrote:
On 13/09/2019 12:17, Boris Brezillon wrote:
The READ/WRITE flags are particularly useful if we want to avoid serialization of jobs that read from the same BO but never write to it. The NO_IMPLICIT_FENCE might be useful when the user knows the BO is shared but jobs are using different portions of the buffer.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com
Good feature - we could do with an (easy) way of the user driver detecting this - so it might be worth bumping the driver version for this?
Some more comments below.
drivers/gpu/drm/panfrost/panfrost_drv.c | 72 +++++++++-- drivers/gpu/drm/panfrost/panfrost_job.c | 164 +++++++++++++++++++----- drivers/gpu/drm/panfrost/panfrost_job.h | 11 +- include/uapi/drm/panfrost_drm.h | 41 ++++++ 4 files changed, 247 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index d74442d71048..08082fd557c3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -119,20 +119,76 @@ panfrost_lookup_bos(struct drm_device *dev, struct drm_panfrost_submit *args, struct panfrost_job *job) {
job->bo_count = args->bo_handle_count;
struct drm_panfrost_submit_bo *bo_descs = NULL;
u32 *handles = NULL;
u32 i, bo_count;
int ret = 0;
if (!job->bo_count)
bo_count = args->bo_desc_count ?
args->bo_desc_count : args->bo_handle_count;
if (!bo_count) return 0;
job->implicit_fences = kvmalloc_array(job->bo_count,
sizeof(struct dma_fence *),
job->bos = kvmalloc_array(bo_count, sizeof(*job->bos), GFP_KERNEL | __GFP_ZERO);
if (!job->implicit_fences)
if (!job->bos) return -ENOMEM;
return drm_gem_objects_lookup(file_priv,
(void __user *)(uintptr_t)args->bo_handles,
job->bo_count, &job->bos);
job->bo_count = bo_count;
bo_descs = kvmalloc_array(bo_count, sizeof(*bo_descs),
GFP_KERNEL | __GFP_ZERO);
if (!bo_descs) {
ret = -ENOMEM;
goto out;
This can be just "return -ENOMEM" - both handles and bo_descs will be NULL.
}
if (!args->bo_desc_count) {
handles = kvmalloc_array(bo_count, sizeof(*handles),
GFP_KERNEL);
if (!handles) {
ret = -ENOMEM;
goto out;
}
if (copy_from_user(handles,
(void __user *)(uintptr_t)args->bo_handles,
job->bo_count * sizeof(*handles))) {
ret = -EFAULT;
goto out;
}
for (i = 0; i < job->bo_count; i++) {
bo_descs[i].handle = handles[i];
bo_descs[i].flags = PANFROST_SUBMIT_BO_WRITE |
PANFROST_SUBMIT_BO_READ;
You can use PANFROST_SUBMIT_BO_RW here.
}
} else if (copy_from_user(bo_descs,
(void __user *)(uintptr_t)args->bo_descs,
job->bo_count * sizeof(*bo_descs))) {
ret = -EFAULT;
goto out;
}
for (i = 0; i < job->bo_count; i++) {
if ((bo_descs[i].flags & ~PANFROST_SUBMIT_BO_VALID_FLAGS) ||
!(bo_descs[i].flags & PANFROST_SUBMIT_BO_RW)) {
ret = -EINVAL;
goto out;
}
job->bos[i].flags = bo_descs[i].flags;
job->bos[i].obj = drm_gem_object_lookup(file_priv,
bo_descs[i].handle);
if (!job->bos[i].obj) {
ret = -ENOENT;
goto out;
}
}
+out:
kvfree(handles);
kvfree(bo_descs);
return ret;
}
/** diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 05c85f45a0de..e4b74fde9339 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -193,24 +193,116 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js) pm_runtime_put_autosuspend(pfdev->dev); }
-static void panfrost_acquire_object_fences(struct drm_gem_object **bos,
int bo_count,
struct dma_fence **implicit_fences)
+static int panfrost_acquire_object_fences(struct panfrost_job *job) {
int i;
int i, ret;
for (i = 0; i < bo_count; i++)
implicit_fences[i] = dma_resv_get_excl_rcu(bos[i]->resv);
for (i = 0; i < job->bo_count; i++) {
struct panfrost_job_bo_desc *bo = &job->bos[i];
struct dma_resv *robj = bo->obj->resv;
if (!(job->bos[i].flags & PANFROST_SUBMIT_BO_WRITE)) {
ret = dma_resv_reserve_shared(robj, 1);
if (ret)
return ret;
}
if (bo->flags & PANFROST_SUBMIT_BO_NO_IMPLICIT_FENCE)
continue;
if (bo->flags & PANFROST_SUBMIT_BO_WRITE) {
ret = dma_resv_get_fences_rcu(robj, &bo->excl,
&bo->shared_count,
&bo->shared);
if (ret)
return ret;
} else {
bo->excl = dma_resv_get_excl_rcu(robj);
}
The implementation of NO_IMPLICIT_FENCE seems a bit strange to me: READ | NO_IMPLICIT_FENCE still reserves space for a shared fence. I don't understand why.
}
return 0;
}
-static void panfrost_attach_object_fences(struct drm_gem_object **bos,
int bo_count,
struct dma_fence *fence)
+static void panfrost_attach_object_fences(struct panfrost_job *job) { int i;
for (i = 0; i < bo_count; i++)
dma_resv_add_excl_fence(bos[i]->resv, fence);
for (i = 0; i < job->bo_count; i++) {
struct drm_gem_object *obj = job->bos[i].obj;
if (job->bos[i].flags & PANFROST_SUBMIT_BO_WRITE)
dma_resv_add_excl_fence(obj->resv,
job->render_done_fence);
else
dma_resv_add_shared_fence(obj->resv,
job->render_done_fence);
}
+}
+static int panfrost_job_lock_bos(struct panfrost_job *job,
struct ww_acquire_ctx *acquire_ctx)
+{
int contended = -1;
int i, ret;
ww_acquire_init(acquire_ctx, &reservation_ww_class);
+retry:
if (contended != -1) {
struct drm_gem_object *obj = job->bos[contended].obj;
ret = ww_mutex_lock_slow_interruptible(&obj->resv->lock,
acquire_ctx);
dma_resv_lock_slot_interruptible()?
if (ret) {
ww_acquire_done(acquire_ctx);
return ret;
}
}
for (i = 0; i < job->bo_count; i++) {
if (i == contended)
continue;
ret = ww_mutex_lock_interruptible(&job->bos[i].obj->resv->lock,
acquire_ctx);
dma_resv_lock_interruptible()?
if (ret) {
int j;
for (j = 0; j < i; j++)
ww_mutex_unlock(&job->bos[j].obj->resv->lock);
if (contended != -1 && contended >= i) {
struct drm_gem_object *contended_obj;
contended_obj = job->bos[contended].obj;
ww_mutex_unlock(&contended_obj->resv->lock);
}
if (ret == -EDEADLK) {
contended = i;
goto retry;
}
ww_acquire_done(acquire_ctx);
return ret;
}
}
ww_acquire_done(acquire_ctx);
return 0;
+}
This looks like a copy of drm_gem_lock_reservations(). The only reason for it as far as I can see is because we now have an array of struct panfrost_job_bo_desc rather than a direct array of struct drm_gem_object. I'm not sure having everything neatly in one structure is worth this cost?
I'm not thrilled about this either. If not a separate array, we could change the common code to work on a common struct instead.
To put it another way, this is all copy-n-paste from elsewhere that I don't really understand and don't want to maintain in the driver.
Rob
On Fri, Sep 13, 2019 at 6:17 AM Boris Brezillon boris.brezillon@collabora.com wrote:
The READ/WRITE flags are particularly useful if we want to avoid serialization of jobs that read from the same BO but never write to it.
Any data on performance differences?
The NO_IMPLICIT_FENCE might be useful when the user knows the BO is shared but jobs are using different portions of the buffer.
Why don't we add this when it is useful rather than might be?
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com
On Mon, 16 Sep 2019 17:20:28 -0500 Rob Herring robh@kernel.org wrote:
On Fri, Sep 13, 2019 at 6:17 AM Boris Brezillon boris.brezillon@collabora.com wrote:
The READ/WRITE flags are particularly useful if we want to avoid serialization of jobs that read from the same BO but never write to it.
Any data on performance differences?
Unfortunately no. When I initially added support for BO flags I thought it would fix a regression I had on one glmark2 test (ideas), but the problem ended up being something completely different (overhead of calling ioctl(WAIT_BO) on already idle BOs).
I just ran glmark2 again, and there doesn't seem to be a noticeable improvement with those 2 patches applied (and mesa patched to use the new flags). This being said, the improvement is likely to be workload dependent, so I wouldn't consider these patches as useless, but I'm fine putting them on hold until we see a real need.
Maybe Steven has some real use cases that could help outline the benefit of these patches.
The NO_IMPLICIT_FENCE might be useful when the user knows the BO is shared but jobs are using different portions of the buffer.
Why don't we add this when it is useful rather than might be?
I don't have a need for that one yet, but etanviv has it in place so I thought I'd add both at the same time. Note that it could help us reduce the number of fence returned by panfrost_job_dependency(), but I'm not sure it makes a difference in terms of perfs.
On 17/09/2019 08:15, Boris Brezillon wrote:
On Mon, 16 Sep 2019 17:20:28 -0500 Rob Herring robh@kernel.org wrote:
On Fri, Sep 13, 2019 at 6:17 AM Boris Brezillon boris.brezillon@collabora.com wrote:
The READ/WRITE flags are particularly useful if we want to avoid serialization of jobs that read from the same BO but never write to it.
Any data on performance differences?
Unfortunately no. When I initially added support for BO flags I thought it would fix a regression I had on one glmark2 test (ideas), but the problem ended up being something completely different (overhead of calling ioctl(WAIT_BO) on already idle BOs).
I just ran glmark2 again, and there doesn't seem to be a noticeable improvement with those 2 patches applied (and mesa patched to use the new flags). This being said, the improvement is likely to be workload dependent, so I wouldn't consider these patches as useless, but I'm fine putting them on hold until we see a real need.
Maybe Steven has some real use cases that could help outline the benefit of these patches.
To be honest I don't really know. The DDK internally does track this read/write information - but then it doesn't involve the kernel in tracking it. I was presuming that Mesa (because it exports the buffer usage to the kernel) would benefit from being able to have multiple readers - for example of a buffer being used as a texture for multiple render passes. It's possible we don't see this benefit because we haven't got the queuing of jobs merged yet?
There might also be some benefits when it comes to interaction with other drivers, but I don't have any concrete examples.
The NO_IMPLICIT_FENCE might be useful when the user knows the BO is shared but jobs are using different portions of the buffer.
Why don't we add this when it is useful rather than might be?
I don't have a need for that one yet, but etanviv has it in place so I thought I'd add both at the same time. Note that it could help us reduce the number of fence returned by panfrost_job_dependency(), but I'm not sure it makes a difference in terms of perfs.
I'm not aware of any reason for NO_IMPLICIT_FENCE. I found it somewhat odd that it is effectively one way (it doesn't wait for fences, but others could be waiting on the fence added by this usage). If we don't have a use yet in Mesa then it's probably best to wait until we know how this is actually going to be used.
There is of course already the option of simply omitting the BO in the job to prevent any dependencies being created :)
Steve
dri-devel@lists.freedesktop.org