amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer uses the write hazard for timing a page flip to occur only after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we cannot be signaled before the completion of any earlier write).
Testcase: igt/amd_prime/amd-to-i915
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 70 ++++++++++++++++++++---
 1 file changed, 62 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 1c5d97f4b4dd..47e6ec5510b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>
static const struct dma_buf_ops amdgpu_dmabuf_ops;
@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
 	return ERR_PTR(ret);
 }
+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+	struct reservation_object_list *fobj;
+	struct dma_fence_array *array;
+	struct dma_fence **fences;
+	unsigned int count, i;
+
+	fobj = reservation_object_get_list(obj);
+	if (!fobj)
+		return 0;
+
+	count = !!rcu_access_pointer(obj->fence_excl);
+	count += fobj->shared_count;
+
+	fences = kmalloc_array(sizeof(*fences), count, GFP_KERNEL);
+	if (!fences)
+		return -ENOMEM;
+
+	for (i = 0; i < fobj->shared_count; i++) {
+		struct dma_fence *f =
+			rcu_dereference_protected(fobj->shared[i],
+						  reservation_object_held(obj));
+
+		fences[i] = dma_fence_get(f);
+	}
+
+	if (rcu_access_pointer(obj->fence_excl)) {
+		struct dma_fence *f =
+			rcu_dereference_protected(obj->fence_excl,
+						  reservation_object_held(obj));
+
+		fences[i] = dma_fence_get(f);
+	}
+
+	array = dma_fence_array_create(count, fences,
+				       dma_fence_context_alloc(1), 0,
+				       false);
+	if (!array)
+		goto err_fences_put;
+
+	reservation_object_add_excl_fence(obj, &array->base);
+	return 0;
+
+err_fences_put:
+	for (i = 0; i < count; i++)
+		dma_fence_put(fences[i]);
+	kfree(fences);
+	return -ENOMEM;
+}
+
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +271,18 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,
 	if (attach->dev->driver != adev->dev->driver) {
 		/*
-		 * Wait for all shared fences to complete before we switch to future
-		 * use of exclusive fence on this prime shared bo.
+		 * We only create shared fences for internal use, but importers
+		 * of the dmabuf rely on exclusive fences for implicitly
+		 * tracking write hazards. As any of the current fences may
+		 * correspond to a write, we need to convert all existing
+		 * fences on the reservation object into a single exclusive
+		 * fence.
 		 */
-		r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-							true, false,
-							MAX_SCHEDULE_TIMEOUT);
-		if (unlikely(r < 0)) {
-			DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+		reservation_object_lock(bo->tbo.resv, NULL);
+		r = __reservation_object_make_exclusive(bo->tbo.resv);
+		reservation_object_unlock(bo->tbo.resv);
+		if (r)
 			goto error_unreserve;
-		}
 	}
/* pin buffer into GTT */
On Tue, Aug 07, 2018 at 11:45:00AM +0100, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer uses the write hazard for timing a page flip to occur only after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we cannot be signaled before the completion of any earlier write).
Testcase: igt/amd_prime/amd-to-i915 Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 70 ++++++++++++++++++++--- 1 file changed, 62 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c index 1c5d97f4b4dd..47e6ec5510b6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c @@ -37,6 +37,7 @@ #include "amdgpu_display.h" #include <drm/amdgpu_drm.h> #include <linux/dma-buf.h> +#include <linux/dma-fence-array.h>
static const struct dma_buf_ops amdgpu_dmabuf_ops;
@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev, return ERR_PTR(ret); }
+static int +__reservation_object_make_exclusive(struct reservation_object *obj) +{
Why not move the helper to reservation.c, and then export the symbol for this file?
Thanks, Ray
Quoting Huang Rui (2018-08-07 11:56:24)
On Tue, Aug 07, 2018 at 11:45:00AM +0100, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer uses the write hazard for timing a page flip to occur only after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we cannot be signaled before the completion of any earlier write).
Testcase: igt/amd_prime/amd-to-i915 Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 70 ++++++++++++++++++++--- 1 file changed, 62 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c index 1c5d97f4b4dd..47e6ec5510b6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c @@ -37,6 +37,7 @@ #include "amdgpu_display.h" #include <drm/amdgpu_drm.h> #include <linux/dma-buf.h> +#include <linux/dma-fence-array.h>
static const struct dma_buf_ops amdgpu_dmabuf_ops;
@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev, return ERR_PTR(ret); }
+static int +__reservation_object_make_exclusive(struct reservation_object *obj) +{
Why not move the helper to reservation.c, and then export the symbol for this file?
I have not seen anything else that would wish to use this helper. The first task is to solve this issue here before worrying about generalisation. -Chris
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer uses the write hazard for timing a page flip to occur only after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we cannot be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve()
Testcase: igt/amd_prime/amd-to-i915
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 68 ++++++++++++++++++++---
 1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 1c5d97f4b4dd..27f639c69e35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>
static const struct dma_buf_ops amdgpu_dmabuf_ops;
@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
 	return ERR_PTR(ret);
 }
+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+	struct reservation_object_list *fobj;
+	struct dma_fence_array *array;
+	struct dma_fence **fences;
+	unsigned int count, i;
+
+	fobj = reservation_object_get_list(obj);
+	if (!fobj)
+		return 0;
+
+	count = !!rcu_access_pointer(obj->fence_excl);
+	count += fobj->shared_count;
+
+	fences = kmalloc_array(sizeof(*fences), count, GFP_KERNEL);
+	if (!fences)
+		return -ENOMEM;
+
+	for (i = 0; i < fobj->shared_count; i++) {
+		struct dma_fence *f =
+			rcu_dereference_protected(fobj->shared[i],
+						  reservation_object_held(obj));
+
+		fences[i] = dma_fence_get(f);
+	}
+
+	if (rcu_access_pointer(obj->fence_excl)) {
+		struct dma_fence *f =
+			rcu_dereference_protected(obj->fence_excl,
+						  reservation_object_held(obj));
+
+		fences[i] = dma_fence_get(f);
+	}
+
+	array = dma_fence_array_create(count, fences,
+				       dma_fence_context_alloc(1), 0,
+				       false);
+	if (!array)
+		goto err_fences_put;
+
+	reservation_object_add_excl_fence(obj, &array->base);
+	return 0;
+
+err_fences_put:
+	for (i = 0; i < count; i++)
+		dma_fence_put(fences[i]);
+	kfree(fences);
+	return -ENOMEM;
+}
+
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +271,16 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,
 	if (attach->dev->driver != adev->dev->driver) {
 		/*
-		 * Wait for all shared fences to complete before we switch to future
-		 * use of exclusive fence on this prime shared bo.
+		 * We only create shared fences for internal use, but importers
+		 * of the dmabuf rely on exclusive fences for implicitly
+		 * tracking write hazards. As any of the current fences may
+		 * correspond to a write, we need to convert all existing
+		 * fences on the reservation object into a single exclusive
+		 * fence.
 		 */
-		r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-							true, false,
-							MAX_SCHEDULE_TIMEOUT);
-		if (unlikely(r < 0)) {
-			DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+		r = __reservation_object_make_exclusive(bo->tbo.resv);
+		if (r)
 			goto error_unreserve;
-		}
 	}
/* pin buffer into GTT */
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer uses the write hazard for timing a page flip to occur only after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we cannot be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve()
Testcase: igt/amd_prime/amd-to-i915
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 68 ++++++++++++++++++++---
 1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 1c5d97f4b4dd..576a83946c25 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>
static const struct dma_buf_ops amdgpu_dmabuf_ops;
@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
 	return ERR_PTR(ret);
 }
+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+	struct reservation_object_list *fobj;
+	struct dma_fence_array *array;
+	struct dma_fence **fences;
+	unsigned int count, i;
+
+	fobj = reservation_object_get_list(obj);
+	if (!fobj)
+		return 0;
+
+	count = !!rcu_access_pointer(obj->fence_excl);
+	count += fobj->shared_count;
+
+	fences = kmalloc_array(count, sizeof(*fences), GFP_KERNEL);
+	if (!fences)
+		return -ENOMEM;
+
+	for (i = 0; i < fobj->shared_count; i++) {
+		struct dma_fence *f =
+			rcu_dereference_protected(fobj->shared[i],
+						  reservation_object_held(obj));
+
+		fences[i] = dma_fence_get(f);
+	}
+
+	if (rcu_access_pointer(obj->fence_excl)) {
+		struct dma_fence *f =
+			rcu_dereference_protected(obj->fence_excl,
+						  reservation_object_held(obj));
+
+		fences[i] = dma_fence_get(f);
+	}
+
+	array = dma_fence_array_create(count, fences,
+				       dma_fence_context_alloc(1), 0,
+				       false);
+	if (!array)
+		goto err_fences_put;
+
+	reservation_object_add_excl_fence(obj, &array->base);
+	return 0;
+
+err_fences_put:
+	for (i = 0; i < count; i++)
+		dma_fence_put(fences[i]);
+	kfree(fences);
+	return -ENOMEM;
+}
+
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +271,16 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,
 	if (attach->dev->driver != adev->dev->driver) {
 		/*
-		 * Wait for all shared fences to complete before we switch to future
-		 * use of exclusive fence on this prime shared bo.
+		 * We only create shared fences for internal use, but importers
+		 * of the dmabuf rely on exclusive fences for implicitly
+		 * tracking write hazards. As any of the current fences may
+		 * correspond to a write, we need to convert all existing
+		 * fences on the reservation object into a single exclusive
+		 * fence.
 		 */
-		r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-							true, false,
-							MAX_SCHEDULE_TIMEOUT);
-		if (unlikely(r < 0)) {
-			DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+		r = __reservation_object_make_exclusive(bo->tbo.resv);
+		if (r)
 			goto error_unreserve;
-		}
 	}
/* pin buffer into GTT */
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially, Christian.
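For illustration, a rough sketch of what such a flag could look like, reusing the DMA_FENCE_FLAG_USER_BITS space that struct dma_fence already reserves for driver use. The flag name and the wait helper below are hypothetical (nothing like them exists in the kernel today), and the helper assumes the caller holds the reservation lock:

#include <linux/dma-fence.h>
#include <linux/reservation.h>

/* Hypothetical: mark a fence as a write before adding it as a shared fence. */
#define DMA_FENCE_FLAG_WRITE_BIT	DMA_FENCE_FLAG_USER_BITS

static void mark_fence_as_write(struct dma_fence *fence)
{
	set_bit(DMA_FENCE_FLAG_WRITE_BIT, &fence->flags);
}

/* Hypothetical importer-side helper: wait only on shared fences flagged as writes. */
static long reservation_object_wait_writes(struct reservation_object *obj,
					   bool intr, long timeout)
{
	struct reservation_object_list *fobj =
		reservation_object_get_list(obj);
	unsigned int i;

	for (i = 0; fobj && i < fobj->shared_count; i++) {
		struct dma_fence *f =
			rcu_dereference_protected(fobj->shared[i],
						  reservation_object_held(obj));

		if (!test_bit(DMA_FENCE_FLAG_WRITE_BIT, &f->flags))
			continue;

		timeout = dma_fence_wait_timeout(f, intr, timeout);
		if (timeout <= 0)
			break;
	}

	return timeout;
}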
On Tue, Aug 07, 2018 at 01:23:19PM +0200, Christian König wrote:
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially,
For shared buffers in an implicit syncing setup like dri2 the exclusive fence _is_ your write fence. That's how this stuff works. Note it's only for implicit fencing for dri2 shared buffers. i915 lies as much as everyone else for explicit syncing.
Now as long as you only share within amdgpu you can do whatever you want to, but for anything shared outside of amdgpu, if the buffer isn't shared through an explicit syncing protocol, then you do have to set the exclusive fence. That's at least how this stuff works right now with all other drivers.
For i915 we had to do some uapi trickery to make this work in all cases, since only really userspace knows whether a write should be a shared or exclusive write. But that's strictly an issue limited to your driver, and doesn't need changes to reservation object all throughout the stack.
Aside: If you want to attach multiple write fences to the exclusive fence, that should be doable with the array fences. -Daniel
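For reference, a minimal sketch of that aside, assuming the driver has already collected the fences it knows to be writes into a kmalloc'ed array with at least one entry; it mirrors what the patch above does for all fences, the helper name is made up for illustration, and the caller is assumed to hold the reservation lock:

/*
 * Fold several known write fences into a single exclusive fence.
 * dma_fence_array_create() takes ownership of the kmalloc'ed
 * write_fences array and of the fence references it holds.
 */
static int set_exclusive_from_writes(struct reservation_object *obj,
				     struct dma_fence **write_fences,
				     unsigned int num_writes)
{
	struct dma_fence_array *array;

	array = dma_fence_array_create(num_writes, write_fences,
				       dma_fence_context_alloc(1), 0,
				       false);
	if (!array)
		return -ENOMEM;

	reservation_object_add_excl_fence(obj, &array->base);
	/* the reservation object now holds its own reference */
	dma_fence_put(&array->base);
	return 0;
}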
On Tue, Aug 07, 2018 at 02:43:22PM +0200, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 01:23:19PM +0200, Christian König wrote:
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially,
For shared buffers in an implicit syncing setup like dri2 the exclusive fence _is_ your write fence. That's how this stuff works. Note it's only for implicit fencing for dri2 shared buffers. i915 lies as much as everyone else for explicit syncing.
Just realized that dri3 is exactly in the same boat still afaiui. Anyway, tldr here isn't that i915 is the exception, but amdgpu. If you have a shared buffer used in an implicitly synced protocol, you have to set the exclusive fence to guard writes. How you do that is up to you really. And if a bitfield in reservation_object would help in tracking these I guess we could add that, but I think that could as well just be put into an amdgpu structure somewhere. And would be less confusing for everyone else if it's not in struct reservation_object. -Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
On 07.08.2018 at 14:43, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 01:23:19PM +0200, Christian König wrote:
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially,
For shared buffers in an implicit syncing setup like dri2 the exclusive fence _is_ your write fence.
And exactly that is absolutely not correct if you ask me.
The exclusive fence is for two use cases, the first one is for drivers which don't care about concurrent accesses and only use serialized accesses and the second is for kernel uses under the hood of userspace, evictions, buffer moves etc..
What i915 does is to abuse that concept for write once read many situations, and that in turn is the bug we need to fix here.
That's how this stuff works. Note it's only for implicit fencing for dri2 shared buffers. i915 lies as much as everyone else for explicit syncing.
That is really really ugly and should be fixed instead.
Now as long as you only share within amdgpu you can do whatever you want to, but for anything shared outside of amdgpu, if the buffer isn't shared through an explicit syncing protocol, then you do have to set the exclusive fence. That's at least how this stuff works right now with all other drivers.
For i915 we had to do some uapi trickery to make this work in all cases, since only really userspace knows whether a write should be a shared or exclusive write. But that's strictly an issue limited to your driver, and doesn't need changes to reservation object all throughout the stack.
You don't need to change the reservation object all throughout the stack. A simple flag noting whether a shared fence is a write or not should be doable.
Give me a day or two to prototype that, Christian.
On Tue, Aug 07, 2018 at 02:51:50PM +0200, Christian König wrote:
On 07.08.2018 at 14:43, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 01:23:19PM +0200, Christian König wrote:
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially,
For shared buffers in an implicit syncing setup like dri2 the exclusive fence _is_ your write fence.
And exactly that is absolutely not correct if you ask me.
The exclusive fence is for two use cases, the first one is for drivers which don't care about concurrent accesses and only use serialized accesses and the second is for kernel uses under the hood of userspace, evictions, buffer moves etc..
What i915 does is to abuse that concept for write once read many situations, and that in turn is the bug we need to fix here.
Again, the exclusive fence was added for implicit sync usage like dri2/3. _Not_ for your own buffer manager. If you want to separate these two usages, then I guess we can do that, but claiming that i915 is the odd one out just ain't true. You're arguing against all other kms drivers we have here.
That's how this stuff works. Note it's only for implicit fencing for dri2 shared buffers. i915 lies as much as everyone else for explicit syncing.
That is really really ugly and should be fixed instead.
It works and is uapi now. What's the gain in "fixing" it? And this was discussed at length when intel and freedreno folks talked about implementing explicit sync and android sync pts.
Now as long as you only share within amdgpu you can do whatever you want to, but for anything shared outside of amdgpu, if the buffer isn't shared through an explicit syncing protocol, then you do have to set the exclusive fence. That's at least how this stuff works right now with all other drivers.
For i915 we had to do some uapi trickery to make this work in all cases, since only really userspace knows whether a write should be a shared or exclusive write. But that's strictly an issue limited to your driver, and doesn't need changes to reservation object all throughout the stack.
You don't need to change the reservation object all throughout the stack. A simple flag noting whether a shared fence is a write or not should be doable.
Give me a day or two to prototype that,
See my other reply, I think that's only needed for amdgpu internal book-keeping, so that you can create the correct exclusive fence on first export. But yeah, adding a bitfield to track which fences need to become exclusive shouldn't be tricky to implement for amdgpu. -Daniel
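As a sketch of that book-keeping idea (the structure, names and the fixed bound below are invented purely for illustration, and slot reuse when the shared-fence list is compacted is glossed over): record at submission time which shared-fence slots came from writes, then consult the bitmap on first export:

#define MAX_TRACKED_FENCES	64	/* arbitrary bound for the sketch */

/* Hypothetical per-BO book-keeping, updated at command submission time. */
struct amdgpu_bo_fence_info {
	DECLARE_BITMAP(is_write, MAX_TRACKED_FENCES);
};

static void amdgpu_bo_note_shared_fence(struct amdgpu_bo_fence_info *info,
					unsigned int slot, bool is_write)
{
	if (slot >= MAX_TRACKED_FENCES)
		return;

	if (is_write)
		__set_bit(slot, info->is_write);
	else
		__clear_bit(slot, info->is_write);
}

/* On first export: how many of the current shared fences were writes? */
static unsigned int
amdgpu_bo_count_write_fences(struct reservation_object *obj,
			     const struct amdgpu_bo_fence_info *info)
{
	struct reservation_object_list *fobj =
		reservation_object_get_list(obj);
	unsigned int i, writes = 0;

	for (i = 0; fobj && i < fobj->shared_count; i++)
		if (i < MAX_TRACKED_FENCES && test_bit(i, info->is_write))
			writes++;

	return writes;
}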
On 07.08.2018 at 14:59, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 02:51:50PM +0200, Christian König wrote:
On 07.08.2018 at 14:43, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 01:23:19PM +0200, Christian König wrote:
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially,
For shared buffers in an implicit syncing setup like dri2 the exclusive fence _is_ your write fence.
And exactly that is absolutely not correct if you ask me.
The exclusive fence is for two use cases, the first one is for drivers which don't care about concurrent accesses and only use serialized accesses and the second is for kernel uses under the hood of userspace, evictions, buffer moves etc..
What i915 does is to abuse that concept for write once read many situations, and that in turn is the bug we need to fix here.
Again, the exclusive fence was added for implicit sync usage like dri2/3. _Not_ for your own buffer manager. If you want to separate these two usages, then I guess we can do that, but claiming that i915 is the odd one out just ain't true. You're arguing against all other kms drivers we have here.
No I'm not. I discussed exactly that use case with Maarten when the reservation object was introduced.
I think that Maarten named the exclusive fence "exclusive" on purpose, and not "write" fence, to make that distinction clear.
I have to admit that this wasn't really documented, but it indeed was the original purpose to get away from the idea that writes need to be exclusive.
That's how this stuff works. Note it's only for implicit fencing for dri2 shared buffers. i915 lies as much as everyone else for explicit syncing.
That is really really ugly and should be fixed instead.
It works and is uapi now. What's the gain in "fixing" it?
It allows you to implement explicit fencing in the uapi without breaking backward compatibility.
See the situation with amdgpu and intel is the same as with userspace with mixed implicit and explicit fencing.
If that isn't fixed we will run into the same problem when DRI3 gets implicit fencing and some applications start to use it while others still rely on the explicit fencing.
Regards, Christian.
On Tue, Aug 07, 2018 at 03:17:06PM +0200, Christian König wrote:
On 07.08.2018 at 14:59, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 02:51:50PM +0200, Christian König wrote:
On 07.08.2018 at 14:43, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 01:23:19PM +0200, Christian König wrote:
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially,
For shared buffers in an implicit syncing setup like dri2 the exclusive fence _is_ your write fence.
And exactly that is absolutely not correct if you ask me.
The exclusive fence is for two use cases, the first one is for drivers which don't care about concurrent accesses and only use serialized accesses and the second is for kernel uses under the hood of userspace, evictions, buffer moves etc..
What i915 does is to abuse that concept for write once read many situations, and that in turn is the bug we need to fix here.
Again, the exclusive fence was added for implicit sync usage like dri2/3. _Not_ for your own buffer manager. If you want to separate these two usages, then I guess we can do that, but claiming that i915 is the odd one out just ain't true. You're arguing against all other kms drivers we have here.
No I'm not. I discussed exactly that use case with Maarten when the reservation object was introduced.
I think that Maarten named the exclusive fence "exclusive" on purpose, and not "write" fence, to make that distinction clear.
I have to admit that this wasn't really documented, but it indeed was the original purpose to get away from the idea that writes need to be exclusive.
That's how this stuff works. Note it's only for implicit fencing for dri2 shared buffers. i915 lies as much as everyone else for explicit syncing.
That is really really ugly and should be fixed instead.
It works and is uapi now. What's the gain in "fixing" it?
It allows you to implement explicit fencing in the uapi without breaking backward compatibility.
See the situation with amdgpu and intel is the same as with userspace with mixed implicit and explicit fencing.
If that isn't fixed we will run into the same problem when DRI3 gets implicit fencing and some applications start to use it while others still rely on the explicit fencing.
I think we're a bit cross-eyed on exact semantics, but yes this is exactly the use-case. Chris Wilson's use-case tries to emulate exactly what would happen if an implicitly fenced, amdgpu-rendered buffer were displayed on an i915 output. Then you need to set the exclusive fence correctly.
And yes we called them exclusive/shared because shared fences could also be write fences. But for the implicitly synced userspace case, exclusive = the write fence.
So probably Chris' patch ends up syncing too much (since for explicit syncing you don't want to attach an exclusive fence, because userspace passes the fence around already to the atomic ioctl). But at least it's correct for the implicit case. But that's entirely an optimization problem within amdgpu.
Summary: If you have a shared buffer used in implicitly synced buffer sharing, you _must_ set the exclusive fence to cover all write access.
In any other case (not shared, or not as part of implicitly synced protocol), you can do whatever you want. So we're not conflating exclusive with write here at all. And yes this is exactly what i915 and all other drivers do - for explicit fencing we don't set the exclusive fence, but leave that all to userspace. Also, userspace tells us (because it knows how the protocol works, not the kernel) when to set an exclusive fence. For historical reasons the relevant uapi parts are called read/write, but that's just an accident of (maybe too clever) uapi reuse, not their actual semantics. -Daniel
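For readers less familiar with the importer side of this contract, the following is a minimal sketch (not taken from any particular driver) of how a consumer of an implicitly synced dmabuf honours the write hazard before using the buffer, with the reservation_object API of this era; the function name is illustrative.

    #include <linux/dma-fence.h>
    #include <linux/reservation.h>

    /*
     * Before e.g. scanning out a shared buffer, wait for the exporter's
     * write hazard, which for implicit sync is the exclusive fence on the
     * buffer's reservation object. Shared (read) fences are ignored here.
     */
    static int example_wait_for_implicit_write(struct reservation_object *resv)
    {
            struct dma_fence *excl;
            long r = 0;

            /* Returns a reference to the current exclusive fence, if any. */
            excl = reservation_object_get_excl_rcu(resv);
            if (excl) {
                    r = dma_fence_wait(excl, true);
                    dma_fence_put(excl);
            }

            return r < 0 ? r : 0;
    }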
Regards, Christian.
And this was discussed at length when intel and freedreno folks talked about implementing explicit sync and android sync pts.
Now as long as you only share within amdgpu you can do whatever you want to, but for anything shared outside of amdgpu, if the buffer isn't shared through an explicit syncing protocol, then you do have to set the exclusive fence. That's at least how this stuff works right now with all other drivers.
For i915 we had to do some uapi trickery to make this work in all cases, since only really userspace knows whether a write should be a shared or exclusive write. But that's strictly an issue limited to your driver, and doesn't need changes to the reservation object all throughout the stack.
You don't need to change the reservation object all throughout the stack. A simple flag if a shared fence is a write or not should be doable.
Give me a day or two to prototype that,
See my other reply, i think that's only needed for amdgpu internal book-keeping, so that you can create the correct exclusive fence on first export. But yeah, adding a bitfield to track which fences need to become exclusive shouldn't be a tricky solution to implement for amdgpu. -Daniel
Christian.
Aside: If you want to attach multiple write fences to the exclusive fence, that should be doable with the array fences. -Daniel
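A minimal sketch of that aside, using the existing dma_fence_array machinery; the helper name and parameters are illustrative, not from the patch. Note that dma_fence_array_create() takes over the fences[] allocation and the references held in it, and that the reservation object takes its own reference on the new exclusive fence, which is why the local reference is dropped at the end (the same detail that later becomes v4 of the patch).

    #include <linux/dma-fence-array.h>
    #include <linux/reservation.h>

    /*
     * Bundle several already-referenced write fences into one exclusive
     * fence. Caller holds the reservation lock; "writes" was allocated
     * with kmalloc_array() and is handed over to the fence array.
     */
    static int example_set_exclusive_from_writes(struct reservation_object *resv,
                                                 struct dma_fence **writes,
                                                 unsigned int num_writes)
    {
            struct dma_fence_array *array;

            array = dma_fence_array_create(num_writes, writes,
                                           dma_fence_context_alloc(1), 0,
                                           false /* signal when all signal */);
            if (!array)
                    return -ENOMEM;

            reservation_object_add_excl_fence(resv, &array->base);
            dma_fence_put(&array->base);
            return 0;
    }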
Christian.
For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve()
Testcase: igt/amd_prime/amd-to-i915 Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 68 ++++++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 1c5d97f4b4dd..576a83946c25 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>

 static const struct dma_buf_ops amdgpu_dmabuf_ops;

@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
        return ERR_PTR(ret);
 }

+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+       struct reservation_object_list *fobj;
+       struct dma_fence_array *array;
+       struct dma_fence **fences;
+       unsigned int count, i;
+
+       fobj = reservation_object_get_list(obj);
+       if (!fobj)
+               return 0;
+
+       count = !!rcu_access_pointer(obj->fence_excl);
+       count += fobj->shared_count;
+
+       fences = kmalloc_array(count, sizeof(*fences), GFP_KERNEL);
+       if (!fences)
+               return -ENOMEM;
+
+       for (i = 0; i < fobj->shared_count; i++) {
+               struct dma_fence *f =
+                       rcu_dereference_protected(fobj->shared[i],
+                                                 reservation_object_held(obj));
+
+               fences[i] = dma_fence_get(f);
+       }
+
+       if (rcu_access_pointer(obj->fence_excl)) {
+               struct dma_fence *f =
+                       rcu_dereference_protected(obj->fence_excl,
+                                                 reservation_object_held(obj));
+
+               fences[i] = dma_fence_get(f);
+       }
+
+       array = dma_fence_array_create(count, fences,
+                                      dma_fence_context_alloc(1), 0,
+                                      false);
+       if (!array)
+               goto err_fences_put;
+
+       reservation_object_add_excl_fence(obj, &array->base);
+       return 0;
+
+err_fences_put:
+       for (i = 0; i < count; i++)
+               dma_fence_put(fences[i]);
+       kfree(fences);
+       return -ENOMEM;
+}
+
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +271,16 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,

        if (attach->dev->driver != adev->dev->driver) {
                /*
-                * Wait for all shared fences to complete before we switch to future
-                * use of exclusive fence on this prime shared bo.
+                * We only create shared fences for internal use, but importers
+                * of the dmabuf rely on exclusive fences for implicitly
+                * tracking write hazards. As any of the current fences may
+                * correspond to a write, we need to convert all existing
+                * fences on the resevation object into a single exclusive
+                * fence.
                 */
-               r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-                                                       true, false,
-                                                       MAX_SCHEDULE_TIMEOUT);
-               if (unlikely(r < 0)) {
-                       DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+               r = __reservation_object_make_exclusive(bo->tbo.resv);
+               if (r)
                        goto error_unreserve;
-               }
        }

        /* pin buffer into GTT */
On 07.08.2018 at 15:37, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 03:17:06PM +0200, Christian König wrote:
On 07.08.2018 at 14:59, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 02:51:50PM +0200, Christian König wrote:
On 07.08.2018 at 14:43, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 01:23:19PM +0200, Christian König wrote:
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially,
For shared buffers in an implicit syncing setup like dri2 the exclusive fence _is_ your write fence.
And exactly that is absolutely not correct if you ask me.
The exclusive fence is for two use cases, the first one is for drivers which don't care about concurrent accesses and only use serialized accesses and the second is for kernel uses under the hood of userspace, evictions, buffer moves etc..
What i915 does is to abuse that concept for write once read many situations, and that in turn is the bug we need to fix here.
Again, the exclusive fence was added for implicit sync usage like dri2/3. _Not_ for your own buffer manager. If you want to separate these two usages, then I guess we can do that, but claiming that i915 is the odd one out just aint true. You're arguing against all other kms drivers we have here.
No I'm not. I discussed exactly that use case with Maarten when the reservation object was introduced.
I think that Maarten named the exclusive fence on purpose "exclusive" fence and not "write" fence to make that distinction clear.
I have to admit that this wasn't really documented, but it indeed was the original purpose to get away from the idea that writes need to be exclusive.
That's how this stuff works. Note it's only for implicit fencing for dri2 shared buffers. i915 lies as much as everyone else for explicit syncing.
That is really really ugly and should be fixed instead.
It works and is uapi now. What's the gain in "fixing" it?
It allows you to implement explicit fencing in the uapi without breaking backward compatibility.
See the situation with amdgpu and intel is the same as with userspace with mixed implicit and explicit fencing.
If that isn't fixed we will run into the same problem when DRI3 gets implicit fencing and some applications start to use it while others still rely on the explicit fencing.
I think we're a bit cross-eyed on exact semantics, but yes this is exactly the use-case. Chris Wilson's use-case tries to emulate exactly what would happen if an implicitly fenced amdgpu rendered buffer were to be displayed on an i915 output. Then you need to set the exclusive fence correctly.
Yes, exactly. That's what I totally agree on.
And yes we called them exclusive/shared because shared fences could also be write fences. But for the implicitly synced userspace case, exclusive = The write fence.
So probably Chris' patch ends up syncing too much (since for explicit syncing you don't want to attach an exclusive fence, because userspace passes the fence around already to the atomic ioctl). But at least it's correct for the implicit case. But that's entirely an optimization problem within amdgpu.
Completely agree as well, but Chris' patch actually just optimizes things. It is not necessary for correct operation.
See, previously we just waited for the BO to be idle; now Chris' patch collects all fences (shared and exclusive) and adds that as a single exclusive fence to avoid blocking.
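For comparison, the export-time behaviour the patch removes is a plain blocking wait for the buffer object to go idle; a sketch of that old path, with the function name made up for illustration:

    #include <linux/reservation.h>
    #include <linux/sched.h>

    /*
     * Old behaviour: block at export until every fence (shared and
     * exclusive) on the reservation object has signaled. The patch
     * replaces this with repackaging those fences as a single exclusive
     * fence, so nobody blocks here and only the importer waits, later,
     * if it actually has to.
     */
    static int example_export_wait_for_idle(struct reservation_object *resv)
    {
            long r;

            r = reservation_object_wait_timeout_rcu(resv,
                                                    true,  /* wait_all */
                                                    false, /* not interruptible */
                                                    MAX_SCHEDULE_TIMEOUT);
            return r < 0 ? r : 0;
    }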
Summary: If you have a shared buffer used in implicitly synced buffer sharing, you _must_ set the exclusive fence to cover all write access.
And how should command submission know that?
I mean we add the exclusive or shared fence during command submission, but that IOCTL has not the slightest idea if the BO is then used for explicit or for implicit fencing.
In any other case (not shared, or not as part of implicitly synced protocol), you can do whatever you want. So we're not conflating exclusive with write here at all. And yes this is exactly what i915 and all other drivers do - for explicit fencing we don't set the exclusive fence, but leave that all to userspace. Also, userspace tells us (because it knows how the protocol works, not the kernel) when to set an exclusive fence.
To extend that: at least the lower layers of the user space driver don't know whether we have explicit or implicit fencing either.
The only component that really does know is the DDX, or more generally the driver instance inside the display server.
And when that component gets the buffer to display it, command submission should already be long done.
Regards, Christian.
For historical reasons the relevant uapi parts are called read/write, but that's just an accident of (maybe too clever) uapi reuse, not their actual semantics. -Daniel
On Tue, Aug 7, 2018 at 3:54 PM, Christian König christian.koenig@amd.com wrote:
On 07.08.2018 at 15:37, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 03:17:06PM +0200, Christian König wrote:
On 07.08.2018 at 14:59, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 02:51:50PM +0200, Christian König wrote:
On 07.08.2018 at 14:43, Daniel Vetter wrote:
On Tue, Aug 07, 2018 at 01:23:19PM +0200, Christian König wrote:
On 07.08.2018 at 13:05, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl.
Well I would rather suggest a solution where we stop doing this.
The problem here is that i915 assumes that a write operation always needs exclusive access to an object protected by a reservation object.
At least for amdgpu, radeon and nouveau this assumption is incorrect, but only amdgpu has a design where userspace is not lying to the kernel about its read/write access.
What we should do instead is to add a flag to each shared fence to note if it is a write operation or not. Then we can trivially add a function to wait on those in i915.
I should have pushed harder for this solution when the problem came up initially,
For shared buffers in an implicit syncing setup like dri2 the exclusive fence _is_ your write fence.
And exactly that is absolutely not correct if you ask me.
The exclusive fence is for two use cases, the first one is for drivers which don't care about concurrent accesses and only use serialized accesses and the second is for kernel uses under the hood of userspace, evictions, buffer moves etc..
What i915 does is to abuse that concept for write once read many situations, and that in turn is the bug we need to fix here.
Again, the exclusive fence was added for implicit sync usage like dri2/3. _Not_ for your own buffer manager. If you want to separate these two usages, then I guess we can do that, but claiming that i915 is the odd one out just aint true. You're arguing against all other kms drivers we have here.
No I'm not. I discussed exactly that use case with Maarten when the reservation object was introduced.
I think that Maarten named the exclusive fence on purpose "exclusive" fence and not "write" fence to make that distinction clear.
I have to admit that this wasn't really documented, but it indeed was the original purpose to get away from the idea that writes need to be exclusive.
That's how this stuff works. Note it's only for implicit fencing for dri2 shared buffers. i915 lies as much as everyone else for explicit syncing.
That is really really ugly and should be fixed instead.
It works and is uapi now. What's the gain in "fixing" it?
It allows you to implement explicit fencing in the uapi without breaking backward compatibility.
See the situation with amdgpu and intel is the same as with userspace with mixed implicit and explicit fencing.
If that isn't fixed we will run into the same problem when DRI3 gets implicit fencing and some applications start to use it while others still rely on the explicit fencing.
I think we're a bit cross-eyed on exact semantics, but yes this is exactly the use-case. Chris Wilson's use-case tries to emulate exactly what would happen if an implicitly fenced amdgpu rendered buffer were to be displayed on an i915 output. Then you need to set the exclusive fence correctly.
Yes, exactly. That's what I totally agree on.
And yes we called them exclusive/shared because shared fences could also be write fences. But for the implicitly synced userspace case, exclusive = The write fence.
So probably Chris' patch ends up syncing too much (since for explicit syncing you don't want to attach an exclusive fence, because userspace passes the fence around already to the atomic ioctl). But at least it's correct for the implicit case. But that's entirely an optimization problem within amdgpu.
Completely agree as well, but Chris' patch actually just optimizes things. It is not necessary for correct operation.
Duh, I didn't read the patch carefully enough ...
See, previously we just waited for the BO to be idle; now Chris' patch collects all fences (shared and exclusive) and adds that as a single exclusive fence to avoid blocking.
Summary: If you have a shared buffer used in implicitly synced buffer sharing, you _must_ set the exclusive fence to cover all write access.
And how should command submission know that?
I mean we add the exclusive or shared fence during command submission, but that IOCTL has not the slightest idea if the BO is then used for explicit or for implicit fencing.
The ioctl doesn't know, but the winsys in userspace does know. Well, with one exception: bare metal egl on gbm or similar doesn't, so there the heuristic is that we assume implicit fencing until userspace starts using the explicit fence extensions. Then we switch over.
And once your winsys knows whether shared buffers should be implicitly or explicitly synced, it can tell the kernel. Either through a new flag, or by retroshoehorning the semantics into existing flags. The latter is what we've done for i915 and freedreno afaiui.
In any other case (not shared, or not as part of implicitly synced protocol), you can do whatever you want. So we're not conflating exclusive with write here at all. And yes this is exactly what i915 and all other drivers do - for explicit fencing we don't set the exclusive fence, but leave that all to userspace. Also, userspace tells us (because it knows how the protocol works, not the kernel) when to set an exclusive fence.
To extend that: at least the lower layers of the user space driver don't know whether we have explicit or implicit fencing either.
The only component that really does know is the DDX, or more generally the driver instance inside the display server.
And when that component gets the buffer to display it, command submission should already be long done.
Why does only the DDX know? If you're running gl on GLX your driver knows how to sync the shared buffer - it also knows how to hand it over to the DDX after all. Same for gles on android/surfaceflinger, you know it's going to be explicit sync for shared buffers.
There are some cases where you can't decide this right away, but a context flag that enables explicit sync once that's clear seemed to be all that's needed. Worst case the first frame is artificially limited due to the implicit fencing hurting a bit. And in general that issue is only for bare metal buffers, i.e. your compositor takes a small hit once at startup. If this is a real issue we could add a gbm interface to select implicit/explicit before we start allocating anything.
Again this is only for shared buffers, anything private to a given context can be handled however you want really. -Daniel
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve()
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107341 Testcase: igt/amd_prime/amd-to-i915 References: 8e94a46c1770 ("drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com ---
This time, hopefully proofread and references complete. -Chris
--- drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 68 ++++++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 1c5d97f4b4dd..dff2b01a3d89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>

 static const struct dma_buf_ops amdgpu_dmabuf_ops;

@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
        return ERR_PTR(ret);
 }

+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+       struct reservation_object_list *fobj;
+       struct dma_fence_array *array;
+       struct dma_fence **fences;
+       unsigned int count, i;
+
+       fobj = reservation_object_get_list(obj);
+       if (!fobj)
+               return 0;
+
+       count = !!rcu_access_pointer(obj->fence_excl);
+       count += fobj->shared_count;
+
+       fences = kmalloc_array(count, sizeof(*fences), GFP_KERNEL);
+       if (!fences)
+               return -ENOMEM;
+
+       for (i = 0; i < fobj->shared_count; i++) {
+               struct dma_fence *f =
+                       rcu_dereference_protected(fobj->shared[i],
+                                                 reservation_object_held(obj));
+
+               fences[i] = dma_fence_get(f);
+       }
+
+       if (rcu_access_pointer(obj->fence_excl)) {
+               struct dma_fence *f =
+                       rcu_dereference_protected(obj->fence_excl,
+                                                 reservation_object_held(obj));
+
+               fences[i] = dma_fence_get(f);
+       }
+
+       array = dma_fence_array_create(count, fences,
+                                      dma_fence_context_alloc(1), 0,
+                                      false);
+       if (!array)
+               goto err_fences_put;
+
+       reservation_object_add_excl_fence(obj, &array->base);
+       return 0;
+
+err_fences_put:
+       for (i = 0; i < count; i++)
+               dma_fence_put(fences[i]);
+       kfree(fences);
+       return -ENOMEM;
+}
+
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +271,16 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,

        if (attach->dev->driver != adev->dev->driver) {
                /*
-                * Wait for all shared fences to complete before we switch to future
-                * use of exclusive fence on this prime shared bo.
+                * We only create shared fences for internal use, but importers
+                * of the dmabuf rely on exclusive fences for implicitly
+                * tracking write hazards. As any of the current fences may
+                * correspond to a write, we need to convert all existing
+                * fences on the reservation object into a single exclusive
+                * fence.
                 */
-               r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-                                                       true, false,
-                                                       MAX_SCHEDULE_TIMEOUT);
-               if (unlikely(r < 0)) {
-                       DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+               r = __reservation_object_make_exclusive(bo->tbo.resv);
+               if (r)
                        goto error_unreserve;
-               }
        }

        /* pin buffer into GTT */
On 07.08.2018 at 18:08, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve()
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107341 Testcase: igt/amd_prime/amd-to-i915 References: 8e94a46c1770 ("drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com
This time, hopefully proofread and references complete. -Chris
drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 68 ++++++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 1c5d97f4b4dd..dff2b01a3d89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>

 static const struct dma_buf_ops amdgpu_dmabuf_ops;

@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
        return ERR_PTR(ret);
 }

+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+       struct reservation_object_list *fobj;
+       struct dma_fence_array *array;
+       struct dma_fence **fences;
+       unsigned int count, i;
+
+       fobj = reservation_object_get_list(obj);
+       if (!fobj)
+               return 0;
+
+       count = !!rcu_access_pointer(obj->fence_excl);
+       count += fobj->shared_count;
+
+       fences = kmalloc_array(count, sizeof(*fences), GFP_KERNEL);
+       if (!fences)
+               return -ENOMEM;
+
+       for (i = 0; i < fobj->shared_count; i++) {
+               struct dma_fence *f =
+                       rcu_dereference_protected(fobj->shared[i],
+                                                 reservation_object_held(obj));
+
+               fences[i] = dma_fence_get(f);
+       }
+
+       if (rcu_access_pointer(obj->fence_excl)) {
+               struct dma_fence *f =
+                       rcu_dereference_protected(obj->fence_excl,
+                                                 reservation_object_held(obj));
+
+               fences[i] = dma_fence_get(f);
+       }
+
+       array = dma_fence_array_create(count, fences,
+                                      dma_fence_context_alloc(1), 0,
+                                      false);
+       if (!array)
+               goto err_fences_put;
+
+       reservation_object_add_excl_fence(obj, &array->base);
+       return 0;
+
+err_fences_put:
+       for (i = 0; i < count; i++)
+               dma_fence_put(fences[i]);
+       kfree(fences);
+       return -ENOMEM;
+}
This can be simplified a lot. See amdgpu_pasid_free_delayed() for an example:
        r = reservation_object_get_fences_rcu(resv, NULL, &count, &fences);
        if (r)
                goto fallback;

        if (count == 0) {
                amdgpu_pasid_free(pasid);
                return;
        }

        if (count == 1) {
                fence = fences[0];
                kfree(fences);
        } else {
                uint64_t context = dma_fence_context_alloc(1);
                struct dma_fence_array *array;

                array = dma_fence_array_create(count, fences, context, 1,
                                               false);
                if (!array) {
                        kfree(fences);
                        goto fallback;
                }
                fence = &array->base;
        }
Regards, Christian.
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +271,16 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,

        if (attach->dev->driver != adev->dev->driver) {
                /*
-                * Wait for all shared fences to complete before we switch to future
-                * use of exclusive fence on this prime shared bo.
+                * We only create shared fences for internal use, but importers
+                * of the dmabuf rely on exclusive fences for implicitly
+                * tracking write hazards. As any of the current fences may
+                * correspond to a write, we need to convert all existing
+                * fences on the reservation object into a single exclusive
+                * fence.
                 */
-               r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-                                                       true, false,
-                                                       MAX_SCHEDULE_TIMEOUT);
-               if (unlikely(r < 0)) {
-                       DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+               r = __reservation_object_make_exclusive(bo->tbo.resv);
+               if (r)
                        goto error_unreserve;
-               }
        }

        /* pin buffer into GTT */
Quoting Christian König (2018-08-07 18:57:16)
On 07.08.2018 at 18:08, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve()
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107341 Testcase: igt/amd_prime/amd-to-i915 References: 8e94a46c1770 ("drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com
This time, hopefully proofread and references complete. -Chris
drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 68 ++++++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c index 1c5d97f4b4dd..dff2b01a3d89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c @@ -37,6 +37,7 @@ #include "amdgpu_display.h" #include <drm/amdgpu_drm.h> #include <linux/dma-buf.h> +#include <linux/dma-fence-array.h>
static const struct dma_buf_ops amdgpu_dmabuf_ops;
@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev, return ERR_PTR(ret); }
+static int +__reservation_object_make_exclusive(struct reservation_object *obj) +{
struct reservation_object_list *fobj;
struct dma_fence_array *array;
struct dma_fence **fences;
unsigned int count, i;
fobj = reservation_object_get_list(obj);
if (!fobj)
return 0;
count = !!rcu_access_pointer(obj->fence_excl);
count += fobj->shared_count;
fences = kmalloc_array(count, sizeof(*fences), GFP_KERNEL);
if (!fences)
return -ENOMEM;
for (i = 0; i < fobj->shared_count; i++) {
struct dma_fence *f =
rcu_dereference_protected(fobj->shared[i],
reservation_object_held(obj));
fences[i] = dma_fence_get(f);
}
if (rcu_access_pointer(obj->fence_excl)) {
struct dma_fence *f =
rcu_dereference_protected(obj->fence_excl,
reservation_object_held(obj));
fences[i] = dma_fence_get(f);
}
array = dma_fence_array_create(count, fences,
dma_fence_context_alloc(1), 0,
false);
if (!array)
goto err_fences_put;
reservation_object_add_excl_fence(obj, &array->base);
return 0;
+err_fences_put:
for (i = 0; i < count; i++)
dma_fence_put(fences[i]);
kfree(fences);
return -ENOMEM;
+}
This can be simplified a lot. See amdgpu_pasid_free_delayed() for an example:
{
        if (!reservation_object_get_list(obj))
                return 0;

        r = reservation_object_get_fences_rcu(obj, NULL, &count, &fences);
        if (r)
                return r;

        array = dma_fence_array_create(count, fences,
                                       dma_fence_context_alloc(1), 0, false);
        if (!array)
                goto err_fences_put;

        reservation_object_add_excl_fence(obj, &array->base);
        return 0;

err:
        ...
}
My starting point was going to be to use get_fences_rcu, but get_fences_rcu can hardly be called simple when the lock is required to be held start to finish ;) -Chris
On 07.08.2018 at 20:14, Chris Wilson wrote:
Quoting Christian König (2018-08-07 18:57:16)
On 07.08.2018 at 18:08, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve()
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107341 Testcase: igt/amd_prime/amd-to-i915 References: 8e94a46c1770 ("drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com
This time, hopefully proofread and references complete. -Chris
drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 68 ++++++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c index 1c5d97f4b4dd..dff2b01a3d89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c @@ -37,6 +37,7 @@ #include "amdgpu_display.h" #include <drm/amdgpu_drm.h> #include <linux/dma-buf.h> +#include <linux/dma-fence-array.h>
static const struct dma_buf_ops amdgpu_dmabuf_ops;
@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev, return ERR_PTR(ret); }
+static int +__reservation_object_make_exclusive(struct reservation_object *obj) +{
struct reservation_object_list *fobj;
struct dma_fence_array *array;
struct dma_fence **fences;
unsigned int count, i;
fobj = reservation_object_get_list(obj);
if (!fobj)
return 0;
count = !!rcu_access_pointer(obj->fence_excl);
count += fobj->shared_count;
fences = kmalloc_array(count, sizeof(*fences), GFP_KERNEL);
if (!fences)
return -ENOMEM;
for (i = 0; i < fobj->shared_count; i++) {
struct dma_fence *f =
rcu_dereference_protected(fobj->shared[i],
reservation_object_held(obj));
fences[i] = dma_fence_get(f);
}
if (rcu_access_pointer(obj->fence_excl)) {
struct dma_fence *f =
rcu_dereference_protected(obj->fence_excl,
reservation_object_held(obj));
fences[i] = dma_fence_get(f);
}
array = dma_fence_array_create(count, fences,
dma_fence_context_alloc(1), 0,
false);
if (!array)
goto err_fences_put;
reservation_object_add_excl_fence(obj, &array->base);
return 0;
+err_fences_put:
for (i = 0; i < count; i++)
dma_fence_put(fences[i]);
kfree(fences);
return -ENOMEM;
+}
This can be simplified a lot. See amdgpu_pasid_free_delayed() for an example:
{
        if (!reservation_object_get_list(obj))
                return 0;

        r = reservation_object_get_fences_rcu(obj, NULL, &count, &fences);
        if (r)
                return r;

        array = dma_fence_array_create(count, fences,
                                       dma_fence_context_alloc(1), 0, false);
        if (!array)
                goto err_fences_put;

        reservation_object_add_excl_fence(obj, &array->base);
        return 0;

err:
        ...
}
My starting point was going to be to use get_fences_rcu, but get_fences_rcu can hardly be called simple when the lock is required to be held start to finish ;)
What are you talking about? get_fences_rcu doesn't require any locking at all.
You only need the locking to make sure that between creating the fence array and calling reservation_object_add_excl_fence() no other fence is added.
Christian.
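Spelled out as code, the locking pattern under discussion looks roughly like this; a sketch only, written as if it sat next to the helper in amdgpu_prime.c, where amdgpu_bo_reserve() is what actually takes bo->tbo.resv in the attach path (the point the v2 changelog refers to):

    #include "amdgpu.h"

    static int example_make_exclusive_locked(struct amdgpu_bo *bo)
    {
            int r;

            /* Takes bo->tbo.resv, the reservation/ww-mutex lock. */
            r = amdgpu_bo_reserve(bo, false);
            if (r)
                    return r;

            /*
             * With the lock held, no new fence can be added between
             * collecting the current fences and installing the composite
             * exclusive fence.
             */
            r = __reservation_object_make_exclusive(bo->tbo.resv);

            amdgpu_bo_unreserve(bo);
            return r;
    }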
-Chris
Quoting Christian König (2018-08-07 19:18:55)
On 07.08.2018 at 20:14, Chris Wilson wrote:
Quoting Christian König (2018-08-07 18:57:16)
On 07.08.2018 at 18:08, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve()
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107341 Testcase: igt/amd_prime/amd-to-i915 References: 8e94a46c1770 ("drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com
This time, hopefully proofread and references complete. -Chris
drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 68 ++++++++++++++++++++--- 1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c index 1c5d97f4b4dd..dff2b01a3d89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c @@ -37,6 +37,7 @@ #include "amdgpu_display.h" #include <drm/amdgpu_drm.h> #include <linux/dma-buf.h> +#include <linux/dma-fence-array.h>
static const struct dma_buf_ops amdgpu_dmabuf_ops;
@@ -188,6 +189,57 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev, return ERR_PTR(ret); }
+static int +__reservation_object_make_exclusive(struct reservation_object *obj) +{
struct reservation_object_list *fobj;
struct dma_fence_array *array;
struct dma_fence **fences;
unsigned int count, i;
fobj = reservation_object_get_list(obj);
if (!fobj)
return 0;
count = !!rcu_access_pointer(obj->fence_excl);
count += fobj->shared_count;
fences = kmalloc_array(count, sizeof(*fences), GFP_KERNEL);
if (!fences)
return -ENOMEM;
for (i = 0; i < fobj->shared_count; i++) {
struct dma_fence *f =
rcu_dereference_protected(fobj->shared[i],
reservation_object_held(obj));
fences[i] = dma_fence_get(f);
}
if (rcu_access_pointer(obj->fence_excl)) {
struct dma_fence *f =
rcu_dereference_protected(obj->fence_excl,
reservation_object_held(obj));
fences[i] = dma_fence_get(f);
}
array = dma_fence_array_create(count, fences,
dma_fence_context_alloc(1), 0,
false);
if (!array)
goto err_fences_put;
reservation_object_add_excl_fence(obj, &array->base);
return 0;
+err_fences_put:
for (i = 0; i < count; i++)
dma_fence_put(fences[i]);
kfree(fences);
return -ENOMEM;
+}
This can be simplified a lot. See amdgpu_pasid_free_delayed() for an example:
{
        if (!reservation_object_get_list(obj))
                return 0;

        r = reservation_object_get_fences_rcu(obj, NULL, &count, &fences);
        if (r)
                return r;

        array = dma_fence_array_create(count, fences,
                                       dma_fence_context_alloc(1), 0, false);
        if (!array)
                goto err_fences_put;

        reservation_object_add_excl_fence(obj, &array->base);
        return 0;
My starting point was going to be to use get_fences_rcu, but get_fences_rcu can hardly be called simple when the lock is required to be held start to finish ;)
What are you talking about? get_fences_rcu doesn't require any locking at all.
You only need the locking to make sure that between creating the fence array and calling reservation_object_add_excl_fence() no other fence is added.
Exactly. That's what needs to be absolutely clear from the context. I didn't say anything about the locking requirements for get_fences_rcu, just the opposite. -Chris
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve() v3: Replace looping with get_fences_rcu and special case the promotion of a single shared fence directly to an exclusive fence, bypassing the fence array.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107341 Testcase: igt/amd_prime/amd-to-i915 References: 8e94a46c1770 ("drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 58 +++++++++++++++++++---- 1 file changed, 50 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 3fdd5688da0b..06a310b5b07b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>

 static const struct dma_buf_ops amdgpu_dmabuf_ops;

@@ -188,6 +189,47 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
        return ERR_PTR(ret);
 }

+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+       struct dma_fence **fences;
+       unsigned int count;
+       int r;
+
+       if (!reservation_object_get_list(obj)) /* no shared fences to convert */
+               return 0;
+
+       r = reservation_object_get_fences_rcu(obj, NULL, &count, &fences);
+       if (r)
+               return r;
+
+       if (count == 0) {
+               /* Now that was unexpected. */
+       } else if (count == 1) {
+               reservation_object_add_excl_fence(obj, fences[0]);
+               dma_fence_put(fences[0]);
+               kfree(fences);
+       } else {
+               struct dma_fence_array *array;
+
+               array = dma_fence_array_create(count, fences,
+                                              dma_fence_context_alloc(1), 0,
+                                              false);
+               if (!array)
+                       goto err_fences_put;
+
+               reservation_object_add_excl_fence(obj, &array->base);
+       }
+
+       return 0;
+
+err_fences_put:
+       while (count--)
+               dma_fence_put(fences[count]);
+       kfree(fences);
+       return -ENOMEM;
+}
+
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +261,16 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,

        if (attach->dev->driver != adev->dev->driver) {
                /*
-                * Wait for all shared fences to complete before we switch to future
-                * use of exclusive fence on this prime shared bo.
+                * We only create shared fences for internal use, but importers
+                * of the dmabuf rely on exclusive fences for implicitly
+                * tracking write hazards. As any of the current fences may
+                * correspond to a write, we need to convert all existing
+                * fences on the reservation object into a single exclusive
+                * fence.
                 */
-               r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-                                                       true, false,
-                                                       MAX_SCHEDULE_TIMEOUT);
-               if (unlikely(r < 0)) {
-                       DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+               r = __reservation_object_make_exclusive(bo->tbo.resv);
+               if (r)
                        goto error_unreserve;
-               }
        }

        /* pin buffer into GTT */
On 07.08.2018 at 20:32, Chris Wilson wrote:
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve() v3: Replace looping with get_fences_rcu and special case the promotion of a single shared fence directly to an exclusive fence, bypassing the fence array.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107341 Testcase: igt/amd_prime/amd-to-i915 References: 8e94a46c1770 ("drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com
Reviewed-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 58 +++++++++++++++++++---- 1 file changed, 50 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 3fdd5688da0b..06a310b5b07b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>

 static const struct dma_buf_ops amdgpu_dmabuf_ops;

@@ -188,6 +189,47 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
        return ERR_PTR(ret);
 }

+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+       struct dma_fence **fences;
+       unsigned int count;
+       int r;
+
+       if (!reservation_object_get_list(obj)) /* no shared fences to convert */
+               return 0;
+
+       r = reservation_object_get_fences_rcu(obj, NULL, &count, &fences);
+       if (r)
+               return r;
+
+       if (count == 0) {
+               /* Now that was unexpected. */
+       } else if (count == 1) {
+               reservation_object_add_excl_fence(obj, fences[0]);
+               dma_fence_put(fences[0]);
+               kfree(fences);
+       } else {
+               struct dma_fence_array *array;
+
+               array = dma_fence_array_create(count, fences,
+                                              dma_fence_context_alloc(1), 0,
+                                              false);
+               if (!array)
+                       goto err_fences_put;
+
+               reservation_object_add_excl_fence(obj, &array->base);
+       }
+
+       return 0;
+
+err_fences_put:
+       while (count--)
+               dma_fence_put(fences[count]);
+       kfree(fences);
+       return -ENOMEM;
+}
+
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +261,16 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,

        if (attach->dev->driver != adev->dev->driver) {
                /*
-                * Wait for all shared fences to complete before we switch to future
-                * use of exclusive fence on this prime shared bo.
+                * We only create shared fences for internal use, but importers
+                * of the dmabuf rely on exclusive fences for implicitly
+                * tracking write hazards. As any of the current fences may
+                * correspond to a write, we need to convert all existing
+                * fences on the reservation object into a single exclusive
+                * fence.
                 */
-               r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-                                                       true, false,
-                                                       MAX_SCHEDULE_TIMEOUT);
-               if (unlikely(r < 0)) {
-                       DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+               r = __reservation_object_make_exclusive(bo->tbo.resv);
+               if (r)
                        goto error_unreserve;
-               }
        }

        /* pin buffer into GTT */
amdgpu only uses shared-fences internally, but dmabuf importers rely on implicit write hazard tracking via the reservation_object.fence_excl. For example, the importer use the write hazard for timing a page flip to only occur after the exporter has finished flushing its write into the surface. As such, on exporting a dmabuf, we must either flush all outstanding fences (for we do not know which are writes and should have been exclusive) or alternatively create a new exclusive fence that is the composite of all the existing shared fences, and so will only be signaled when all earlier fences are signaled (ensuring that we can not be signaled before the completion of any earlier write).
v2: reservation_object is already locked by amdgpu_bo_reserve() v3: Replace looping with get_fences_rcu and special case the promotion of a single shared fence directly to an exclusive fence, bypassing the fence array. v4: Drop the fence array ref after assigning to reservation_object
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107341 Testcase: igt/amd_prime/amd-to-i915 References: 8e94a46c1770 ("drm/amdgpu: Attach exclusive fence to prime exported bo's. (v5)") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexander.deucher@amd.com Cc: "Christian König" christian.koenig@amd.com Reviewed-by: "Christian König" christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 59 ++++++++++++++++++++--- 1 file changed, 51 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
index 3fdd5688da0b..0394f5c77f81 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
@@ -37,6 +37,7 @@
 #include "amdgpu_display.h"
 #include <drm/amdgpu_drm.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-fence-array.h>

 static const struct dma_buf_ops amdgpu_dmabuf_ops;

@@ -188,6 +189,48 @@ amdgpu_gem_prime_import_sg_table(struct drm_device *dev,
        return ERR_PTR(ret);
 }

+static int
+__reservation_object_make_exclusive(struct reservation_object *obj)
+{
+       struct dma_fence **fences;
+       unsigned int count;
+       int r;
+
+       if (!reservation_object_get_list(obj)) /* no shared fences to convert */
+               return 0;
+
+       r = reservation_object_get_fences_rcu(obj, NULL, &count, &fences);
+       if (r)
+               return r;
+
+       if (count == 0) {
+               /* Now that was unexpected. */
+       } else if (count == 1) {
+               reservation_object_add_excl_fence(obj, fences[0]);
+               dma_fence_put(fences[0]);
+               kfree(fences);
+       } else {
+               struct dma_fence_array *array;
+
+               array = dma_fence_array_create(count, fences,
+                                              dma_fence_context_alloc(1), 0,
+                                              false);
+               if (!array)
+                       goto err_fences_put;
+
+               reservation_object_add_excl_fence(obj, &array->base);
+               dma_fence_put(&array->base);
+       }
+
+       return 0;
+
+err_fences_put:
+       while (count--)
+               dma_fence_put(fences[count]);
+       kfree(fences);
+       return -ENOMEM;
+}
+
 /**
  * amdgpu_gem_map_attach - &dma_buf_ops.attach implementation
  * @dma_buf: shared DMA buffer
@@ -219,16 +262,16 @@ static int amdgpu_gem_map_attach(struct dma_buf *dma_buf,

        if (attach->dev->driver != adev->dev->driver) {
                /*
-                * Wait for all shared fences to complete before we switch to future
-                * use of exclusive fence on this prime shared bo.
+                * We only create shared fences for internal use, but importers
+                * of the dmabuf rely on exclusive fences for implicitly
+                * tracking write hazards. As any of the current fences may
+                * correspond to a write, we need to convert all existing
+                * fences on the reservation object into a single exclusive
+                * fence.
                 */
-               r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
-                                                       true, false,
-                                                       MAX_SCHEDULE_TIMEOUT);
-               if (unlikely(r < 0)) {
-                       DRM_DEBUG_PRIME("Fence wait failed: %li\n", r);
+               r = __reservation_object_make_exclusive(bo->tbo.resv);
+               if (r)
                        goto error_unreserve;
-               }
        }

        /* pin buffer into GTT */