Right now, userspace doesn't do any L2T writes, but we should lay out our expectations for how it works.
v2: Explicitly mention the VCD cache flushing requirements and that we'll flush the other caches before each of the CLs.
Signed-off-by: Eric Anholt eric@anholt.net --- include/uapi/drm/v3d_drm.h | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h index 35c7d813c66e..ea70669d2138 100644 --- a/include/uapi/drm/v3d_drm.h +++ b/include/uapi/drm/v3d_drm.h @@ -52,6 +52,14 @@ extern "C" { * * This asks the kernel to have the GPU execute an optional binner * command list, and a render command list. + * + * The L1T, slice, L2C, L2T, and GCA caches will be flushed before + * each CL executes. The VCD cache should be flushed (if necessary) + * by the submitted CLs. The TLB writes are guaranteed to have been + * flushed by the time the render done IRQ happens, which is the + * trigger for out_sync. Any dirtying of cachelines by the job (only + * possible using TMU writes) must be flushed by the caller using the + * CL's cache flush commands. */ struct drm_v3d_submit_cl { /* Pointer to the binner command list.
Now that I've specified how the end-of-pipeline flushing should work, we're never going to use this function.
Signed-off-by: Eric Anholt eric@anholt.net Reviewed-by: Dave Emett david.emett@broadcom.com --- drivers/gpu/drm/v3d/v3d_drv.h | 1 - drivers/gpu/drm/v3d/v3d_gem.c | 21 --------------------- 2 files changed, 22 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index bcd3d567bec2..239b56d76f3e 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -314,7 +314,6 @@ void v3d_exec_put(struct v3d_exec_info *exec); void v3d_tfu_job_put(struct v3d_tfu_job *exec); void v3d_reset(struct v3d_dev *v3d); void v3d_invalidate_caches(struct v3d_dev *v3d); -void v3d_flush_caches(struct v3d_dev *v3d);
/* v3d_irq.c */ void v3d_irq_init(struct v3d_dev *v3d); diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index f565b197cba9..92413cbcf92c 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -175,20 +175,6 @@ v3d_invalidate_slices(struct v3d_dev *v3d, int core) V3D_SET_FIELD(0xf, V3D_SLCACTL_ICC)); }
-/* Invalidates texture L2 cachelines */ -static void -v3d_invalidate_l2t(struct v3d_dev *v3d, int core) -{ - V3D_CORE_WRITE(core, - V3D_CTL_L2TCACTL, - V3D_L2TCACTL_L2TFLS | - V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAR, V3D_L2TCACTL_FLM)); - if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & - V3D_L2TCACTL_L2TFLS), 100)) { - DRM_ERROR("Timeout waiting for L2T invalidate\n"); - } -} - void v3d_invalidate_caches(struct v3d_dev *v3d) { @@ -199,13 +185,6 @@ v3d_invalidate_caches(struct v3d_dev *v3d) v3d_flush_l2t(v3d, 0); }
-void -v3d_flush_caches(struct v3d_dev *v3d) -{ - v3d_invalidate_l1td(v3d, 0); - v3d_invalidate_l2t(v3d, 0); -} - static void v3d_attach_object_fences(struct v3d_bo **bos, int bo_count, struct dma_fence *fence)
This is the write combiner for TMU writes. You're supposed to flush that at job end if you had dirtied any cachelines. Flushing it at job start then doesn't make any sense.
Signed-off-by: Eric Anholt eric@anholt.net Fixes: 57692c94dcbe ("drm/v3d: Introduce a new DRM driver for Broadcom V3D V3.x+") Reviewed-by: Dave Emett david.emett@broadcom.com --- drivers/gpu/drm/v3d/v3d_gem.c | 12 ------------ 1 file changed, 12 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 92413cbcf92c..01e879c71cad 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -139,22 +139,10 @@ v3d_invalidate_l2(struct v3d_dev *v3d, int core) V3D_L2CACTL_L2CENA); }
-static void -v3d_invalidate_l1td(struct v3d_dev *v3d, int core) -{ - V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF); - if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & - V3D_L2TCACTL_L2TFLS), 100)) { - DRM_ERROR("Timeout waiting for L1T write combiner flush\n"); - } -} - /* Invalidates texture L2 cachelines */ static void v3d_flush_l2t(struct v3d_dev *v3d, int core) { - v3d_invalidate_l1td(v3d, core); - V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_L2TFLS | V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
According to Dave, once you've started an L2T flush, all L2T accesses will be blocked until the flush completes. This fixes a consistent 3-4ms stall between the ioctl and running the job, and 3DMMES Taiji goes from 27fps to 110fps.
v2: Leave a note about why we don't need to wait for completion.
Signed-off-by: Eric Anholt eric@anholt.net Fixes: 57692c94dcbe ("drm/v3d: Introduce a new DRM driver for Broadcom V3D V3.x+") Reviewed-by: Dave Emett david.emett@broadcom.com (v1, comment requested) --- drivers/gpu/drm/v3d/v3d_gem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 01e879c71cad..c268c7c79566 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -143,13 +143,13 @@ v3d_invalidate_l2(struct v3d_dev *v3d, int core) static void v3d_flush_l2t(struct v3d_dev *v3d, int core) { + /* While there is a busy bit (V3D_L2TCACTL_L2TFLS), we don't + * need to wait for completion before dispatching the job -- + * L2T accesses will be stalled until the flush has completed. + */ V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_L2TFLS | V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM)); - if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & - V3D_L2TCACTL_L2TFLS), 100)) { - DRM_ERROR("Timeout waiting for L2T flush\n"); - } }
/* Invalidates the slice caches. These are read-only caches. */
On Mon, 3 Dec 2018 at 22:24, Eric Anholt eric@anholt.net wrote:
According to Dave, once you've started an L2T flush, all L2T accesses will be blocked until the flush completes. This fixes a consistent 3-4ms stall between the ioctl and running the job, and 3DMMES Taiji goes from 27fps to 110fps.
v2: Leave a note about why we don't need to wait for completion.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
Fixes: 57692c94dcbe ("drm/v3d: Introduce a new DRM driver for Broadcom V3D V3.x+") Reviewed-by: Dave Emett david.emett@broadcom.com (v1, comment requested)
drivers/gpu/drm/v3d/v3d_gem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 01e879c71cad..c268c7c79566 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -143,13 +143,13 @@ v3d_invalidate_l2(struct v3d_dev *v3d, int core) static void v3d_flush_l2t(struct v3d_dev *v3d, int core) {
+	/* While there is a busy bit (V3D_L2TCACTL_L2TFLS), we don't
+	 * need to wait for completion before dispatching the job --
+	 * L2T accesses will be stalled until the flush has completed.
+	 */
 	V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_L2TFLS | V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
-	if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
-		       V3D_L2TCACTL_L2TFLS), 100)) {
-		DRM_ERROR("Timeout waiting for L2T flush\n");
-	}
 }
/* Invalidates the slice caches. These are read-only caches. */
2.20.0.rc1
This cache was replaced with the slice accessing the L2T in the newer generations. Noted by Dave during review.
Signed-off-by: Eric Anholt eric@anholt.net --- drivers/gpu/drm/v3d/v3d_gem.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index c268c7c79566..8a4be9515179 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -130,10 +130,15 @@ v3d_flush_l3(struct v3d_dev *v3d) } }
-/* Invalidates the (read-only) L2 cache. */ +/* Invalidates the (read-only) L2C cache. This was the L2 cache for + * uniforms and instructions on V3D 3.2. + */ static void -v3d_invalidate_l2(struct v3d_dev *v3d, int core) +v3d_invalidate_l2c(struct v3d_dev *v3d, int core) { + if (v3d->ver > 32) + return; + V3D_CORE_WRITE(core, V3D_CTL_L2CACTL, V3D_L2CACTL_L2CCLR | V3D_L2CACTL_L2CENA); @@ -168,7 +173,7 @@ v3d_invalidate_caches(struct v3d_dev *v3d) { v3d_flush_l3(v3d);
- v3d_invalidate_l2(v3d, 0); + v3d_invalidate_l2c(v3d, 0); v3d_invalidate_slices(v3d, 0); v3d_flush_l2t(v3d, 0); }
On Mon, 3 Dec 2018 at 22:24, Eric Anholt eric@anholt.net wrote:
This cache was replaced with the slice accessing the L2T in the newer generations. Noted by Dave during review.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
drivers/gpu/drm/v3d/v3d_gem.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index c268c7c79566..8a4be9515179 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -130,10 +130,15 @@ v3d_flush_l3(struct v3d_dev *v3d) } }
-/* Invalidates the (read-only) L2 cache. */
+/* Invalidates the (read-only) L2C cache. This was the L2 cache for
+ * uniforms and instructions on V3D 3.2.
+ */
 static void
-v3d_invalidate_l2(struct v3d_dev *v3d, int core)
+v3d_invalidate_l2c(struct v3d_dev *v3d, int core)
 {
+	if (v3d->ver > 32)
+		return;
+
 	V3D_CORE_WRITE(core, V3D_CTL_L2CACTL, V3D_L2CACTL_L2CCLR | V3D_L2CACTL_L2CENA);
@@ -168,7 +173,7 @@ v3d_invalidate_caches(struct v3d_dev *v3d)
 {
 	v3d_flush_l3(v3d);
-	v3d_invalidate_l2(v3d, 0);
+	v3d_invalidate_l2c(v3d, 0);
 	v3d_invalidate_slices(v3d, 0);
 	v3d_flush_l2t(v3d, 0);
 }
2.20.0.rc1
This would be a fairly obscure race, but let's make sure we don't ever lose it.
Signed-off-by: Eric Anholt eric@anholt.net --- drivers/gpu/drm/v3d/v3d_gem.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 8a4be9515179..443b1c53117a 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -171,11 +171,15 @@ v3d_invalidate_slices(struct v3d_dev *v3d, int core) void v3d_invalidate_caches(struct v3d_dev *v3d) { + /* Invalidate the caches from the outside in. That way if + * another CL's concurrent use of nearby memory were to pull + * an invalidated cacheline back in, we wouldn't leave stale + * data in the inner cache. + */ v3d_flush_l3(v3d); - v3d_invalidate_l2c(v3d, 0); - v3d_invalidate_slices(v3d, 0); v3d_flush_l2t(v3d, 0); + v3d_invalidate_slices(v3d, 0); }
static void
On Mon, 3 Dec 2018 at 22:24, Eric Anholt eric@anholt.net wrote:
This would be a fairly obscure race, but let's make sure we don't ever lose it.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
drivers/gpu/drm/v3d/v3d_gem.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 8a4be9515179..443b1c53117a 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -171,11 +171,15 @@ v3d_invalidate_slices(struct v3d_dev *v3d, int core) void v3d_invalidate_caches(struct v3d_dev *v3d) {
+	/* Invalidate the caches from the outside in. That way if
+	 * another CL's concurrent use of nearby memory were to pull
+	 * an invalidated cacheline back in, we wouldn't leave stale
+	 * data in the inner cache.
+	 */
 	v3d_flush_l3(v3d);
-
 	v3d_invalidate_l2c(v3d, 0);
-	v3d_invalidate_slices(v3d, 0);
 	v3d_flush_l2t(v3d, 0);
+	v3d_invalidate_slices(v3d, 0);
 }
static void
2.20.0.rc1
On Mon, 3 Dec 2018 at 22:24, Eric Anholt eric@anholt.net wrote:
Right now, userspace doesn't do any L2T writes, but we should lay out our expectations for how it works.
v2: Explicitly mention the VCD cache flushing requirements and that we'll flush the other caches before each of the CLs.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
include/uapi/drm/v3d_drm.h | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h index 35c7d813c66e..ea70669d2138 100644 --- a/include/uapi/drm/v3d_drm.h +++ b/include/uapi/drm/v3d_drm.h @@ -52,6 +52,14 @@ extern "C" {
  * This asks the kernel to have the GPU execute an optional binner
  * command list, and a render command list.
+ *
+ * The L1T, slice, L2C, L2T, and GCA caches will be flushed before
+ * each CL executes. The VCD cache should be flushed (if necessary)
+ * by the submitted CLs. The TLB writes are guaranteed to have been
+ * flushed by the time the render done IRQ happens, which is the
+ * trigger for out_sync. Any dirtying of cachelines by the job (only
+ * possible using TMU writes) must be flushed by the caller using the
+ * CL's cache flush commands.
  */
struct drm_v3d_submit_cl { /* Pointer to the binner command list. -- 2.20.0.rc1
dri-devel@lists.freedesktop.org