Right now, userspace doesn't do any L2T writes, but we should lay out our expectations for how it works.
Signed-off-by: Eric Anholt eric@anholt.net --- include/uapi/drm/v3d_drm.h | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h index 35c7d813c66e..95b8f8e82ea5 100644 --- a/include/uapi/drm/v3d_drm.h +++ b/include/uapi/drm/v3d_drm.h @@ -52,6 +52,13 @@ extern "C" { * * This asks the kernel to have the GPU execute an optional binner * command list, and a render command list. + * + * The caches (L1T, slice, and L2T) will be flushed before the job + * executes. The TLB writes are guaranteed to have been flushed by + * the time the render done IRQ happens, which is the trigger for + * out_sync. Any dirtying of cachelines by the job (only possible + * using TMU writes) must be flushed by the caller using the CL's + * cache flush commands. */ struct drm_v3d_submit_cl { /* Pointer to the binner command list.
Now that I've specified how the end-of-pipeline flushing should work, we're never going to use this function.
Signed-off-by: Eric Anholt eric@anholt.net --- drivers/gpu/drm/v3d/v3d_drv.h | 1 - drivers/gpu/drm/v3d/v3d_gem.c | 21 --------------------- 2 files changed, 22 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index bcd3d567bec2..239b56d76f3e 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -314,7 +314,6 @@ void v3d_exec_put(struct v3d_exec_info *exec); void v3d_tfu_job_put(struct v3d_tfu_job *exec); void v3d_reset(struct v3d_dev *v3d); void v3d_invalidate_caches(struct v3d_dev *v3d); -void v3d_flush_caches(struct v3d_dev *v3d);
/* v3d_irq.c */ void v3d_irq_init(struct v3d_dev *v3d); diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 8b4af512450f..34103205b7cb 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -175,20 +175,6 @@ v3d_invalidate_slices(struct v3d_dev *v3d, int core) V3D_SET_FIELD(0xf, V3D_SLCACTL_ICC)); }
-/* Invalidates texture L2 cachelines */ -static void -v3d_invalidate_l2t(struct v3d_dev *v3d, int core) -{ - V3D_CORE_WRITE(core, - V3D_CTL_L2TCACTL, - V3D_L2TCACTL_L2TFLS | - V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAR, V3D_L2TCACTL_FLM)); - if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & - V3D_L2TCACTL_L2TFLS), 100)) { - DRM_ERROR("Timeout waiting for L2T invalidate\n"); - } -} - void v3d_invalidate_caches(struct v3d_dev *v3d) { @@ -199,13 +185,6 @@ v3d_invalidate_caches(struct v3d_dev *v3d) v3d_flush_l2t(v3d, 0); }
-void -v3d_flush_caches(struct v3d_dev *v3d) -{ - v3d_invalidate_l1td(v3d, 0); - v3d_invalidate_l2t(v3d, 0); -} - static void v3d_attach_object_fences(struct v3d_bo **bos, int bo_count, struct dma_fence *fence)
It will be needed for the CSD, which does not have a way to clean the L1T/L2T before reporting completion. In any case it's unused now, so I don't see a problem with removing it.
On Sat, 1 Dec 2018 at 00:58, Eric Anholt eric@anholt.net wrote:
Now that I've specified how the end-of-pipeline flushing should work, we're never going to use this function.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
drivers/gpu/drm/v3d/v3d_drv.h | 1 - drivers/gpu/drm/v3d/v3d_gem.c | 21 --------------------- 2 files changed, 22 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h index bcd3d567bec2..239b56d76f3e 100644 --- a/drivers/gpu/drm/v3d/v3d_drv.h +++ b/drivers/gpu/drm/v3d/v3d_drv.h @@ -314,7 +314,6 @@ void v3d_exec_put(struct v3d_exec_info *exec); void v3d_tfu_job_put(struct v3d_tfu_job *exec); void v3d_reset(struct v3d_dev *v3d); void v3d_invalidate_caches(struct v3d_dev *v3d); -void v3d_flush_caches(struct v3d_dev *v3d);
/* v3d_irq.c */ void v3d_irq_init(struct v3d_dev *v3d); diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 8b4af512450f..34103205b7cb 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -175,20 +175,6 @@ v3d_invalidate_slices(struct v3d_dev *v3d, int core) V3D_SET_FIELD(0xf, V3D_SLCACTL_ICC)); }
-/* Invalidates texture L2 cachelines */ -static void -v3d_invalidate_l2t(struct v3d_dev *v3d, int core) -{
V3D_CORE_WRITE(core,
V3D_CTL_L2TCACTL,
V3D_L2TCACTL_L2TFLS |
V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAR, V3D_L2TCACTL_FLM));
if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
V3D_L2TCACTL_L2TFLS), 100)) {
DRM_ERROR("Timeout waiting for L2T invalidate\n");
}
-}
void v3d_invalidate_caches(struct v3d_dev *v3d) { @@ -199,13 +185,6 @@ v3d_invalidate_caches(struct v3d_dev *v3d) v3d_flush_l2t(v3d, 0); }
-void -v3d_flush_caches(struct v3d_dev *v3d) -{
v3d_invalidate_l1td(v3d, 0);
v3d_invalidate_l2t(v3d, 0);
-}
static void v3d_attach_object_fences(struct v3d_bo **bos, int bo_count, struct dma_fence *fence) -- 2.20.0.rc1
This is the write combiner for TMU writes. You're supposed to flush that at job end if you had dirtied any cachelines. Flushing it at job start then doesn't make any sense.
Signed-off-by: Eric Anholt eric@anholt.net Fixes: 57692c94dcbe ("drm/v3d: Introduce a new DRM driver for Broadcom V3D V3.x+") --- drivers/gpu/drm/v3d/v3d_gem.c | 12 ------------ 1 file changed, 12 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 34103205b7cb..cc4d025b01e0 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -139,22 +139,10 @@ v3d_invalidate_l2(struct v3d_dev *v3d, int core) V3D_L2CACTL_L2CENA); }
-static void -v3d_invalidate_l1td(struct v3d_dev *v3d, int core) -{ - V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF); - if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & - V3D_L2TCACTL_L2TFLS), 100)) { - DRM_ERROR("Timeout waiting for L1T write combiner flush\n"); - } -} - /* Invalidates texture L2 cachelines */ static void v3d_flush_l2t(struct v3d_dev *v3d, int core) { - v3d_invalidate_l1td(v3d, core); - V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_L2TFLS | V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
On Sat, 1 Dec 2018 at 00:58, Eric Anholt eric@anholt.net wrote:
This is the write combiner for TMU writes. You're supposed to flush that at job end if you had dirtied any cachelines. Flushing it at job start then doesn't make any sense.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
Fixes: 57692c94dcbe ("drm/v3d: Introduce a new DRM driver for Broadcom V3D V3.x+")
drivers/gpu/drm/v3d/v3d_gem.c | 12 ------------ 1 file changed, 12 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 34103205b7cb..cc4d025b01e0 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -139,22 +139,10 @@ v3d_invalidate_l2(struct v3d_dev *v3d, int core) V3D_L2CACTL_L2CENA); }
-static void -v3d_invalidate_l1td(struct v3d_dev *v3d, int core) -{
V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF);
if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
V3D_L2TCACTL_L2TFLS), 100)) {
DRM_ERROR("Timeout waiting for L1T write combiner flush\n");
}
-}
/* Invalidates texture L2 cachelines */ static void v3d_flush_l2t(struct v3d_dev *v3d, int core) {
v3d_invalidate_l1td(v3d, core);
V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_L2TFLS | V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
-- 2.20.0.rc1
According to Dave, once you've started an L2T flush, all L2T accesses will be blocked until the flush completes. This fixes a consistent 3-4ms stall between the ioctl and running the job, and 3DMMES Taiji goes from 27fps to 110fps.
Signed-off-by: Eric Anholt eric@anholt.net Fixes: 57692c94dcbe ("drm/v3d: Introduce a new DRM driver for Broadcom V3D V3.x+") --- drivers/gpu/drm/v3d/v3d_gem.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index cc4d025b01e0..0bd6892e3044 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -146,10 +146,6 @@ v3d_flush_l2t(struct v3d_dev *v3d, int core) V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_L2TFLS | V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM)); - if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & - V3D_L2TCACTL_L2TFLS), 100)) { - DRM_ERROR("Timeout waiting for L2T flush\n"); - } }
/* Invalidates the slice caches. These are read-only caches. */
I'm surprised this made such a big difference. It should not take a long time to flush the L2T -- no memory access should be involved, it should just be a matter of the L2T iterating over its lines clearing the tags. This should take thousands of V3D cycles, not milliseconds. So the 3-4ms stall seems worthy of more investigation to me.
A comment describing why no waits are necessary would be good.
On Sat, 1 Dec 2018 at 00:58, Eric Anholt eric@anholt.net wrote:
According to Dave, once you've started an L2T flush, all L2T accesses will be blocked until the flush completes. This fixes a consistent 3-4ms stall between the ioctl and running the job, and 3DMMES Taiji goes from 27fps to 110fps.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
Fixes: 57692c94dcbe ("drm/v3d: Introduce a new DRM driver for Broadcom V3D V3.x+")
drivers/gpu/drm/v3d/v3d_gem.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index cc4d025b01e0..0bd6892e3044 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -146,10 +146,6 @@ v3d_flush_l2t(struct v3d_dev *v3d, int core) V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_L2TFLS | V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM));
if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) &
V3D_L2TCACTL_L2TFLS), 100)) {
DRM_ERROR("Timeout waiting for L2T flush\n");
}
}
/* Invalidates the slice caches. These are read-only caches. */
2.20.0.rc1
The core scheduler tells us when the job is pushed to the scheduler's queue, and I had the job_run functions saying when they actually queue the job to the hardware. By adding tracepoints for the very top of the ioctls and the IRQs signaling job completion, "perf record -a -e v3d:.* -e gpu_scheduler:.* <job>; perf script" gets you a pretty decent timeline.
Signed-off-by: Eric Anholt eric@anholt.net --- drivers/gpu/drm/v3d/v3d_gem.c | 4 ++ drivers/gpu/drm/v3d/v3d_irq.c | 19 +++++- drivers/gpu/drm/v3d/v3d_trace.h | 101 ++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 0bd6892e3044..2f82e2724b1f 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -484,6 +484,8 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data, struct drm_syncobj *sync_out; int ret = 0;
+ trace_v3d_submit_cl_ioctl(&v3d->drm, args->rcl_start, args->rcl_end); + if (args->pad != 0) { DRM_INFO("pad must be zero: %d\n", args->pad); return -EINVAL; @@ -611,6 +613,8 @@ v3d_submit_tfu_ioctl(struct drm_device *dev, void *data, int ret = 0; int bo_count;
+ trace_v3d_submit_tfu_ioctl(&v3d->drm, args->iia); + job = kcalloc(1, sizeof(*job), GFP_KERNEL); if (!job) return -ENOMEM; diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c index dd7a7b0bd5a1..69338da70ddc 100644 --- a/drivers/gpu/drm/v3d/v3d_irq.c +++ b/drivers/gpu/drm/v3d/v3d_irq.c @@ -15,6 +15,7 @@
#include "v3d_drv.h" #include "v3d_regs.h" +#include "v3d_trace.h"
#define V3D_CORE_IRQS ((u32)(V3D_INT_OUTOMEM | \ V3D_INT_FLDONE | \ @@ -88,12 +89,20 @@ v3d_irq(int irq, void *arg) }
if (intsts & V3D_INT_FLDONE) { - dma_fence_signal(v3d->bin_job->bin.done_fence); + struct v3d_fence *fence = + to_v3d_fence(v3d->bin_job->bin.done_fence); + + trace_v3d_bcl_irq(&v3d->drm, fence->seqno); + dma_fence_signal(&fence->base); status = IRQ_HANDLED; }
if (intsts & V3D_INT_FRDONE) { - dma_fence_signal(v3d->render_job->render.done_fence); + struct v3d_fence *fence = + to_v3d_fence(v3d->render_job->render.done_fence); + + trace_v3d_rcl_irq(&v3d->drm, fence->seqno); + dma_fence_signal(&fence->base); status = IRQ_HANDLED; }
@@ -119,7 +128,11 @@ v3d_hub_irq(int irq, void *arg) V3D_WRITE(V3D_HUB_INT_CLR, intsts);
if (intsts & V3D_HUB_INT_TFUC) { - dma_fence_signal(v3d->tfu_job->done_fence); + struct v3d_fence *fence = + to_v3d_fence(v3d->tfu_job->done_fence); + + trace_v3d_tfu_irq(&v3d->drm, fence->seqno); + dma_fence_signal(&fence->base); status = IRQ_HANDLED; }
diff --git a/drivers/gpu/drm/v3d/v3d_trace.h b/drivers/gpu/drm/v3d/v3d_trace.h index f54ed9cd3444..edd984afa33f 100644 --- a/drivers/gpu/drm/v3d/v3d_trace.h +++ b/drivers/gpu/drm/v3d/v3d_trace.h @@ -12,6 +12,28 @@ #define TRACE_SYSTEM v3d #define TRACE_INCLUDE_FILE v3d_trace
+TRACE_EVENT(v3d_submit_cl_ioctl, + TP_PROTO(struct drm_device *dev, u32 ct1qba, u32 ct1qea), + TP_ARGS(dev, ct1qba, ct1qea), + + TP_STRUCT__entry( + __field(u32, dev) + __field(u32, ct1qba) + __field(u32, ct1qea) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + __entry->ct1qba = ct1qba; + __entry->ct1qea = ct1qea; + ), + + TP_printk("dev=%u, RCL 0x%08x..0x%08x", + __entry->dev, + __entry->ct1qba, + __entry->ct1qea) +); + TRACE_EVENT(v3d_submit_cl, TP_PROTO(struct drm_device *dev, bool is_render, uint64_t seqno, @@ -42,6 +64,85 @@ TRACE_EVENT(v3d_submit_cl, __entry->ctnqea) );
+TRACE_EVENT(v3d_bcl_irq, + TP_PROTO(struct drm_device *dev, + uint64_t seqno), + TP_ARGS(dev, seqno), + + TP_STRUCT__entry( + __field(u32, dev) + __field(u64, seqno) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + __entry->seqno = seqno; + ), + + TP_printk("dev=%u, seqno=%llu", + __entry->dev, + __entry->seqno) +); + +TRACE_EVENT(v3d_rcl_irq, + TP_PROTO(struct drm_device *dev, + uint64_t seqno), + TP_ARGS(dev, seqno), + + TP_STRUCT__entry( + __field(u32, dev) + __field(u64, seqno) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + __entry->seqno = seqno; + ), + + TP_printk("dev=%u, seqno=%llu", + __entry->dev, + __entry->seqno) +); + +TRACE_EVENT(v3d_tfu_irq, + TP_PROTO(struct drm_device *dev, + uint64_t seqno), + TP_ARGS(dev, seqno), + + TP_STRUCT__entry( + __field(u32, dev) + __field(u64, seqno) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + __entry->seqno = seqno; + ), + + TP_printk("dev=%u, seqno=%llu", + __entry->dev, + __entry->seqno) +); + +TRACE_EVENT(v3d_submit_tfu_ioctl, + TP_PROTO(struct drm_device *dev, u32 iia), + TP_ARGS(dev, iia), + + TP_STRUCT__entry( + __field(u32, dev) + __field(u32, iia) + ), + + TP_fast_assign( + __entry->dev = dev->primary->index; + __entry->iia = iia; + ), + + TP_printk("dev=%u, IIA 0x%08x", + __entry->dev, + __entry->iia) +); + TRACE_EVENT(v3d_submit_tfu, TP_PROTO(struct drm_device *dev, uint64_t seqno),
On Sat, 1 Dec 2018 at 00:58, Eric Anholt eric@anholt.net wrote:
The core scheduler tells us when the job is pushed to the scheduler's queue, and I had the job_run functions saying when they actually queue the job to the hardware. By adding tracepoints for the very top of the ioctls and the IRQs signaling job completion, "perf record -a -e v3d:.* -e gpu_scheduler:.* <job>; perf script" gets you a pretty decent timeline.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
drivers/gpu/drm/v3d/v3d_gem.c | 4 ++ drivers/gpu/drm/v3d/v3d_irq.c | 19 +++++- drivers/gpu/drm/v3d/v3d_trace.h | 101 ++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c index 0bd6892e3044..2f82e2724b1f 100644 --- a/drivers/gpu/drm/v3d/v3d_gem.c +++ b/drivers/gpu/drm/v3d/v3d_gem.c @@ -484,6 +484,8 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data, struct drm_syncobj *sync_out; int ret = 0;
trace_v3d_submit_cl_ioctl(&v3d->drm, args->rcl_start, args->rcl_end);
if (args->pad != 0) { DRM_INFO("pad must be zero: %d\n", args->pad); return -EINVAL;
@@ -611,6 +613,8 @@ v3d_submit_tfu_ioctl(struct drm_device *dev, void *data, int ret = 0; int bo_count;
trace_v3d_submit_tfu_ioctl(&v3d->drm, args->iia);
job = kcalloc(1, sizeof(*job), GFP_KERNEL); if (!job) return -ENOMEM;
diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c index dd7a7b0bd5a1..69338da70ddc 100644 --- a/drivers/gpu/drm/v3d/v3d_irq.c +++ b/drivers/gpu/drm/v3d/v3d_irq.c @@ -15,6 +15,7 @@
#include "v3d_drv.h" #include "v3d_regs.h" +#include "v3d_trace.h"
#define V3D_CORE_IRQS ((u32)(V3D_INT_OUTOMEM | \ V3D_INT_FLDONE | \ @@ -88,12 +89,20 @@ v3d_irq(int irq, void *arg) }
if (intsts & V3D_INT_FLDONE) {
dma_fence_signal(v3d->bin_job->bin.done_fence);
struct v3d_fence *fence =
to_v3d_fence(v3d->bin_job->bin.done_fence);
trace_v3d_bcl_irq(&v3d->drm, fence->seqno);
dma_fence_signal(&fence->base); status = IRQ_HANDLED; } if (intsts & V3D_INT_FRDONE) {
dma_fence_signal(v3d->render_job->render.done_fence);
struct v3d_fence *fence =
to_v3d_fence(v3d->render_job->render.done_fence);
trace_v3d_rcl_irq(&v3d->drm, fence->seqno);
dma_fence_signal(&fence->base); status = IRQ_HANDLED; }
@@ -119,7 +128,11 @@ v3d_hub_irq(int irq, void *arg) V3D_WRITE(V3D_HUB_INT_CLR, intsts);
if (intsts & V3D_HUB_INT_TFUC) {
dma_fence_signal(v3d->tfu_job->done_fence);
struct v3d_fence *fence =
to_v3d_fence(v3d->tfu_job->done_fence);
trace_v3d_tfu_irq(&v3d->drm, fence->seqno);
dma_fence_signal(&fence->base); status = IRQ_HANDLED; }
diff --git a/drivers/gpu/drm/v3d/v3d_trace.h b/drivers/gpu/drm/v3d/v3d_trace.h index f54ed9cd3444..edd984afa33f 100644 --- a/drivers/gpu/drm/v3d/v3d_trace.h +++ b/drivers/gpu/drm/v3d/v3d_trace.h @@ -12,6 +12,28 @@ #define TRACE_SYSTEM v3d #define TRACE_INCLUDE_FILE v3d_trace
+TRACE_EVENT(v3d_submit_cl_ioctl,
TP_PROTO(struct drm_device *dev, u32 ct1qba, u32 ct1qea),
TP_ARGS(dev, ct1qba, ct1qea),
TP_STRUCT__entry(
__field(u32, dev)
__field(u32, ct1qba)
__field(u32, ct1qea)
),
TP_fast_assign(
__entry->dev = dev->primary->index;
__entry->ct1qba = ct1qba;
__entry->ct1qea = ct1qea;
),
TP_printk("dev=%u, RCL 0x%08x..0x%08x",
__entry->dev,
__entry->ct1qba,
__entry->ct1qea)
+);
TRACE_EVENT(v3d_submit_cl, TP_PROTO(struct drm_device *dev, bool is_render, uint64_t seqno, @@ -42,6 +64,85 @@ TRACE_EVENT(v3d_submit_cl, __entry->ctnqea) );
+TRACE_EVENT(v3d_bcl_irq,
TP_PROTO(struct drm_device *dev,
uint64_t seqno),
TP_ARGS(dev, seqno),
TP_STRUCT__entry(
__field(u32, dev)
__field(u64, seqno)
),
TP_fast_assign(
__entry->dev = dev->primary->index;
__entry->seqno = seqno;
),
TP_printk("dev=%u, seqno=%llu",
__entry->dev,
__entry->seqno)
+);
+TRACE_EVENT(v3d_rcl_irq,
TP_PROTO(struct drm_device *dev,
uint64_t seqno),
TP_ARGS(dev, seqno),
TP_STRUCT__entry(
__field(u32, dev)
__field(u64, seqno)
),
TP_fast_assign(
__entry->dev = dev->primary->index;
__entry->seqno = seqno;
),
TP_printk("dev=%u, seqno=%llu",
__entry->dev,
__entry->seqno)
+);
+TRACE_EVENT(v3d_tfu_irq,
TP_PROTO(struct drm_device *dev,
uint64_t seqno),
TP_ARGS(dev, seqno),
TP_STRUCT__entry(
__field(u32, dev)
__field(u64, seqno)
),
TP_fast_assign(
__entry->dev = dev->primary->index;
__entry->seqno = seqno;
),
TP_printk("dev=%u, seqno=%llu",
__entry->dev,
__entry->seqno)
+);
+TRACE_EVENT(v3d_submit_tfu_ioctl,
TP_PROTO(struct drm_device *dev, u32 iia),
TP_ARGS(dev, iia),
TP_STRUCT__entry(
__field(u32, dev)
__field(u32, iia)
),
TP_fast_assign(
__entry->dev = dev->primary->index;
__entry->iia = iia;
),
TP_printk("dev=%u, IIA 0x%08x",
__entry->dev,
__entry->iia)
+);
TRACE_EVENT(v3d_submit_tfu, TP_PROTO(struct drm_device *dev, uint64_t seqno), -- 2.20.0.rc1
We shouldn't be returning v3d-render in for our new queue.
Signed-off-by: Eric Anholt eric@anholt.net Fixes: 83d5139982db ("drm/v3d: Add support for submitting jobs to the TFU.") --- drivers/gpu/drm/v3d/v3d_fence.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_fence.c b/drivers/gpu/drm/v3d/v3d_fence.c index 50bfcf9a8a1a..b0a2a1ae2eb1 100644 --- a/drivers/gpu/drm/v3d/v3d_fence.c +++ b/drivers/gpu/drm/v3d/v3d_fence.c @@ -29,10 +29,16 @@ static const char *v3d_fence_get_timeline_name(struct dma_fence *fence) { struct v3d_fence *f = to_v3d_fence(fence);
- if (f->queue == V3D_BIN) + switch (f->queue) { + case V3D_BIN: return "v3d-bin"; - else + case V3D_RENDER: return "v3d-render"; + case V3D_TFU: + return "v3d-tfu"; + default: + return NULL; + } }
const struct dma_fence_ops v3d_fence_ops = {
On Sat, 1 Dec 2018 at 00:58, Eric Anholt eric@anholt.net wrote:
We shouldn't be returning v3d-render in for our new queue.
Signed-off-by: Eric Anholt eric@anholt.net
Reviewed-by: Dave Emett david.emett@broadcom.com
Fixes: 83d5139982db ("drm/v3d: Add support for submitting jobs to the TFU.")
drivers/gpu/drm/v3d/v3d_fence.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_fence.c b/drivers/gpu/drm/v3d/v3d_fence.c index 50bfcf9a8a1a..b0a2a1ae2eb1 100644 --- a/drivers/gpu/drm/v3d/v3d_fence.c +++ b/drivers/gpu/drm/v3d/v3d_fence.c @@ -29,10 +29,16 @@ static const char *v3d_fence_get_timeline_name(struct dma_fence *fence) { struct v3d_fence *f = to_v3d_fence(fence);
if (f->queue == V3D_BIN)
switch (f->queue) {
case V3D_BIN: return "v3d-bin";
else
case V3D_RENDER: return "v3d-render";
case V3D_TFU:
return "v3d-tfu";
default:
return NULL;
}
}
const struct dma_fence_ops v3d_fence_ops = {
2.20.0.rc1
- The caches (L1T, slice, and L2T) will be flushed before the job
- executes. The TLB writes are guaranteed to have been flushed by
I would say before *each* job executes, as the caches are flushed before both bin and render. I wouldn't say "the caches" as not all of the V3D caches are flushed before executing a control list. In particular, the VCD cache is not cleared by the kernel driver (not even sure if there is a register interface to do this); it is expected that the control list will do this itself (using the CLEAR_VCD_CACHE instruction). On 3.3 and earlier there is a separate L2C for instructions/uniforms and a GCA. These do need to be flushed, and it looks like they are, so they should be mentioned here.
On Mon, 3 Dec 2018 at 17:22, Dave Emett david.emett@broadcom.com wrote:
- The caches (L1T, slice, and L2T) will be flushed before the job
- executes. The TLB writes are guaranteed to have been flushed by
I would say before *each* job executes, as the caches are flushed before both bin and render. I wouldn't say "the caches" as not all of the V3D caches are flushed before executing a control list. In particular, the VCD cache is not cleared by the kernel driver (not even sure if there is a register interface to do this); it is expected that the control list will do this itself (using the CLEAR_VCD_CACHE instruction). On 3.3 and earlier there is a separate L2C for instructions/uniforms and a GCA. These do need to be flushed, and it looks like they are, so they should be mentioned here.
Correction: on *3.2* and earlier there is a separate L2C for instructions/uniforms. On 3.3 and earlier there is a GCA.
It looks like we're currently unconditionally writing the L2C clear registers. We should really only be doing this on 3.2 and earlier.
Dave Emett david.emett@broadcom.com writes:
On Mon, 3 Dec 2018 at 17:22, Dave Emett david.emett@broadcom.com wrote:
- The caches (L1T, slice, and L2T) will be flushed before the job
- executes. The TLB writes are guaranteed to have been flushed by
I would say before *each* job executes, as the caches are flushed before both bin and render. I wouldn't say "the caches" as not all of the V3D caches are flushed before executing a control list. In particular, the VCD cache is not cleared by the kernel driver (not even sure if there is a register interface to do this); it is expected that the control list will do this itself (using the CLEAR_VCD_CACHE instruction). On 3.3 and earlier there is a separate L2C for instructions/uniforms and a GCA. These do need to be flushed, and it looks like they are, so they should be mentioned here.
Correction: on *3.2* and earlier there is a separate L2C for instructions/uniforms. On 3.3 and earlier there is a GCA.
It looks like we're currently unconditionally writing the L2C clear registers. We should really only be doing this on 3.2 and earlier.
OK, I've pushed the timeline name and tracing to drm-misc-next, and I'll resend the rest with more comments and the L2C change and outside-in invalidation once I'm far enough into this CTS run that I'm confident.
dri-devel@lists.freedesktop.org