Hi,
The current V3D scheduler has two issues: CSD jobs are resubmitted regardless of the previous timed-out flag, and the timer is not restarted for timed-out CL/CSD jobs (which we wish to continue running). The second one is due to a DRM scheduler API change and is fixed in a similar way to [1]. A kernel command-line option to set the default timeout value is also added.
I tested this patchset with Piglit and our CSD programs in [2]. Because it is hard to get the current upstream kernel to work on BCM2711, I used the kernel from the rpi-5.8.y tree [3]. There are still problems: some Piglit tests take longer to finish running (3610 minutes to 3650 minutes in total), and some result in invalid memory read errors for unknown reasons:
[17086.230959] v3d fec00000.v3d: MMU error from client CLE (4) at 0xac1000, pte invalid
[17086.238722] v3d fec00000.v3d: MMU error from client CLE (4) at 0x1b61000, pte invalid
[18643.303188] v3d fec00000.v3d: MMU error from client L2T (0) at 0x15bff00, pte invalid
[18655.933748] v3d fec00000.v3d: MMU error from client L2T (0) at 0x15bff00, pte invalid
However, most of the CL/CSD programs are now working happily without kernel warnings and errors.
Regards, Sugizaki
(Re-sending this series because I failed to post the previous one to dri-devel.)
[1] https://patchwork.kernel.org/patch/11732895/ [2] https://github.com/Idein/py-videocore6 [3] https://github.com/raspberrypi/linux/tree/rpi-5.8.y
Yukimasa Sugizaki (3): drm/v3d: Don't resubmit guilty CSD jobs drm/v3d: Correctly restart the timer when progress is made drm/v3d: Add job timeout module param
drivers/gpu/drm/v3d/v3d_sched.c | 62 +++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 11 deletions(-)
-- 2.7.4
The previous code lacks a check for the timeout error set by drm_sched_resubmit_jobs(), which results in an infinite GPU reset loop once a timeout occurs:
[ 178.799106] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* Resetting GPU for hang.
[ 178.807836] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* V3D_ERR_STAT: 0x00001000
[ 179.839132] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* Resetting GPU for hang.
[ 179.847865] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* V3D_ERR_STAT: 0x00001000
[ 180.879146] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* Resetting GPU for hang.
[ 180.887925] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* V3D_ERR_STAT: 0x00001000
[ 181.919188] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* Resetting GPU for hang.
[ 181.928002] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* V3D_ERR_STAT: 0x00001000
...
This commit adds the check for timeout as in v3d_{bin,render}_job_run():
[ 66.408962] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* Resetting GPU for hang.
[ 66.417734] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* V3D_ERR_STAT: 0x00001000
[ 66.428296] [drm] Skipping CSD job resubmission due to previous error (-125)
Here, -125 is -ECANCELED. Note that users currently have no way to check whether a timeout has occurred other than inspecting dmesg.
Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp> --- drivers/gpu/drm/v3d/v3d_sched.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 0747614a78f0..001216f22017 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -226,6 +226,17 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) struct dma_fence *fence; int i;
+ /* This error is set to -ECANCELED by drm_sched_resubmit_jobs() if this + * job timed out more than sched_job->sched->hang_limit times. + */ + int error = sched_job->s_fence->finished.error; + + if (unlikely(error < 0)) { + DRM_WARN("Skipping CSD job resubmission due to previous error (%d)\n", + error); + return ERR_PTR(error); + } + v3d->csd_job = job;
v3d_invalidate_caches(v3d); -- 2.7.4
The V3D scheduler wants a timed-out job to continue running if it made progress. However, the current DRM scheduler removes the timed-out job from ring_mirror_list, so the timer is not restarted automatically, resulting in an infinite timeout. We need to stop and restart the DRM scheduler to rearm the timer.
Fixes: 135517d3565b ("drm/scheduler: Avoid accessing freed bad job.") Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp> --- drivers/gpu/drm/v3d/v3d_sched.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+)
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 001216f22017..feef0c749e68 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -312,9 +312,24 @@ v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q, u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q)); u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));
+ /* If we've made progress, skip reset and let the timer get + * rearmed. + */ if (*timedout_ctca != ctca || *timedout_ctra != ctra) { *timedout_ctca = ctca; *timedout_ctra = ctra; + + /* Because the timed-out job has been removed from + * ring_mirror_list in drm_sched_job_timedout(), we need to + * stop and restart the scheduler to rearm the timer. + * Holding the reset_lock is necessary for concurrent + * v3d_gpu_reset_for_timeout(). + */ + mutex_lock(&v3d->reset_lock); + drm_sched_stop(sched_job->sched, sched_job); + drm_sched_start(sched_job->sched, sched_job); + mutex_unlock(&v3d->reset_lock); + return; }
@@ -359,6 +374,18 @@ v3d_csd_job_timedout(struct drm_sched_job *sched_job) */ if (job->timedout_batches != batches) { job->timedout_batches = batches; + + /* Because the timed-out job has been removed from + * ring_mirror_list in drm_sched_job_timedout(), we need to + * stop and restart the scheduler to rearm the timer. + * Holding the reset_lock is necessary for concurrent + * v3d_gpu_reset_for_timeout(). + */ + mutex_lock(&v3d->reset_lock); + drm_sched_stop(sched_job->sched, sched_job); + drm_sched_start(sched_job->sched, sched_job); + mutex_unlock(&v3d->reset_lock); + return; }
-- 2.7.4
The default timeout is 500 ms, which is too short for some workloads, including Piglit. Adding this parameter helps users run heavier tasks.
Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp> --- drivers/gpu/drm/v3d/v3d_sched.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index feef0c749e68..983efb018560 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -19,11 +19,17 @@ */
#include <linux/kthread.h> +#include <linux/moduleparam.h>
#include "v3d_drv.h" #include "v3d_regs.h" #include "v3d_trace.h"
+static uint timeout = 500; +module_param(timeout, uint, 0444); +MODULE_PARM_DESC(timeout, + "Timeout for a job in ms (0 means infinity and default is 500 ms)"); + static struct v3d_job * to_v3d_job(struct drm_sched_job *sched_job) { @@ -432,13 +438,13 @@ v3d_sched_init(struct v3d_dev *v3d) { int hw_jobs_limit = 1; int job_hang_limit = 0; - int hang_limit_ms = 500; + long timeout_jiffies = timeout ? + msecs_to_jiffies(timeout) : MAX_SCHEDULE_TIMEOUT; int ret;
ret = drm_sched_init(&v3d->queue[V3D_BIN].sched, &v3d_bin_sched_ops, - hw_jobs_limit, job_hang_limit, - msecs_to_jiffies(hang_limit_ms), + hw_jobs_limit, job_hang_limit, timeout_jiffies, "v3d_bin"); if (ret) { dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret); @@ -447,8 +453,7 @@ v3d_sched_init(struct v3d_dev *v3d)
ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched, &v3d_render_sched_ops, - hw_jobs_limit, job_hang_limit, - msecs_to_jiffies(hang_limit_ms), + hw_jobs_limit, job_hang_limit, timeout_jiffies, "v3d_render"); if (ret) { dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.", @@ -459,8 +464,7 @@ v3d_sched_init(struct v3d_dev *v3d)
ret = drm_sched_init(&v3d->queue[V3D_TFU].sched, &v3d_tfu_sched_ops, - hw_jobs_limit, job_hang_limit, - msecs_to_jiffies(hang_limit_ms), + hw_jobs_limit, job_hang_limit, timeout_jiffies, "v3d_tfu"); if (ret) { dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.", @@ -472,8 +476,7 @@ v3d_sched_init(struct v3d_dev *v3d) if (v3d_has_csd(v3d)) { ret = drm_sched_init(&v3d->queue[V3D_CSD].sched, &v3d_csd_sched_ops, - hw_jobs_limit, job_hang_limit, - msecs_to_jiffies(hang_limit_ms), + hw_jobs_limit, job_hang_limit, timeout_jiffies, "v3d_csd"); if (ret) { dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.", @@ -484,8 +487,7 @@ v3d_sched_init(struct v3d_dev *v3d)
ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched, &v3d_cache_clean_sched_ops, - hw_jobs_limit, job_hang_limit, - msecs_to_jiffies(hang_limit_ms), + hw_jobs_limit, job_hang_limit, timeout_jiffies, "v3d_cache_clean"); if (ret) { dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.", -- 2.7.4
dri-devel@lists.freedesktop.org