From: Rob Clark robdclark@chromium.org
Because system suspend uses pm_runtime_force_suspend(), we can't rely on runpm refcounts to protect us if the GPU is active, etc. Fortunately, the GPU is *usually* idle when system suspend is triggered. But that isn't quite good enough.
The first patch attempts to block for a modest amount of time until GPU is idle (and failing that, returns -EBUSY). We could have taken a slightly easier approach and just returned -EBUSY if GPU is not idle, but that would cause system suspend to fail. And no one likes pulling a hot laptop out of their backpack.
The second patch avoids getting devfreq callbacks after suspend, since pm_runtime_force_suspend() breaks the pm_runtime_get_if_in_use() tricks used to deal with devfreq callbacks while suspended.
Rob Clark (2):
  drm/msm/gpu: Wait for idle before suspending
  drm/msm/gpu: Cancel idle/boost work on suspend
drivers/gpu/drm/msm/adreno/adreno_device.c | 9 +++++++++ drivers/gpu/drm/msm/msm_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_gpu.h | 3 +++ drivers/gpu/drm/msm/msm_gpu_devfreq.c | 21 +++++++++++++++++++-- 4 files changed, 34 insertions(+), 2 deletions(-)
From: Rob Clark robdclark@chromium.org
System suspend uses pm_runtime_force_suspend(), which cheekily bypasses the runpm reference counts. This doesn't actually work so well when the GPU is active. So add a reasonable delay waiting for the GPU to become idle.
Alternatively we could just return -EBUSY in this case, but that has the disadvantage of causing system suspend to fail.
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/adreno/adreno_device.c | 9 +++++++++ drivers/gpu/drm/msm/msm_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_gpu.h | 3 +++ 3 files changed, 15 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 93005839b5da..b677ca3fd75e 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -611,6 +611,15 @@ static int adreno_resume(struct device *dev) static int adreno_suspend(struct device *dev) { struct msm_gpu *gpu = dev_to_gpu(dev); + int ret = 0; + + ret = wait_event_timeout(gpu->retire_event, + !msm_gpu_active(gpu), + msecs_to_jiffies(1000)); + if (ret == 0) { + dev_err(dev, "Timeout waiting for GPU to suspend\n"); + return -EBUSY; + }
return gpu->funcs->pm_suspend(gpu); } diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 0f78c2615272..2c1049c0ea14 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -703,6 +703,8 @@ static void retire_submits(struct msm_gpu *gpu) } } } + + wake_up_all(&gpu->retire_event); }
static void retire_worker(struct kthread_work *work) @@ -848,6 +850,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, INIT_LIST_HEAD(&gpu->active_list); mutex_init(&gpu->active_lock); mutex_init(&gpu->lock); + init_waitqueue_head(&gpu->retire_event); kthread_init_work(&gpu->retire_work, retire_worker); kthread_init_work(&gpu->recover_work, recover_worker); kthread_init_work(&gpu->fault_work, fault_worker); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 445c6bfd4b6b..92aa1e9196c6 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -230,6 +230,9 @@ struct msm_gpu { /* work for handling GPU recovery: */ struct kthread_work recover_work;
+ /** retire_event: notified when submits are retired: */ + wait_queue_head_t retire_event; + /* work for handling active-list retiring: */ struct kthread_work retire_work;
On Thu 06 Jan 10:14 PST 2022, Rob Clark wrote:
From: Rob Clark robdclark@chromium.org
System suspend uses pm_runtime_force_suspend(), which cheekily bypasses the runpm reference counts. This doesn't actually work so well when the GPU is active. So add a reasonable delay waiting for the GPU to become idle.
Alternatively we could just return -EBUSY in this case, but that has the disadvantage of causing system suspend to fail.
Reviewed-by: Bjorn Andersson bjorn.andersson@linaro.org
Regards, Bjorn
Signed-off-by: Rob Clark robdclark@chromium.org
drivers/gpu/drm/msm/adreno/adreno_device.c | 9 +++++++++ drivers/gpu/drm/msm/msm_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_gpu.h | 3 +++ 3 files changed, 15 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 93005839b5da..b677ca3fd75e 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -611,6 +611,15 @@ static int adreno_resume(struct device *dev) static int adreno_suspend(struct device *dev) { struct msm_gpu *gpu = dev_to_gpu(dev);
int ret = 0;
ret = wait_event_timeout(gpu->retire_event,
!msm_gpu_active(gpu),
msecs_to_jiffies(1000));
if (ret == 0) {
dev_err(dev, "Timeout waiting for GPU to suspend\n");
return -EBUSY;
}
return gpu->funcs->pm_suspend(gpu);
} diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 0f78c2615272..2c1049c0ea14 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -703,6 +703,8 @@ static void retire_submits(struct msm_gpu *gpu) } } }
+ wake_up_all(&gpu->retire_event);
}
static void retire_worker(struct kthread_work *work) @@ -848,6 +850,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, INIT_LIST_HEAD(&gpu->active_list); mutex_init(&gpu->active_lock); mutex_init(&gpu->lock);
+ init_waitqueue_head(&gpu->retire_event); kthread_init_work(&gpu->retire_work, retire_worker); kthread_init_work(&gpu->recover_work, recover_worker); kthread_init_work(&gpu->fault_work, fault_worker);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 445c6bfd4b6b..92aa1e9196c6 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -230,6 +230,9 @@ struct msm_gpu { /* work for handling GPU recovery: */ struct kthread_work recover_work;
+ /** retire_event: notified when submits are retired: */
+ wait_queue_head_t retire_event;
+ /* work for handling active-list retiring: */ struct kthread_work retire_work;
-- 2.33.1
Il 06/01/22 19:14, Rob Clark ha scritto:
From: Rob Clark robdclark@chromium.org
System suspend uses pm_runtime_force_suspend(), which cheekily bypasses the runpm reference counts. This doesn't actually work so well when the GPU is active. So add a reasonable delay waiting for the GPU to become idle.
Alternatively we could just return -EBUSY in this case, but that has the disadvantage of causing system suspend to fail.
Signed-off-by: Rob Clark robdclark@chromium.org
Reviewed-by: AngeloGioacchino Del Regno angelogioacchino.delregno@collabora.com
Quoting Rob Clark (2022-01-06 10:14:46)
From: Rob Clark robdclark@chromium.org
System suspend uses pm_runtime_force_suspend(), which cheekily bypasses the runpm reference counts. This doesn't actually work so well when the GPU is active. So add a reasonable delay waiting for the GPU to become idle.
Maybe also say:
Failure to wait during system wide suspend leads to GPU hangs seen on resume.
Alternatively we could just return -EBUSY in this case, but that has the disadvantage of causing system suspend to fail.
Signed-off-by: Rob Clark robdclark@chromium.org
drivers/gpu/drm/msm/adreno/adreno_device.c | 9 +++++++++ drivers/gpu/drm/msm/msm_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_gpu.h | 3 +++ 3 files changed, 15 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 93005839b5da..b677ca3fd75e 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -611,6 +611,15 @@ static int adreno_resume(struct device *dev) static int adreno_suspend(struct device *dev) { struct msm_gpu *gpu = dev_to_gpu(dev);
int ret = 0;
Please don't assign and then immediately overwrite.
ret = wait_event_timeout(gpu->retire_event,
!msm_gpu_active(gpu),
msecs_to_jiffies(1000));
if (ret == 0) {
The usual pattern is
	long timeleft;

	timeleft = wait_event_timeout(...)
	if (!timeleft) {
		/* no time left; timed out */
Can it be the same pattern here? It helps because people sometimes forget that wait_event_timeout() returns the time that is left and not an error code when it times out.
dev_err(dev, "Timeout waiting for GPU to suspend\n");
return -EBUSY;
} return gpu->funcs->pm_suspend(gpu);
}
On Fri, Jan 7, 2022 at 4:27 PM Stephen Boyd swboyd@chromium.org wrote:
Quoting Rob Clark (2022-01-06 10:14:46)
From: Rob Clark robdclark@chromium.org
System suspend uses pm_runtime_force_suspend(), which cheekily bypasses the runpm reference counts. This doesn't actually work so well when the GPU is active. So add a reasonable delay waiting for the GPU to become idle.
Maybe also say:
Failure to wait during system wide suspend leads to GPU hangs seen on resume.
The fallout can actually be a lot more than just GPU hangs. That is just the case that is easy (for us) to observe, because the crash logging captures them. But sync/async external aborts are also possible, and I think even just undefined behavior (i.e., I think if the timing works out right, it can survive but just "lose" rendering that hadn't completed yet).
Alternatively we could just return -EBUSY in this case, but that has the disadvantage of causing system suspend to fail.
Signed-off-by: Rob Clark robdclark@chromium.org
drivers/gpu/drm/msm/adreno/adreno_device.c | 9 +++++++++ drivers/gpu/drm/msm/msm_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_gpu.h | 3 +++ 3 files changed, 15 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 93005839b5da..b677ca3fd75e 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -611,6 +611,15 @@ static int adreno_resume(struct device *dev) static int adreno_suspend(struct device *dev) { struct msm_gpu *gpu = dev_to_gpu(dev);
int ret = 0;
Please don't assign and then immediately overwrite.
ret = wait_event_timeout(gpu->retire_event,
!msm_gpu_active(gpu),
msecs_to_jiffies(1000));
if (ret == 0) {
The usual pattern is
	long timeleft;

	timeleft = wait_event_timeout(...)
	if (!timeleft) {
		/* no time left; timed out */
Can it be the same pattern here? It helps because people sometimes forget that wait_event_timeout() returns the time that is left and not an error code when it times out.
ok, I'll update in v2..
BR, -R
dev_err(dev, "Timeout waiting for GPU to suspend\n");
return -EBUSY;
} return gpu->funcs->pm_suspend(gpu);
}
From: Rob Clark robdclark@chromium.org
With system suspend using pm_runtime_force_suspend() we can't rely on the pm_runtime_get_if_in_use() trick to deal with devfreq callbacks after (or racing with) suspend. So flush any pending idle or boost work in the suspend path.
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_gpu_devfreq.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gpu_devfreq.c b/drivers/gpu/drm/msm/msm_gpu_devfreq.c index 62405e980925..9bf319be11f6 100644 --- a/drivers/gpu/drm/msm/msm_gpu_devfreq.c +++ b/drivers/gpu/drm/msm/msm_gpu_devfreq.c @@ -133,6 +133,18 @@ void msm_devfreq_init(struct msm_gpu *gpu) CLOCK_MONOTONIC, HRTIMER_MODE_REL); }
+static void cancel_idle_work(struct msm_gpu_devfreq *df) +{ + hrtimer_cancel(&df->idle_work.timer); + kthread_cancel_work_sync(&df->idle_work.work); +} + +static void cancel_boost_work(struct msm_gpu_devfreq *df) +{ + hrtimer_cancel(&df->boost_work.timer); + kthread_cancel_work_sync(&df->boost_work.work); +} + void msm_devfreq_cleanup(struct msm_gpu *gpu) { struct msm_gpu_devfreq *df = &gpu->devfreq; @@ -152,7 +164,12 @@ void msm_devfreq_resume(struct msm_gpu *gpu)
void msm_devfreq_suspend(struct msm_gpu *gpu) { - devfreq_suspend_device(gpu->devfreq.devfreq); + struct msm_gpu_devfreq *df = &gpu->devfreq; + + devfreq_suspend_device(df->devfreq); + + cancel_idle_work(df); + cancel_boost_work(df); }
static void msm_devfreq_boost_work(struct kthread_work *work) @@ -196,7 +213,7 @@ void msm_devfreq_active(struct msm_gpu *gpu) /* * Cancel any pending transition to idle frequency: */ - hrtimer_cancel(&df->idle_work.timer); + cancel_idle_work(df);
idle_time = ktime_to_ms(ktime_sub(ktime_get(), df->idle_time));
dri-devel@lists.freedesktop.org