The major enhancement in this series is support for a minimal gmu coredump which can be captured inline instead of through our usual recover worker. It is helpful in the case of gmu errors during the gpu wake-up/suspend path and helps to capture a snapshot of the gmu before we do a suspend. I had to introduce a lock to synchronize the crashstate because runtime-suspend can happen from an asynchronous RPM thread.
Apart from this, there are some improvements to gracefully handle the gmu errors by propagating the error back to parent or by retrying. Also, a few patches to fix some trivial bugs in the related code.
Akhil P Oommen (10): drm/msm/a6xx: Add helper to check smmu is stalled drm/msm/a6xx: Send NMI to gmu when it is hung drm/msm/a6xx: Avoid gmu lock in pm ops drm/msm/a6xx: Enhance debugging of gmu faults drm/msm: Do recovery on hw_init failure drm/msm/a6xx: Propagate OOB set error drm/msm/adreno: Retry on gpu resume failure drm/msm/a6xx: Remove clk votes on failure drm/msm: Remove pm_runtime_get() from msm_job_run() drm/msm/a6xx: Free gmu_debug crashstate bo
drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 89 +++++++++++++++++++++++------ drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 1 + drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 31 +++++++--- drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 +- drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 79 +++++++++++++++++++++---- drivers/gpu/drm/msm/adreno/adreno_device.c | 10 +++- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 10 +++- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 2 + drivers/gpu/drm/msm/msm_gpu.c | 28 ++++++++- drivers/gpu/drm/msm/msm_gpu.h | 11 ++-- drivers/gpu/drm/msm/msm_ringbuffer.c | 4 -- 11 files changed, 218 insertions(+), 51 deletions(-)
Add a helper function to check for a stalled smmu, and avoid reading the RBBM_STATUS3 register (which is in the GX domain) without first ensuring GX is ON.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 8 +++++++- drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 + drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 4 +--- 3 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 7d23c74..3faf551 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -14,6 +14,12 @@
#define GPU_PAS_ID 13
+bool a6xx_is_smmu_stalled(struct msm_gpu *gpu) +{ + return !!(gpu_read(gpu, REG_A6XX_RBBM_STATUS3) & + A6XX_RBBM_STATUS3_SMMU_STALLED_ON_FAULT); +} + static inline bool _a6xx_check_idle(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -1346,7 +1352,7 @@ static void a6xx_fault_detect_irq(struct msm_gpu *gpu) * to otherwise resume normally rather than killing the submit, so * just bail. */ - if (gpu_read(gpu, REG_A6XX_RBBM_STATUS3) & A6XX_RBBM_STATUS3_SMMU_STALLED_ON_FAULT) + if (a6xx_is_smmu_stalled(gpu)) return;
/* diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 86e0a7c..675aef0 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -85,5 +85,6 @@ void a6xx_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu); int a6xx_gpu_state_put(struct msm_gpu_state *state); +bool a6xx_is_smmu_stalled(struct msm_gpu *gpu);
#endif /* __A6XX_GPU_H__ */ diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index 55f4433..7de9d2f 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -971,8 +971,6 @@ struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu) struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); struct a6xx_gpu_state *a6xx_state = kzalloc(sizeof(*a6xx_state), GFP_KERNEL); - bool stalled = !!(gpu_read(gpu, REG_A6XX_RBBM_STATUS3) & - A6XX_RBBM_STATUS3_SMMU_STALLED_ON_FAULT);
if (!a6xx_state) return ERR_PTR(-ENOMEM); @@ -1003,7 +1001,7 @@ struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu) * write out GPU state, so we need to skip this when the SMMU is * stalled in response to an iova fault */ - if (!stalled && !gpu->needs_hw_init && + if (!a6xx_is_smmu_stalled(gpu) && !a6xx_crashdumper_init(gpu, &_dumper)) { dumper = &_dumper; }
While capturing gmu state, first send an NMI to gmu when it is hung. This helps to move gmu to a safe state.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 37 +++++++++++++++++++++++++++++ drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 1 + drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 14 ++++++++++- 3 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 3e325e2..f208a81 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -14,6 +14,37 @@ #include "msm_gpu_trace.h" #include "msm_mmu.h"
+void a6xx_gmu_send_nmi(struct a6xx_gmu *gmu) +{ + struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); + struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; + struct msm_gpu *gpu = &adreno_gpu->base; + u32 val; + + if (a6xx_gmu_gx_is_on(gmu) && a6xx_is_smmu_stalled(gpu)) { + DRM_DEV_ERROR(gmu->dev, + "Skipping GMU NMI since SMMU is stalled\n"); + return; + } + + /* Don't retrigger NMI if gmu reset is already active */ + val = gmu_read(gmu, REG_A6XX_GMU_CM3_FW_INIT_RESULT); + if (val & 0xE00) + return; + + /* Mask all interrupts from GMU first */ + gmu_write(gmu, REG_A6XX_GMU_GMU2HOST_INTR_MASK, 0xFFFFFFFF); + + /* Trigger NMI to make gmu save its internal state to ddr */ + val = gmu_read(gmu, REG_A6XX_GMU_CM3_CFG); + gmu_write(gmu, REG_A6XX_GMU_CM3_CFG, val | BIT(9)); + + /* Barrier to ensure write is posted before we proceed */ + wmb(); + + /* Small delay to ensure state copy to ddr is complete at GMU */ + udelay(200); +} + static void a6xx_gmu_fault(struct a6xx_gmu *gmu) { struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); @@ -790,6 +821,12 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state) gmu_write(gmu, REG_A6XX_GMU_CM3_FW_INIT_RESULT, 0); gmu_write(gmu, REG_A6XX_GMU_CM3_BOOT_CONFIG, 0x02);
+ /* + * Make sure that the NMI bit is cleared by configuring the reset value + * here + */ + gmu_write(gmu, REG_A6XX_GMU_CM3_CFG, 0x4052); + /* Write the iova of the HFI table */ gmu_write(gmu, REG_A6XX_GMU_HFI_QTBL_ADDR, gmu->hfi.iova); gmu_write(gmu, REG_A6XX_GMU_HFI_QTBL_INFO, 1); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index 84bd516..4228ec1 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -186,5 +186,6 @@ int a6xx_hfi_set_freq(struct a6xx_gmu *gmu, int index);
bool a6xx_gmu_gx_is_on(struct a6xx_gmu *gmu); bool a6xx_gmu_sptprac_is_on(struct a6xx_gmu *gmu); +void a6xx_gmu_send_nmi(struct a6xx_gmu *gmu);
#endif diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index 7de9d2f..09b2ff0 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -964,6 +964,18 @@ static void a6xx_get_indexed_registers(struct msm_gpu *gpu, a6xx_state->nr_indexed_regs = count; }
+void a6xx_get_gmu_state(struct msm_gpu *gpu, struct a6xx_gpu_state *a6xx_state) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + struct a6xx_gmu *gmu = &a6xx_gpu->gmu; + + if (gmu->hung) + a6xx_gmu_send_nmi(gmu); + + a6xx_get_gmu_registers(gpu, a6xx_state); +} + struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu) { struct a6xx_crashdumper _dumper = { 0 }, *dumper = NULL; @@ -980,7 +992,7 @@ struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu) /* Get the generic state from the adreno core */ adreno_gpu_state_get(gpu, &a6xx_state->base);
- a6xx_get_gmu_registers(gpu, a6xx_state); + a6xx_get_gmu_state(gpu, a6xx_state);
a6xx_state->gmu_log = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.log); a6xx_state->gmu_hfi = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.hfi);
Hi Akhil,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on drm/drm-next] [also build test WARNING on drm-intel/for-linux-next drm-tip/drm-tip drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next v5.17-rc7 next-20220308] [cannot apply to airlied/drm-next] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Akhil-P-Oommen/Support-for-GMU-core... base: git://anongit.freedesktop.org/drm/drm drm-next config: riscv-randconfig-r042-20220307 (https://download.01.org/0day-ci/archive/20220308/202203082018.IcI00Nvs-lkp@i...) compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project d271fc04d5b97b12e6b797c6067d3c96a8d7470e) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # install riscv cross compiling tool for clang build # apt-get install binutils-riscv64-linux-gnu # https://github.com/0day-ci/linux/commit/23953efc645803299a93f178e9a32f2ae97d... git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Akhil-P-Oommen/Support-for-GMU-coredump-and-some-related-improvements/20220303-013028 git checkout 23953efc645803299a93f178e9a32f2ae97dae39 # save the config file to linux build tree mkdir build_dir COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash drivers/gpu/drm/msm/
If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot lkp@intel.com
All warnings (new ones prefixed by >>):
drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c:967:6: warning: no previous prototype for function 'a6xx_get_gmu_state' [-Wmissing-prototypes]
void a6xx_get_gmu_state(struct msm_gpu *gpu, struct a6xx_gpu_state *a6xx_state) ^ drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c:967:1: note: declare 'static' if the function is not intended to be used outside of this translation unit void a6xx_get_gmu_state(struct msm_gpu *gpu, struct a6xx_gpu_state *a6xx_state) ^ static 1 warning generated.
vim +/a6xx_get_gmu_state +967 drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c
966
967 void a6xx_get_gmu_state(struct msm_gpu *gpu, struct a6xx_gpu_state *a6xx_state)
968 { 969 struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); 970 struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); 971 struct a6xx_gmu *gmu = &a6xx_gpu->gmu; 972 973 if (gmu->hung) 974 a6xx_gmu_send_nmi(gmu); 975 976 a6xx_get_gmu_registers(gpu, a6xx_state); 977 } 978
--- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
We don't really need the gmu lock in the runtime pm ops because these operations are serialized with each other anyway, and also with the other paths where we take this lock. This patch will help to simplify the locking order when we introduce the crashstate_lock in the upcoming patch.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 3faf551..8c3cb31 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -1530,9 +1530,7 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
trace_msm_gpu_resume(0);
- mutex_lock(&a6xx_gpu->gmu.lock); ret = a6xx_gmu_resume(a6xx_gpu); - mutex_unlock(&a6xx_gpu->gmu.lock); if (ret) return ret;
@@ -1555,9 +1553,7 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
msm_devfreq_suspend(gpu);
- mutex_lock(&a6xx_gpu->gmu.lock); ret = a6xx_gmu_stop(a6xx_gpu); - mutex_unlock(&a6xx_gpu->gmu.lock); if (ret) return ret;
Add support for inline capture of a gmu coredump in the gmu resume/suspend path to help debug gmu errors/faults. This is sort of a lite version of the gpu coredump with just the gmu states. And we can't use the recover_worker in these scenarios because the gmu is collapsed after a failure in this path. Hence we need to capture the gmu states inline before the gmu is collapsed.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 18 +++++++-- drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 + drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 60 +++++++++++++++++++++++++---- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 10 +++-- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 2 + drivers/gpu/drm/msm/msm_gpu.c | 23 ++++++++++- drivers/gpu/drm/msm/msm_gpu.h | 11 ++++-- 7 files changed, 105 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index f208a81..f121d798 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -1024,6 +1024,7 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu) /* On failure, shut down the GMU to leave it in a good state */ if (ret) { disable_irq(gmu->gmu_irq); + a6xx_gmu_inline_coredump(gmu); a6xx_rpmh_stop(gmu); pm_runtime_put(gmu->gxpd); pm_runtime_put(gmu->dev); @@ -1082,6 +1083,7 @@ static void a6xx_gmu_shutdown(struct a6xx_gmu *gmu) { struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; + int ret = 0; u32 val;
/* @@ -1091,10 +1093,11 @@ static void a6xx_gmu_shutdown(struct a6xx_gmu *gmu) val = gmu_read(gmu, REG_A6XX_GPU_GMU_CX_GMU_RPMH_POWER_STATE);
if (val != 0xf) { - int ret = a6xx_gmu_wait_for_idle(gmu); + ret = a6xx_gmu_wait_for_idle(gmu);
/* If the GMU isn't responding assume it is hung */ if (ret) { + a6xx_gmu_inline_coredump(gmu); a6xx_gmu_force_off(gmu); return; } @@ -1102,7 +1105,9 @@ static void a6xx_gmu_shutdown(struct a6xx_gmu *gmu) a6xx_bus_clear_pending_transactions(adreno_gpu);
/* tell the GMU we want to slumber */ - a6xx_gmu_notify_slumber(gmu); + ret = a6xx_gmu_notify_slumber(gmu); + if (ret) + goto out;
ret = gmu_poll_timeout(gmu, REG_A6XX_GPU_GMU_AO_GPU_CX_BUSY_STATUS, val, @@ -1123,6 +1128,10 @@ static void a6xx_gmu_shutdown(struct a6xx_gmu *gmu) REG_A6XX_GPU_GMU_AO_GPU_CX_BUSY_STATUS2)); }
+out: + if (ret) + a6xx_gmu_inline_coredump(gmu); + /* Turn off HFI */ a6xx_hfi_stop(gmu);
@@ -1146,9 +1155,10 @@ int a6xx_gmu_stop(struct a6xx_gpu *a6xx_gpu) * Force the GMU off if we detected a hang, otherwise try to shut it * down gracefully */ - if (gmu->hung) + if (gmu->hung) { + a6xx_gmu_inline_coredump(gmu); a6xx_gmu_force_off(gmu); - else + } else a6xx_gmu_shutdown(gmu);
/* Remove the bus vote */ diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 675aef0..2599443 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -86,5 +86,6 @@ void a6xx_show(struct msm_gpu *gpu, struct msm_gpu_state *state, struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu); int a6xx_gpu_state_put(struct msm_gpu_state *state); bool a6xx_is_smmu_stalled(struct msm_gpu *gpu); +void a6xx_gmu_inline_coredump(struct a6xx_gmu *gmu);
#endif /* __A6XX_GPU_H__ */ diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index 09b2ff0..4d4588a 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -3,6 +3,7 @@
#include <linux/ascii85.h> #include "msm_gem.h" +#include "msm_gpu.h" #include "a6xx_gpu.h" #include "a6xx_gmu.h" #include "a6xx_gpu_state.h" @@ -970,10 +971,19 @@ void a6xx_get_gmu_state(struct msm_gpu *gpu, struct a6xx_gpu_state *a6xx_state) struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
- if (gmu->hung) + if (gmu->hung) { + mutex_lock(&gmu->lock); a6xx_gmu_send_nmi(gmu); + mutex_unlock(&gmu->lock); + }
a6xx_get_gmu_registers(gpu, a6xx_state); + + a6xx_state->gmu_log = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.log); + a6xx_state->gmu_hfi = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.hfi); + a6xx_state->gmu_debug = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.debug); + + a6xx_snapshot_gmu_hfi_history(gpu, a6xx_state); }
struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu) @@ -994,12 +1004,6 @@ struct msm_gpu_state *a6xx_gpu_state_get(struct msm_gpu *gpu)
a6xx_get_gmu_state(gpu, a6xx_state);
- a6xx_state->gmu_log = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.log); - a6xx_state->gmu_hfi = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.hfi); - a6xx_state->gmu_debug = a6xx_snapshot_gmu_bo(a6xx_state, &a6xx_gpu->gmu.debug); - - a6xx_snapshot_gmu_hfi_history(gpu, a6xx_state); - /* If GX isn't on the rest of the data isn't going to be accessible */ if (!a6xx_gmu_gx_is_on(&a6xx_gpu->gmu)) return &a6xx_state->base; @@ -1343,3 +1347,45 @@ void a6xx_show(struct msm_gpu *gpu, struct msm_gpu_state *state, drm_puts(p, "debugbus:\n"); a6xx_show_debugbus(a6xx_state, p); } + +void a6xx_gmu_inline_coredump(struct a6xx_gmu *gmu) +{ + struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); + struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; + struct msm_gpu *gpu = &adreno_gpu->base; + struct a6xx_gpu_state *a6xx_state; + + WARN_ON(mutex_is_locked(&gmu->lock)); + + mutex_lock(&gpu->crashstate_lock); + + if (gpu->crashstate) { + mutex_unlock(&gpu->crashstate_lock); + DRM_DEV_ERROR(gmu->dev, "Skipping GMU coredump\n"); + return; + } + + a6xx_state = kzalloc(sizeof(*a6xx_state), GFP_KERNEL); + if (!a6xx_state) { + mutex_unlock(&gpu->crashstate_lock); + DRM_DEV_ERROR(gmu->dev, + "Failed to allocate memory for GMU coredump\n"); + return; + } + + INIT_LIST_HEAD(&a6xx_state->objs); + adreno_gpu_state_init(&a6xx_state->base); + + /* + * Set hung=true here so that an NMI is sent to gmu while capturing + * coredump + */ + gmu->hung = true; + a6xx_get_gmu_state(gpu, a6xx_state); + + gpu->crashstate = &a6xx_state->base; + + mutex_unlock(&gpu->crashstate_lock); + + msm_gpu_create_devcoredump(gpu); +} diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 15c8997..d3ae42f 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -503,6 +503,12 @@ bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring) return false; }
+void adreno_gpu_state_init(struct msm_gpu_state *state) +{ + kref_init(&state->ref); + ktime_get_real_ts64(&state->time); +} + int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -510,9 +516,7 @@ int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state)
WARN_ON(!mutex_is_locked(&gpu->lock));
- kref_init(&state->ref); - - ktime_get_real_ts64(&state->time); + adreno_gpu_state_init(state);
for (i = 0; i < gpu->nr_rings; i++) { int size = 0, j; diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index b1ee453..9472183 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -316,6 +316,8 @@ int adreno_gpu_state_put(struct msm_gpu_state *state); void adreno_show_object(struct drm_printer *p, void **ptr, int len, bool *encoded);
+void adreno_gpu_state_init(struct msm_gpu_state *state); + /* * Common helper function to initialize the default address space for arm-smmu * attached targets diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index bacdabb..e8a442a 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -259,6 +259,12 @@ static void msm_gpu_crashstate_get_bo(struct msm_gpu_state *state, state->nr_bos++; }
+void msm_gpu_create_devcoredump(struct msm_gpu *gpu) +{ + dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL, + msm_gpu_devcoredump_read, msm_gpu_devcoredump_free); +} + static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, struct msm_gem_submit *submit, char *comm, char *cmd) { @@ -268,13 +274,19 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, if (!gpu->funcs->gpu_state_get) return;
+ mutex_lock(&gpu->crashstate_lock); + /* Only save one crash state at a time */ - if (gpu->crashstate) + if (gpu->crashstate) { + mutex_unlock(&gpu->crashstate_lock); return; + }
state = gpu->funcs->gpu_state_get(gpu); - if (IS_ERR_OR_NULL(state)) + if (IS_ERR_OR_NULL(state)) { + mutex_unlock(&gpu->crashstate_lock); return; + }
/* Fill in the additional crash state information */ state->comm = kstrdup(comm, GFP_KERNEL); @@ -316,6 +328,8 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, /* Set the active crash state to be dumped on failure */ gpu->crashstate = state;
+ mutex_unlock(&gpu->crashstate_lock); + /* FIXME: Release the crashstate if this errors out? */ dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL, msm_gpu_devcoredump_read, msm_gpu_devcoredump_free); @@ -325,6 +339,10 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, struct msm_gem_submit *submit, char *comm, char *cmd) { } + +void msm_gpu_create_devcoredump(struct msm_gpu *gpu) +{ +} #endif
/* @@ -856,6 +874,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, INIT_LIST_HEAD(&gpu->active_list); mutex_init(&gpu->active_lock); mutex_init(&gpu->lock); + mutex_init(&gpu->crashstate_lock); init_waitqueue_head(&gpu->retire_event); kthread_init_work(&gpu->retire_work, retire_worker); kthread_init_work(&gpu->recover_work, recover_worker); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index c99627f..b33f508 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -250,6 +250,7 @@ struct msm_gpu { uint32_t suspend_count;
struct msm_gpu_state *crashstate; + struct mutex crashstate_lock;
/* Enable clamping to idle freq when inactive: */ bool clamp_to_idle; @@ -578,30 +579,32 @@ static inline struct msm_gpu_state *msm_gpu_crashstate_get(struct msm_gpu *gpu) { struct msm_gpu_state *state = NULL;
- mutex_lock(&gpu->lock); + mutex_lock(&gpu->crashstate_lock);
if (gpu->crashstate) { kref_get(&gpu->crashstate->ref); state = gpu->crashstate; }
- mutex_unlock(&gpu->lock); + mutex_unlock(&gpu->crashstate_lock);
return state; }
static inline void msm_gpu_crashstate_put(struct msm_gpu *gpu) { - mutex_lock(&gpu->lock); + mutex_lock(&gpu->crashstate_lock);
if (gpu->crashstate) { if (gpu->funcs->gpu_state_put(gpu->crashstate)) gpu->crashstate = NULL; }
- mutex_unlock(&gpu->lock); + mutex_unlock(&gpu->crashstate_lock); }
+void msm_gpu_create_devcoredump(struct msm_gpu *gpu); + /* * Simple macro to semi-cleanly add the MAP_PRIV flag for targets that can * support expanded privileges
Schedule the recover worker when there is a hw init failure in msm_gpu_submit(). The recover worker will take care of capturing the coredump, gpu recovery, and resubmission of pending IBs.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/msm_gpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index e8a442a..4d24fa1 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -757,12 +757,15 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) struct msm_drm_private *priv = dev->dev_private; struct msm_ringbuffer *ring = submit->ring; unsigned long flags; + int ret;
WARN_ON(!mutex_is_locked(&gpu->lock));
pm_runtime_get_sync(&gpu->pdev->dev);
- msm_gpu_hw_init(gpu); + ret = msm_gpu_hw_init(gpu); + if (ret) + kthread_queue_work(gpu->worker, &gpu->recover_work);
submit->seqno = ++ring->seqno;
Propagate OOB set error to higher level so that a coredump is captured followed by recovery sequence.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 33 ++++++++++++++++++++------------- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 19 ++++++++++++++++--- drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 2 +- 3 files changed, 37 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index f121d798..66ae509 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -133,7 +133,7 @@ bool a6xx_gmu_gx_is_on(struct a6xx_gmu *gmu) A6XX_GMU_SPTPRAC_PWR_CLK_STATUS_GX_HM_CLK_OFF)); }
-void a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) +int a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); @@ -145,7 +145,7 @@ void a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) gpu_freq = dev_pm_opp_get_freq(opp);
if (gpu_freq == gmu->freq) - return; + return 0;
for (perf_index = 0; perf_index < gmu->nr_gpu_freqs - 1; perf_index++) if (gpu_freq == gmu->gpu_freqs[perf_index]) @@ -161,13 +161,13 @@ void a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) * bring up the power if it isn't already active */ if (pm_runtime_get_if_in_use(gmu->dev) == 0) - return; + return 0;
if (!gmu->legacy) { - a6xx_hfi_set_freq(gmu, perf_index); + ret = a6xx_hfi_set_freq(gmu, perf_index); dev_pm_opp_set_opp(&gpu->pdev->dev, opp); pm_runtime_put(gmu->dev); - return; + return ret; }
gmu_write(gmu, REG_A6XX_GMU_DCVS_ACK_OPTION, 0); @@ -182,15 +182,17 @@ void a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) gmu_write(gmu, REG_A6XX_GMU_DCVS_BW_SETTING, 0xff);
/* Set and clear the OOB for DCVS to trigger the GMU */ - a6xx_gmu_set_oob(gmu, GMU_OOB_DCVS_SET); + ret = a6xx_gmu_set_oob(gmu, GMU_OOB_DCVS_SET); a6xx_gmu_clear_oob(gmu, GMU_OOB_DCVS_SET);
- ret = gmu_read(gmu, REG_A6XX_GMU_DCVS_RETURN); - if (ret) + if (!ret && gmu_read(gmu, REG_A6XX_GMU_DCVS_RETURN)) { dev_err(gmu->dev, "GMU set GPU frequency error: %d\n", ret); + ret = -EINVAL; + }
dev_pm_opp_set_opp(&gpu->pdev->dev, opp); pm_runtime_put(gmu->dev); + return ret; }
unsigned long a6xx_gmu_get_freq(struct msm_gpu *gpu) @@ -353,11 +355,13 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val, val & (1 << ack), 100, 10000);
- if (ret) + if (ret) { DRM_DEV_ERROR(gmu->dev, "Timeout waiting for GMU OOB set %s: 0x%x\n", a6xx_gmu_oob_bits[state].name, gmu_read(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO)); + return -ETIMEDOUT; + }
/* Clear the acknowledge interrupt */ gmu_write(gmu, REG_A6XX_GMU_GMU2HOST_INTR_CLR, 1 << ack); @@ -922,18 +926,21 @@ static void a6xx_gmu_force_off(struct a6xx_gmu *gmu) a6xx_gmu_rpmh_off(gmu); }
-static void a6xx_gmu_set_initial_freq(struct msm_gpu *gpu, struct a6xx_gmu *gmu) +static int a6xx_gmu_set_initial_freq(struct msm_gpu *gpu, struct a6xx_gmu *gmu) { struct dev_pm_opp *gpu_opp; unsigned long gpu_freq = gmu->gpu_freqs[gmu->current_perf_index]; + int ret;
gpu_opp = dev_pm_opp_find_freq_exact(&gpu->pdev->dev, gpu_freq, true); if (IS_ERR(gpu_opp)) - return; + return PTR_ERR(gpu_opp);
gmu->freq = 0; /* so a6xx_gmu_set_freq() doesn't exit early */ - a6xx_gmu_set_freq(gpu, gpu_opp); + ret = a6xx_gmu_set_freq(gpu, gpu_opp); dev_pm_opp_put(gpu_opp); + + return ret; }
static void a6xx_gmu_set_initial_bw(struct msm_gpu *gpu, struct a6xx_gmu *gmu) @@ -1018,7 +1025,7 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu) enable_irq(gmu->hfi_irq);
/* Set the GPU to the current freq */ - a6xx_gmu_set_initial_freq(gpu, gmu); + ret = a6xx_gmu_set_initial_freq(gpu, gmu);
out: /* On failure, shut down the GMU to leave it in a good state */ diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 8c3cb31..fdfc5c4 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -890,7 +890,7 @@ static int hw_init(struct msm_gpu *gpu) int ret;
/* Make sure the GMU keeps the GPU on while we set it up */ - a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET); + ret = a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET);
gpu_write(gpu, REG_A6XX_RBBM_SECVID_TSB_CNTL, 0);
@@ -1570,11 +1570,18 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + int ret;
mutex_lock(&a6xx_gpu->gmu.lock);
/* Force the GPU power on so we can read this register */ - a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); + ret = a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_PERFCOUNTER_SET); + if (ret) { + mutex_unlock(&a6xx_gpu->gmu.lock); + a6xx_gpu->gmu.hung = true; + kthread_queue_work(gpu->worker, &gpu->recover_work); + return ret; + }
*value = gpu_read64(gpu, REG_A6XX_CP_ALWAYS_ON_COUNTER_LO, REG_A6XX_CP_ALWAYS_ON_COUNTER_HI); @@ -1650,10 +1657,16 @@ static void a6xx_gpu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + int ret;
mutex_lock(&a6xx_gpu->gmu.lock); - a6xx_gmu_set_freq(gpu, opp); + ret = a6xx_gmu_set_freq(gpu, opp); mutex_unlock(&a6xx_gpu->gmu.lock); + + if (ret) { + a6xx_gpu->gmu.hung = true; + kthread_queue_work(gpu->worker, &gpu->recover_work); + } }
static struct msm_gem_address_space * diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 2599443..391ff76 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -77,7 +77,7 @@ void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state); int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node); void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu);
-void a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp); +int a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp); unsigned long a6xx_gmu_get_freq(struct msm_gpu *gpu);
void a6xx_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
Hi Akhil,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on drm/drm-next] [also build test WARNING on drm-intel/for-linux-next drm-tip/drm-tip drm-exynos/exynos-drm-next v5.17-rc7 next-20220308] [cannot apply to tegra-drm/drm/tegra/for-next airlied/drm-next] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Akhil-P-Oommen/Support-for-GMU-core... base: git://anongit.freedesktop.org/drm/drm drm-next config: s390-randconfig-m031-20220307 (https://download.01.org/0day-ci/archive/20220309/202203091923.2RD2Ech3-lkp@i...) compiler: s390-linux-gcc (GCC) 11.2.0
If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot lkp@intel.com
smatch warnings: drivers/gpu/drm/msm/adreno/a6xx_gpu.c:894 hw_init() warn: inconsistent indenting
vim +894 drivers/gpu/drm/msm/adreno/a6xx_gpu.c
874 875 #define A6XX_INT_MASK (A6XX_RBBM_INT_0_MASK_CP_AHB_ERROR | \ 876 A6XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNCFIFO_OVERFLOW | \ 877 A6XX_RBBM_INT_0_MASK_CP_HW_ERROR | \ 878 A6XX_RBBM_INT_0_MASK_CP_IB2 | \ 879 A6XX_RBBM_INT_0_MASK_CP_IB1 | \ 880 A6XX_RBBM_INT_0_MASK_CP_RB | \ 881 A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS | \ 882 A6XX_RBBM_INT_0_MASK_RBBM_ATB_BUS_OVERFLOW | \ 883 A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT | \ 884 A6XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS | \ 885 A6XX_RBBM_INT_0_MASK_UCHE_TRAP_INTR) 886 887 static int hw_init(struct msm_gpu *gpu) 888 { 889 struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); 890 struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); 891 int ret; 892 893 /* Make sure the GMU keeps the GPU on while we set it up */
894 ret = a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET);
895 896 gpu_write(gpu, REG_A6XX_RBBM_SECVID_TSB_CNTL, 0); 897 898 /* 899 * Disable the trusted memory range - we don't actually supported secure 900 * memory rendering at this point in time and we don't want to block off 901 * part of the virtual memory space. 902 */ 903 gpu_write64(gpu, REG_A6XX_RBBM_SECVID_TSB_TRUSTED_BASE_LO, 904 REG_A6XX_RBBM_SECVID_TSB_TRUSTED_BASE_HI, 0x00000000); 905 gpu_write(gpu, REG_A6XX_RBBM_SECVID_TSB_TRUSTED_SIZE, 0x00000000); 906 907 /* Turn on 64 bit addressing for all blocks */ 908 gpu_write(gpu, REG_A6XX_CP_ADDR_MODE_CNTL, 0x1); 909 gpu_write(gpu, REG_A6XX_VSC_ADDR_MODE_CNTL, 0x1); 910 gpu_write(gpu, REG_A6XX_GRAS_ADDR_MODE_CNTL, 0x1); 911 gpu_write(gpu, REG_A6XX_RB_ADDR_MODE_CNTL, 0x1); 912 gpu_write(gpu, REG_A6XX_PC_ADDR_MODE_CNTL, 0x1); 913 gpu_write(gpu, REG_A6XX_HLSQ_ADDR_MODE_CNTL, 0x1); 914 gpu_write(gpu, REG_A6XX_VFD_ADDR_MODE_CNTL, 0x1); 915 gpu_write(gpu, REG_A6XX_VPC_ADDR_MODE_CNTL, 0x1); 916 gpu_write(gpu, REG_A6XX_UCHE_ADDR_MODE_CNTL, 0x1); 917 gpu_write(gpu, REG_A6XX_SP_ADDR_MODE_CNTL, 0x1); 918 gpu_write(gpu, REG_A6XX_TPL1_ADDR_MODE_CNTL, 0x1); 919 gpu_write(gpu, REG_A6XX_RBBM_SECVID_TSB_ADDR_MODE_CNTL, 0x1); 920 921 /* enable hardware clockgating */ 922 a6xx_set_hwcg(gpu, true); 923 924 /* VBIF/GBIF start*/ 925 if (adreno_is_a640_family(adreno_gpu) || 926 adreno_is_a650_family(adreno_gpu)) { 927 gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE0, 0x00071620); 928 gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE1, 0x00071620); 929 gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE2, 0x00071620); 930 gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE3, 0x00071620); 931 gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE3, 0x00071620); 932 gpu_write(gpu, REG_A6XX_RBBM_GBIF_CLIENT_QOS_CNTL, 0x3); 933 } else { 934 gpu_write(gpu, REG_A6XX_RBBM_VBIF_CLIENT_QOS_CNTL, 0x3); 935 } 936 937 if (adreno_is_a630(adreno_gpu)) 938 gpu_write(gpu, REG_A6XX_VBIF_GATE_OFF_WRREQ_EN, 0x00000009); 939 940 /* Make all blocks contribute to the GPU BUSY perf counter */ 941 gpu_write(gpu, 
REG_A6XX_RBBM_PERFCTR_GPU_BUSY_MASKED, 0xffffffff); 942 943 /* Disable L2 bypass in the UCHE */ 944 gpu_write(gpu, REG_A6XX_UCHE_WRITE_RANGE_MAX_LO, 0xffffffc0); 945 gpu_write(gpu, REG_A6XX_UCHE_WRITE_RANGE_MAX_HI, 0x0001ffff); 946 gpu_write(gpu, REG_A6XX_UCHE_TRAP_BASE_LO, 0xfffff000); 947 gpu_write(gpu, REG_A6XX_UCHE_TRAP_BASE_HI, 0x0001ffff); 948 gpu_write(gpu, REG_A6XX_UCHE_WRITE_THRU_BASE_LO, 0xfffff000); 949 gpu_write(gpu, REG_A6XX_UCHE_WRITE_THRU_BASE_HI, 0x0001ffff); 950 951 if (!adreno_is_a650_family(adreno_gpu)) { 952 /* Set the GMEM VA range [0x100000:0x100000 + gpu->gmem - 1] */ 953 gpu_write64(gpu, REG_A6XX_UCHE_GMEM_RANGE_MIN_LO, 954 REG_A6XX_UCHE_GMEM_RANGE_MIN_HI, 0x00100000); 955 956 gpu_write64(gpu, REG_A6XX_UCHE_GMEM_RANGE_MAX_LO, 957 REG_A6XX_UCHE_GMEM_RANGE_MAX_HI, 958 0x00100000 + adreno_gpu->gmem - 1); 959 } 960 961 gpu_write(gpu, REG_A6XX_UCHE_FILTER_CNTL, 0x804); 962 gpu_write(gpu, REG_A6XX_UCHE_CACHE_WAYS, 0x4); 963 964 if (adreno_is_a640_family(adreno_gpu) || 965 adreno_is_a650_family(adreno_gpu)) 966 gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2, 0x02000140); 967 else 968 gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2, 0x010000c0); 969 gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_1, 0x8040362c); 970 971 if (adreno_is_a660_family(adreno_gpu)) 972 gpu_write(gpu, REG_A6XX_CP_LPAC_PROG_FIFO_SIZE, 0x00000020); 973 974 /* Setting the mem pool size */ 975 gpu_write(gpu, REG_A6XX_CP_MEM_POOL_SIZE, 128); 976 977 /* Setting the primFifo thresholds default values, 978 * and vccCacheSkipDis=1 bit (0x200) for A640 and newer 979 */ 980 if (adreno_is_a650(adreno_gpu) || adreno_is_a660(adreno_gpu)) 981 gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, 0x00300200); 982 else if (adreno_is_a640_family(adreno_gpu) || adreno_is_7c3(adreno_gpu)) 983 gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, 0x00200200); 984 else if (adreno_is_a650(adreno_gpu) || adreno_is_a660(adreno_gpu)) 985 gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, 0x00300200); 986 else 987 gpu_write(gpu, 
REG_A6XX_PC_DBG_ECO_CNTL, 0x00180000); 988 989 /* Set the AHB default slave response to "ERROR" */ 990 gpu_write(gpu, REG_A6XX_CP_AHB_CNTL, 0x1); 991 992 /* Turn on performance counters */ 993 gpu_write(gpu, REG_A6XX_RBBM_PERFCTR_CNTL, 0x1); 994 995 /* Select CP0 to always count cycles */ 996 gpu_write(gpu, REG_A6XX_CP_PERFCTR_CP_SEL(0), PERF_CP_ALWAYS_COUNT); 997 998 a6xx_set_ubwc_config(gpu); 999 1000 /* Enable fault detection */ 1001 gpu_write(gpu, REG_A6XX_RBBM_INTERFACE_HANG_INT_CNTL, 1002 (1 << 30) | 0x1fffff); 1003 1004 gpu_write(gpu, REG_A6XX_UCHE_CLIENT_PF, 1); 1005 1006 /* Set weights for bicubic filtering */ 1007 if (adreno_is_a650_family(adreno_gpu)) { 1008 gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_0, 0); 1009 gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_1, 1010 0x3fe05ff4); 1011 gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_2, 1012 0x3fa0ebee); 1013 gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_3, 1014 0x3f5193ed); 1015 gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_4, 1016 0x3f0243f0); 1017 } 1018 1019 /* Protect registers from the CP */ 1020 a6xx_set_cp_protect(gpu); 1021 1022 if (adreno_is_a660_family(adreno_gpu)) { 1023 gpu_write(gpu, REG_A6XX_CP_CHICKEN_DBG, 0x1); 1024 gpu_write(gpu, REG_A6XX_RBBM_GBIF_CLIENT_QOS_CNTL, 0x0); 1025 } 1026 1027 /* Set dualQ + disable afull for A660 GPU */ 1028 if (adreno_is_a660(adreno_gpu)) 1029 gpu_write(gpu, REG_A6XX_UCHE_CMDQ_CONFIG, 0x66906); 1030 1031 /* Enable expanded apriv for targets that support it */ 1032 if (gpu->hw_apriv) { 1033 gpu_write(gpu, REG_A6XX_CP_APRIV_CNTL, 1034 (1 << 6) | (1 << 5) | (1 << 3) | (1 << 2) | (1 << 1)); 1035 } 1036 1037 /* Enable interrupts */ 1038 gpu_write(gpu, REG_A6XX_RBBM_INT_0_MASK, A6XX_INT_MASK); 1039 1040 ret = adreno_hw_init(gpu); 1041 if (ret) 1042 goto out; 1043 1044 ret = a6xx_ucode_init(gpu); 1045 if (ret) 1046 goto out; 1047 1048 /* Set the ringbuffer address */ 1049 gpu_write64(gpu, REG_A6XX_CP_RB_BASE, REG_A6XX_CP_RB_BASE_HI, 1050 
gpu->rb[0]->iova); 1051 1052 /* Targets that support extended APRIV can use the RPTR shadow from 1053 * hardware but all the other ones need to disable the feature. Targets 1054 * that support the WHERE_AM_I opcode can use that instead 1055 */ 1056 if (adreno_gpu->base.hw_apriv) 1057 gpu_write(gpu, REG_A6XX_CP_RB_CNTL, MSM_GPU_RB_CNTL_DEFAULT); 1058 else 1059 gpu_write(gpu, REG_A6XX_CP_RB_CNTL, 1060 MSM_GPU_RB_CNTL_DEFAULT | AXXX_CP_RB_CNTL_NO_UPDATE); 1061 1062 /* 1063 * Expanded APRIV and targets that support WHERE_AM_I both need a 1064 * privileged buffer to store the RPTR shadow 1065 */ 1066 1067 if (adreno_gpu->base.hw_apriv || a6xx_gpu->has_whereami) { 1068 if (!a6xx_gpu->shadow_bo) { 1069 a6xx_gpu->shadow = msm_gem_kernel_new(gpu->dev, 1070 sizeof(u32) * gpu->nr_rings, 1071 MSM_BO_WC | MSM_BO_MAP_PRIV, 1072 gpu->aspace, &a6xx_gpu->shadow_bo, 1073 &a6xx_gpu->shadow_iova); 1074 1075 if (IS_ERR(a6xx_gpu->shadow)) 1076 return PTR_ERR(a6xx_gpu->shadow); 1077 1078 msm_gem_object_set_name(a6xx_gpu->shadow_bo, "shadow"); 1079 } 1080 1081 gpu_write64(gpu, REG_A6XX_CP_RB_RPTR_ADDR_LO, 1082 REG_A6XX_CP_RB_RPTR_ADDR_HI, 1083 shadowptr(a6xx_gpu, gpu->rb[0])); 1084 } 1085 1086 /* Always come up on rb 0 */ 1087 a6xx_gpu->cur_ring = gpu->rb[0]; 1088 1089 gpu->cur_ctx_seqno = 0; 1090 1091 /* Enable the SQE_to start the CP engine */ 1092 gpu_write(gpu, REG_A6XX_CP_SQE_CNTL, 1); 1093 1094 ret = a6xx_cp_init(gpu); 1095 if (ret) 1096 goto out; 1097 1098 /* 1099 * Try to load a zap shader into the secure world. If successful 1100 * we can use the CP to switch out of secure mode. If not then we 1101 * have no resource but to try to switch ourselves out manually. If we 1102 * guessed wrong then access to the RBBM_SECVID_TRUST_CNTL register will 1103 * be blocked and a permissions violation will soon follow. 
1104 */ 1105 ret = a6xx_zap_shader_init(gpu); 1106 if (!ret) { 1107 OUT_PKT7(gpu->rb[0], CP_SET_SECURE_MODE, 1); 1108 OUT_RING(gpu->rb[0], 0x00000000); 1109 1110 a6xx_flush(gpu, gpu->rb[0]); 1111 if (!a6xx_idle(gpu, gpu->rb[0])) 1112 return -EINVAL; 1113 } else if (ret == -ENODEV) { 1114 /* 1115 * This device does not use zap shader (but print a warning 1116 * just in case someone got their dt wrong.. hopefully they 1117 * have a debug UART to realize the error of their ways... 1118 * if you mess this up you are about to crash horribly) 1119 */ 1120 dev_warn_once(gpu->dev->dev, 1121 "Zap shader not enabled - using SECVID_TRUST_CNTL instead\n"); 1122 gpu_write(gpu, REG_A6XX_RBBM_SECVID_TRUST_CNTL, 0x0); 1123 ret = 0; 1124 } else { 1125 return ret; 1126 } 1127 1128 out: 1129 /* 1130 * Tell the GMU that we are done touching the GPU and it can start power 1131 * management 1132 */ 1133 a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET); 1134 1135 if (a6xx_gpu->gmu.legacy) { 1136 /* Take the GMU out of its special boot mode */ 1137 a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_BOOT_SLUMBER); 1138 } 1139 1140 return ret; 1141 } 1142
--- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Retry infinitely on resume failure because there is not much we can do if the GPU is not ON. Also, this helps us to avoid checking the return value of pm_runtime_get() to see if the GPU is ON.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/adreno/adreno_device.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 89cfd84..abcc553 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -603,8 +603,16 @@ static const struct of_device_id dt_match[] = { static int adreno_resume(struct device *dev) { struct msm_gpu *gpu = dev_to_gpu(dev); + int ret; + + /* What hope do we have for the future if we can't turn ON gpu */ + while (true) { + ret = gpu->funcs->pm_resume(gpu); + if (!ret) + break; + }
- return gpu->funcs->pm_resume(gpu); + return 0; }
static int active_submits(struct msm_gpu *gpu)
Il 02/03/22 18:27, Akhil P Oommen ha scritto:
Retry infinitely on resume failure because there is nothing much we can do if GPU is not ON. Also, this helps us to avoid checking for the return value of pm_runtime_get() to see if GPU is ON.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com
drivers/gpu/drm/msm/adreno/adreno_device.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 89cfd84..abcc553 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -603,8 +603,16 @@ static const struct of_device_id dt_match[] = { static int adreno_resume(struct device *dev) { struct msm_gpu *gpu = dev_to_gpu(dev);
- int ret;
- /* What hope do we have for the future if we can't turn ON gpu */
Hello Akhil,
the hope for the future would be to at least not lock up everything with an infinite loop, so, please change this to have a limited number of retries.
My guess would be that a maximum of 10 is already a lot of retries, but feel free to choose an appropriate number.
Regards, Angelo
- while (true) {
ret = gpu->funcs->pm_resume(gpu);
if (!ret)
break;
- }
- return gpu->funcs->pm_resume(gpu);
return 0; }
static int active_submits(struct msm_gpu *gpu)
On 3/3/2022 2:51 PM, AngeloGioacchino Del Regno wrote:
Il 02/03/22 18:27, Akhil P Oommen ha scritto:
Retry infinitely on resume failure because there is nothing much we can do if GPU is not ON. Also, this helps us to avoid checking for the return value of pm_runtime_get() to see if GPU is ON.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com
drivers/gpu/drm/msm/adreno/adreno_device.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 89cfd84..abcc553 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -603,8 +603,16 @@ static const struct of_device_id dt_match[] = { static int adreno_resume(struct device *dev) { struct msm_gpu *gpu = dev_to_gpu(dev); + int ret;
+ /* What hope do we have for the future if we can't turn ON gpu */
Hello Akhil,
the hope for the future would be to at least not lock up everything with an infinite loop, so, please change this to have a limited amount of retries.
My guess would be that a maximum of 10 is already a lot of retries, but feel free to choose an appropriate number.
Thanks for the feedback, Angelo. I will revisit this.
-Akhil.
Regards, Angelo
+ while (true) { + ret = gpu->funcs->pm_resume(gpu); + if (!ret) + break; + } - return gpu->funcs->pm_resume(gpu); + return 0; } static int active_submits(struct msm_gpu *gpu)
Remove the vote on clocks on gpu resume failure.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 66ae509..e90359f 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -1033,6 +1033,7 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu) disable_irq(gmu->gmu_irq); a6xx_gmu_inline_coredump(gmu); a6xx_rpmh_stop(gmu); + clk_bulk_disable_unprepare(gmu->nr_clocks, gmu->clocks); pm_runtime_put(gmu->gxpd); pm_runtime_put(gmu->dev); }
We do pm_runtime_get() within msm_gpu_submit(). So remove the redundant pm_runtime_get/put from msm_job_run().
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/msm_ringbuffer.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 3bbf574..43fb04e 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -18,8 +18,6 @@ static struct dma_fence *msm_job_run(struct drm_sched_job *job)
submit->hw_fence = msm_fence_alloc(submit->ring->fctx);
- pm_runtime_get_sync(&gpu->pdev->dev); - /* TODO move submit path over to using a per-ring lock.. */ mutex_lock(&gpu->lock);
@@ -27,8 +25,6 @@ static struct dma_fence *msm_job_run(struct drm_sched_job *job)
mutex_unlock(&gpu->lock);
- pm_runtime_put(&gpu->pdev->dev); - return dma_fence_get(submit->hw_fence); }
Free gmu_debug bo while destroying the gpu crashstate.
Signed-off-by: Akhil P Oommen quic_akhilpo@quicinc.com ---
drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c index 4d4588a..09bb993 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c @@ -1054,6 +1054,9 @@ static void a6xx_gpu_state_destroy(struct kref *kref) if (a6xx_state->gmu_hfi) kvfree(a6xx_state->gmu_hfi->data);
+ if (a6xx_state->gmu_debug) + kvfree(a6xx_state->gmu_debug->data); + list_for_each_entry_safe(obj, tmp, &a6xx_state->objs, node) kfree(obj);
dri-devel@lists.freedesktop.org