From: Rob Clark robdclark@chromium.org
Currently, for GL_EXT_robustness userspace uses the global and per- submitqueue fault counters to determine GUILTY_CONTEXT_RESET_EXT vs INNOCENT_CONTEXT_RESET_EXT. But that is a bit overly paranoid, in that a fault in a different process's context (when it has it's own isolated address space) should not hurt anything.
This is particularly annoying with CrOS and chrome's exit_on_context_lost quirk, while running deqp in the android container, as the deqp-egl suite has tests that intentionally trigger gpu hangs (for the purpose of testing the robustness extension), which triggers chrome to restart, which restarts the android container!
This new param gives userspace a way to ignore faults triggered by other processes.
Applies on top of https://patchwork.freedesktop.org/series/98907/
Rob Clark (2): drm/msm/gpu: Add ctx to get_param() drm/msm/gpu: Add param to get address space faults
drivers/gpu/drm/msm/adreno/adreno_gpu.c | 6 +++++- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 3 ++- drivers/gpu/drm/msm/msm_drv.c | 4 +++- drivers/gpu/drm/msm/msm_gem.h | 3 +++ drivers/gpu/drm/msm/msm_gpu.c | 1 + drivers/gpu/drm/msm/msm_gpu.h | 3 ++- drivers/gpu/drm/msm/msm_rd.c | 6 ++++-- include/uapi/drm/msm_drm.h | 3 ++- 8 files changed, 22 insertions(+), 7 deletions(-)
From: Rob Clark robdclark@chromium.org
Prep work for next patch.
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 ++- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 3 ++- drivers/gpu/drm/msm/msm_drv.c | 3 ++- drivers/gpu/drm/msm/msm_gpu.h | 3 ++- drivers/gpu/drm/msm/msm_rd.c | 6 ++++-- 5 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index f33cfa4ef1c8..caa9076197de 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -227,7 +227,8 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, return aspace; }
-int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) +int adreno_get_param(struct msm_gpu *gpu, struct msm_file_private *ctx, + uint32_t param, uint64_t *value) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index cffabe7d33c1..432590036b31 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -279,7 +279,8 @@ static inline int adreno_is_a650_family(struct adreno_gpu *gpu) adreno_is_a660_family(gpu); }
-int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value); +int adreno_get_param(struct msm_gpu *gpu, struct msm_file_private *ctx, + uint32_t param, uint64_t *value); const struct firmware *adreno_request_fw(struct adreno_gpu *adreno_gpu, const char *fwname); struct drm_gem_object *adreno_fw_create_bo(struct msm_gpu *gpu, diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 555666e3f960..72060247e43c 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -763,7 +763,8 @@ static int msm_ioctl_get_param(struct drm_device *dev, void *data, if (!gpu) return -ENXIO;
- return gpu->funcs->get_param(gpu, args->param, &args->value); + return gpu->funcs->get_param(gpu, file->driver_priv, + args->param, &args->value); }
static int msm_ioctl_gem_new(struct drm_device *dev, void *data, diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 92aa1e9196c6..ba8407231340 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -42,7 +42,8 @@ struct msm_gpu_config { * + z180_gpu */ struct msm_gpu_funcs { - int (*get_param)(struct msm_gpu *gpu, uint32_t param, uint64_t *value); + int (*get_param)(struct msm_gpu *gpu, struct msm_file_private *ctx, + uint32_t param, uint64_t *value); int (*hw_init)(struct msm_gpu *gpu); int (*pm_suspend)(struct msm_gpu *gpu); int (*pm_resume)(struct msm_gpu *gpu); diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c index 7e4d6460719e..dd3605b46264 100644 --- a/drivers/gpu/drm/msm/msm_rd.c +++ b/drivers/gpu/drm/msm/msm_rd.c @@ -197,13 +197,15 @@ static int rd_open(struct inode *inode, struct file *file)
/* the parsing tools need to know gpu-id to know which * register database to load. + * + * Note: These particular param does not require a context */ - gpu->funcs->get_param(gpu, MSM_PARAM_GPU_ID, &val); + gpu->funcs->get_param(gpu, NULL, MSM_PARAM_GPU_ID, &val); gpu_id = val;
rd_write_section(rd, RD_GPU_ID, &gpu_id, sizeof(gpu_id));
- gpu->funcs->get_param(gpu, MSM_PARAM_CHIP_ID, &val); + gpu->funcs->get_param(gpu, NULL, MSM_PARAM_CHIP_ID, &val); rd_write_section(rd, RD_CHIP_ID, &val, sizeof(val));
out:
From: Rob Clark robdclark@chromium.org
Add a param so that userspace can query the fault count for faults that might effect them (ie. any context sharing the same address space).
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 +++ drivers/gpu/drm/msm/msm_drv.c | 1 + drivers/gpu/drm/msm/msm_gem.h | 3 +++ drivers/gpu/drm/msm/msm_gpu.c | 1 + include/uapi/drm/msm_drm.h | 3 ++- 5 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index caa9076197de..05899c77ca38 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -271,6 +271,9 @@ int adreno_get_param(struct msm_gpu *gpu, struct msm_file_private *ctx, case MSM_PARAM_FAULTS: *value = gpu->global_faults; return 0; + case MSM_PARAM_AS_FAULTS: + *value = ctx->aspace->faults; + return 0; case MSM_PARAM_SUSPENDS: *value = gpu->suspend_count; return 0; diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 72060247e43c..39ab9a361d7f 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -41,6 +41,7 @@ * - 1.6.0 - Syncobj support * - 1.7.0 - Add MSM_PARAM_SUSPENDS to access suspend count * - 1.8.0 - Add MSM_BO_CACHED_COHERENT for supported GPUs (a6xx) + * - 1.9.0 - Add MSM_PARAM_AS_FAULTS */ #define MSM_VERSION_MAJOR 1 #define MSM_VERSION_MINOR 8 diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 54ca0817d807..af612add5264 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -35,6 +35,9 @@ struct msm_gem_address_space { * will be non-NULL: */ struct pid *pid; + + /* @faults: the number of GPU hangs associated with this address space */ + int faults; };
struct msm_gem_vma { diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 2c1049c0ea14..1029a51cb016 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -372,6 +372,7 @@ static void recover_worker(struct kthread_work *work) /* Increment the fault counts */ gpu->global_faults++; submit->queue->faults++; + submit->aspace->faults++;
task = get_pid_task(submit->pid, PIDTYPE_PID); if (task) { diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 6b8fffc28a50..b7d92868b945 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -75,8 +75,9 @@ struct drm_msm_timespec { #define MSM_PARAM_GMEM_BASE 0x06 #define MSM_PARAM_PRIORITIES 0x07 /* The # of priority levels */ #define MSM_PARAM_PP_PGTABLE 0x08 /* => 1 for per-process pagetables, else 0 */ -#define MSM_PARAM_FAULTS 0x09 +#define MSM_PARAM_FAULTS 0x09 /* global fault count */ #define MSM_PARAM_SUSPENDS 0x0a +#define MSM_PARAM_AS_FAULTS 0x0b /* faults in any context sharing same address space */
/* For backwards compat. The original support for preemption was based on * a single ring per priority level so # of priority levels equals the #
dri-devel@lists.freedesktop.org