Hello,
Here's is collection of patches addressing some stability issues.
The first patch has been submitted a while ago but was lacking a way to kill in-flight jobs when a context is closed; which is now addressed in patch 10.
The rest of those patches are improving fault handling (with some code refactoring in the middle).
"drm/panfrost: Do the exception -> string translation using a table" still has a TODO. I basically mapped all exception types to EINVAL since most faults are triggered by invalid job/shaders, but there might be some exceptions that should be translated to something else. Any feedback on that aspect is welcome.
Regards,
Boris
Boris Brezillon (10): drm/panfrost: Make sure MMU context lifetime is not bound to panfrost_priv drm/panfrost: Get rid of the unused JS_STATUS_EVENT_ACTIVE definition drm/panfrost: Drop the pfdev argument passed to panfrost_exception_name() drm/panfrost: Expose exception types to userspace drm/panfrost: Disable the AS on unhandled page faults drm/panfrost: Expose a helper to trigger a GPU reset drm/panfrost: Reset the GPU when the AS_ACTIVE bit is stuck drm/panfrost: Do the exception -> string translation using a table drm/panfrost: Don't reset the GPU on job faults unless we really have to drm/panfrost: Kill in-flight jobs on FD close
drivers/gpu/drm/panfrost/panfrost_device.c | 143 +++++++++++------ drivers/gpu/drm/panfrost/panfrost_device.h | 21 ++- drivers/gpu/drm/panfrost/panfrost_drv.c | 50 ++---- drivers/gpu/drm/panfrost/panfrost_gem.c | 20 ++- drivers/gpu/drm/panfrost/panfrost_gpu.c | 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 74 ++++++--- drivers/gpu/drm/panfrost/panfrost_mmu.c | 173 ++++++++++++++------- drivers/gpu/drm/panfrost/panfrost_mmu.h | 5 +- drivers/gpu/drm/panfrost/panfrost_regs.h | 3 - include/uapi/drm/panfrost_drm.h | 65 ++++++++ 10 files changed, 371 insertions(+), 185 deletions(-)
Jobs can be in-flight when the file descriptor is closed (either because the process did not terminate properly, or because it didn't wait for all GPU jobs to be finished), and apparently panfrost_job_close() does not cancel already running jobs. Let's refcount the MMU context object so it's lifetime is no longer bound to the FD lifetime and running jobs can finish properly without generating spurious page faults.
Reported-by: Icecream95 ixn@keemail.me Fixes: 7282f7645d06 ("drm/panfrost: Implement per FD address spaces") Cc: stable@vger.kernel.org Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.h | 8 +- drivers/gpu/drm/panfrost/panfrost_drv.c | 50 ++----- drivers/gpu/drm/panfrost/panfrost_gem.c | 20 ++- drivers/gpu/drm/panfrost/panfrost_job.c | 4 +- drivers/gpu/drm/panfrost/panfrost_mmu.c | 160 ++++++++++++++------- drivers/gpu/drm/panfrost/panfrost_mmu.h | 5 +- 6 files changed, 136 insertions(+), 111 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index f614e98771e4..8b2cdb8c701d 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -121,8 +121,12 @@ struct panfrost_device { };
struct panfrost_mmu { + struct panfrost_device *pfdev; + struct kref refcount; struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; + struct drm_mm mm; + spinlock_t mm_lock; int as; atomic_t as_count; struct list_head list; @@ -133,9 +137,7 @@ struct panfrost_file_priv {
struct drm_sched_entity sched_entity[NUM_JOB_SLOTS];
- struct panfrost_mmu mmu; - struct drm_mm mm; - spinlock_t mm_lock; + struct panfrost_mmu *mmu; };
static inline struct panfrost_device *to_panfrost_device(struct drm_device *ddev) diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 075ec0ef746c..945133db1857 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -417,7 +417,7 @@ static int panfrost_ioctl_madvise(struct drm_device *dev, void *data, * anyway, so let's not bother. */ if (!list_is_singular(&bo->mappings.list) || - WARN_ON_ONCE(first->mmu != &priv->mmu)) { + WARN_ON_ONCE(first->mmu != priv->mmu)) { ret = -EINVAL; goto out_unlock_mappings; } @@ -449,32 +449,6 @@ int panfrost_unstable_ioctl_check(void) return 0; }
-#define PFN_4G (SZ_4G >> PAGE_SHIFT) -#define PFN_4G_MASK (PFN_4G - 1) -#define PFN_16M (SZ_16M >> PAGE_SHIFT) - -static void panfrost_drm_mm_color_adjust(const struct drm_mm_node *node, - unsigned long color, - u64 *start, u64 *end) -{ - /* Executable buffers can't start or end on a 4GB boundary */ - if (!(color & PANFROST_BO_NOEXEC)) { - u64 next_seg; - - if ((*start & PFN_4G_MASK) == 0) - (*start)++; - - if ((*end & PFN_4G_MASK) == 0) - (*end)--; - - next_seg = ALIGN(*start, PFN_4G); - if (next_seg - *start <= PFN_16M) - *start = next_seg + 1; - - *end = min(*end, ALIGN(*start, PFN_4G) - 1); - } -} - static int panfrost_open(struct drm_device *dev, struct drm_file *file) { @@ -489,15 +463,11 @@ panfrost_open(struct drm_device *dev, struct drm_file *file) panfrost_priv->pfdev = pfdev; file->driver_priv = panfrost_priv;
- spin_lock_init(&panfrost_priv->mm_lock); - - /* 4G enough for now. can be 48-bit */ - drm_mm_init(&panfrost_priv->mm, SZ_32M >> PAGE_SHIFT, (SZ_4G - SZ_32M) >> PAGE_SHIFT); - panfrost_priv->mm.color_adjust = panfrost_drm_mm_color_adjust; - - ret = panfrost_mmu_pgtable_alloc(panfrost_priv); - if (ret) - goto err_pgtable; + panfrost_priv->mmu = panfrost_mmu_ctx_create(pfdev); + if (IS_ERR(panfrost_priv->mmu)) { + ret = PTR_ERR(panfrost_priv->mmu); + goto err_free; + }
ret = panfrost_job_open(panfrost_priv); if (ret) @@ -506,9 +476,8 @@ panfrost_open(struct drm_device *dev, struct drm_file *file) return 0;
err_job: - panfrost_mmu_pgtable_free(panfrost_priv); -err_pgtable: - drm_mm_takedown(&panfrost_priv->mm); + panfrost_mmu_ctx_put(panfrost_priv->mmu); +err_free: kfree(panfrost_priv); return ret; } @@ -521,8 +490,7 @@ panfrost_postclose(struct drm_device *dev, struct drm_file *file) panfrost_perfcnt_close(file); panfrost_job_close(panfrost_priv);
- panfrost_mmu_pgtable_free(panfrost_priv); - drm_mm_takedown(&panfrost_priv->mm); + panfrost_mmu_ctx_put(panfrost_priv->mmu); kfree(panfrost_priv); }
diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c index 3e0723bc36bd..23377481f4e3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c @@ -60,7 +60,7 @@ panfrost_gem_mapping_get(struct panfrost_gem_object *bo,
mutex_lock(&bo->mappings.lock); list_for_each_entry(iter, &bo->mappings.list, node) { - if (iter->mmu == &priv->mmu) { + if (iter->mmu == priv->mmu) { kref_get(&iter->refcount); mapping = iter; break; @@ -74,16 +74,13 @@ panfrost_gem_mapping_get(struct panfrost_gem_object *bo, static void panfrost_gem_teardown_mapping(struct panfrost_gem_mapping *mapping) { - struct panfrost_file_priv *priv; - if (mapping->active) panfrost_mmu_unmap(mapping);
- priv = container_of(mapping->mmu, struct panfrost_file_priv, mmu); - spin_lock(&priv->mm_lock); + spin_lock(&mapping->mmu->mm_lock); if (drm_mm_node_allocated(&mapping->mmnode)) drm_mm_remove_node(&mapping->mmnode); - spin_unlock(&priv->mm_lock); + spin_unlock(&mapping->mmu->mm_lock); }
static void panfrost_gem_mapping_release(struct kref *kref) @@ -94,6 +91,7 @@ static void panfrost_gem_mapping_release(struct kref *kref)
panfrost_gem_teardown_mapping(mapping); drm_gem_object_put(&mapping->obj->base.base); + panfrost_mmu_ctx_put(mapping->mmu); kfree(mapping); }
@@ -143,11 +141,11 @@ int panfrost_gem_open(struct drm_gem_object *obj, struct drm_file *file_priv) else align = size >= SZ_2M ? SZ_2M >> PAGE_SHIFT : 0;
- mapping->mmu = &priv->mmu; - spin_lock(&priv->mm_lock); - ret = drm_mm_insert_node_generic(&priv->mm, &mapping->mmnode, + mapping->mmu = panfrost_mmu_ctx_get(priv->mmu); + spin_lock(&mapping->mmu->mm_lock); + ret = drm_mm_insert_node_generic(&mapping->mmu->mm, &mapping->mmnode, size >> PAGE_SHIFT, align, color, 0); - spin_unlock(&priv->mm_lock); + spin_unlock(&mapping->mmu->mm_lock); if (ret) goto err;
@@ -176,7 +174,7 @@ void panfrost_gem_close(struct drm_gem_object *obj, struct drm_file *file_priv)
mutex_lock(&bo->mappings.lock); list_for_each_entry(iter, &bo->mappings.list, node) { - if (iter->mmu == &priv->mmu) { + if (iter->mmu == priv->mmu) { mapping = iter; list_del(&iter->node); break; diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 2df3e999a38d..3757c6eb3023 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -165,7 +165,7 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js) return; }
- cfg = panfrost_mmu_as_get(pfdev, &job->file_priv->mmu); + cfg = panfrost_mmu_as_get(pfdev, job->file_priv->mmu);
job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF); job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32); @@ -527,7 +527,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data) if (job) { pfdev->jobs[j] = NULL;
- panfrost_mmu_as_put(pfdev, &job->file_priv->mmu); + panfrost_mmu_as_put(pfdev, job->file_priv->mmu); panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
dma_fence_signal_locked(job->done_fence); diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 0581186ebfb3..569509c2ba27 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright 2019 Linaro, Ltd, Rob Herring robh@kernel.org */ + +#include <drm/panfrost_drm.h> + #include <linux/atomic.h> #include <linux/bitfield.h> #include <linux/delay.h> @@ -337,7 +340,7 @@ static void mmu_tlb_inv_context_s1(void *cookie)
static void mmu_tlb_sync_context(void *cookie) { - //struct panfrost_device *pfdev = cookie; + //struct panfrost_mmu *mmu = cookie; // TODO: Wait 1000 GPU cycles for HW_ISSUE_6367/T60X }
@@ -352,57 +355,10 @@ static const struct iommu_flush_ops mmu_tlb_ops = { .tlb_flush_walk = mmu_tlb_flush_walk, };
-int panfrost_mmu_pgtable_alloc(struct panfrost_file_priv *priv) -{ - struct panfrost_mmu *mmu = &priv->mmu; - struct panfrost_device *pfdev = priv->pfdev; - - INIT_LIST_HEAD(&mmu->list); - mmu->as = -1; - - mmu->pgtbl_cfg = (struct io_pgtable_cfg) { - .pgsize_bitmap = SZ_4K | SZ_2M, - .ias = FIELD_GET(0xff, pfdev->features.mmu_features), - .oas = FIELD_GET(0xff00, pfdev->features.mmu_features), - .coherent_walk = pfdev->coherent, - .tlb = &mmu_tlb_ops, - .iommu_dev = pfdev->dev, - }; - - mmu->pgtbl_ops = alloc_io_pgtable_ops(ARM_MALI_LPAE, &mmu->pgtbl_cfg, - priv); - if (!mmu->pgtbl_ops) - return -EINVAL; - - return 0; -} - -void panfrost_mmu_pgtable_free(struct panfrost_file_priv *priv) -{ - struct panfrost_device *pfdev = priv->pfdev; - struct panfrost_mmu *mmu = &priv->mmu; - - spin_lock(&pfdev->as_lock); - if (mmu->as >= 0) { - pm_runtime_get_noresume(pfdev->dev); - if (pm_runtime_active(pfdev->dev)) - panfrost_mmu_disable(pfdev, mmu->as); - pm_runtime_put_autosuspend(pfdev->dev); - - clear_bit(mmu->as, &pfdev->as_alloc_mask); - clear_bit(mmu->as, &pfdev->as_in_use_mask); - list_del(&mmu->list); - } - spin_unlock(&pfdev->as_lock); - - free_io_pgtable_ops(mmu->pgtbl_ops); -} - static struct panfrost_gem_mapping * addr_to_mapping(struct panfrost_device *pfdev, int as, u64 addr) { struct panfrost_gem_mapping *mapping = NULL; - struct panfrost_file_priv *priv; struct drm_mm_node *node; u64 offset = addr >> PAGE_SHIFT; struct panfrost_mmu *mmu; @@ -415,11 +371,10 @@ addr_to_mapping(struct panfrost_device *pfdev, int as, u64 addr) goto out;
found_mmu: - priv = container_of(mmu, struct panfrost_file_priv, mmu);
- spin_lock(&priv->mm_lock); + spin_lock(&mmu->mm_lock);
- drm_mm_for_each_node(node, &priv->mm) { + drm_mm_for_each_node(node, &mmu->mm) { if (offset >= node->start && offset < (node->start + node->size)) { mapping = drm_mm_node_to_panfrost_mapping(node); @@ -429,7 +384,7 @@ addr_to_mapping(struct panfrost_device *pfdev, int as, u64 addr) } }
- spin_unlock(&priv->mm_lock); + spin_unlock(&mmu->mm_lock); out: spin_unlock(&pfdev->as_lock); return mapping; @@ -542,6 +497,107 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, return ret; }
+static void panfrost_mmu_release_ctx(struct kref *kref) +{ + struct panfrost_mmu *mmu = container_of(kref, struct panfrost_mmu, + refcount); + struct panfrost_device *pfdev = mmu->pfdev; + + spin_lock(&pfdev->as_lock); + if (mmu->as >= 0) { + pm_runtime_get_noresume(pfdev->dev); + if (pm_runtime_active(pfdev->dev)) + panfrost_mmu_disable(pfdev, mmu->as); + pm_runtime_put_autosuspend(pfdev->dev); + + clear_bit(mmu->as, &pfdev->as_alloc_mask); + clear_bit(mmu->as, &pfdev->as_in_use_mask); + list_del(&mmu->list); + } + spin_unlock(&pfdev->as_lock); + + free_io_pgtable_ops(mmu->pgtbl_ops); + drm_mm_takedown(&mmu->mm); + kfree(mmu); +} + +void panfrost_mmu_ctx_put(struct panfrost_mmu *mmu) +{ + kref_put(&mmu->refcount, panfrost_mmu_release_ctx); +} + +struct panfrost_mmu *panfrost_mmu_ctx_get(struct panfrost_mmu *mmu) +{ + kref_get(&mmu->refcount); + + return mmu; +} + +#define PFN_4G (SZ_4G >> PAGE_SHIFT) +#define PFN_4G_MASK (PFN_4G - 1) +#define PFN_16M (SZ_16M >> PAGE_SHIFT) + +static void panfrost_drm_mm_color_adjust(const struct drm_mm_node *node, + unsigned long color, + u64 *start, u64 *end) +{ + /* Executable buffers can't start or end on a 4GB boundary */ + if (!(color & PANFROST_BO_NOEXEC)) { + u64 next_seg; + + if ((*start & PFN_4G_MASK) == 0) + (*start)++; + + if ((*end & PFN_4G_MASK) == 0) + (*end)--; + + next_seg = ALIGN(*start, PFN_4G); + if (next_seg - *start <= PFN_16M) + *start = next_seg + 1; + + *end = min(*end, ALIGN(*start, PFN_4G) - 1); + } +} + +struct panfrost_mmu *panfrost_mmu_ctx_create(struct panfrost_device *pfdev) +{ + struct panfrost_mmu *mmu; + + mmu = kzalloc(sizeof(*mmu), GFP_KERNEL); + if (!mmu) + return ERR_PTR(-ENOMEM); + + mmu->pfdev = pfdev; + spin_lock_init(&mmu->mm_lock); + + /* 4G enough for now. can be 48-bit */ + drm_mm_init(&mmu->mm, SZ_32M >> PAGE_SHIFT, (SZ_4G - SZ_32M) >> PAGE_SHIFT); + mmu->mm.color_adjust = panfrost_drm_mm_color_adjust; + + INIT_LIST_HEAD(&mmu->list); + mmu->as = -1; + + mmu->pgtbl_cfg = (struct io_pgtable_cfg) { + .pgsize_bitmap = SZ_4K | SZ_2M, + .ias = FIELD_GET(0xff, pfdev->features.mmu_features), + .oas = FIELD_GET(0xff00, pfdev->features.mmu_features), + .coherent_walk = pfdev->coherent, + .tlb = &mmu_tlb_ops, + .iommu_dev = pfdev->dev, + }; + + mmu->pgtbl_ops = alloc_io_pgtable_ops(ARM_MALI_LPAE, &mmu->pgtbl_cfg, + mmu); + if (!mmu->pgtbl_ops) { + kfree(mmu); + return ERR_PTR(-EINVAL); + } + + kref_init(&mmu->refcount); + + return mmu; +} + static const char *access_type_name(struct panfrost_device *pfdev, u32 fault_status) { diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.h b/drivers/gpu/drm/panfrost/panfrost_mmu.h index 44fc2edf63ce..cc2a0d307feb 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.h +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.h @@ -18,7 +18,8 @@ void panfrost_mmu_reset(struct panfrost_device *pfdev); u32 panfrost_mmu_as_get(struct panfrost_device *pfdev, struct panfrost_mmu *mmu); void panfrost_mmu_as_put(struct panfrost_device *pfdev, struct panfrost_mmu *mmu);
-int panfrost_mmu_pgtable_alloc(struct panfrost_file_priv *priv); -void panfrost_mmu_pgtable_free(struct panfrost_file_priv *priv); +struct panfrost_mmu *panfrost_mmu_ctx_get(struct panfrost_mmu *mmu); +void panfrost_mmu_ctx_put(struct panfrost_mmu *mmu); +struct panfrost_mmu *panfrost_mmu_ctx_create(struct panfrost_device *pfdev);
#endif
Exception types will be defined as an enum in panfrost_drm.h so userspace and use the same definitions if needed.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_regs.h | 3 --- 1 file changed, 3 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h index dc9df5457f1c..1940ff86e49a 100644 --- a/drivers/gpu/drm/panfrost/panfrost_regs.h +++ b/drivers/gpu/drm/panfrost/panfrost_regs.h @@ -262,9 +262,6 @@ #define JS_COMMAND_SOFT_STOP_1 0x06 /* Execute SOFT_STOP if JOB_CHAIN_FLAG is 1 */ #define JS_COMMAND_HARD_STOP_1 0x07 /* Execute HARD_STOP if JOB_CHAIN_FLAG is 1 */
-#define JS_STATUS_EVENT_ACTIVE 0x08 - - /* MMU regs */ #define MMU_INT_RAWSTAT 0x2000 #define MMU_INT_CLEAR 0x2004
Currently unused. We'll add it back if we need per-GPU definitions.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.c | 2 +- drivers/gpu/drm/panfrost/panfrost_device.h | 2 +- drivers/gpu/drm/panfrost/panfrost_gpu.c | 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- drivers/gpu/drm/panfrost/panfrost_mmu.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index a2a09c51eed7..f7f5ca94f910 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -292,7 +292,7 @@ void panfrost_device_fini(struct panfrost_device *pfdev) panfrost_clk_fini(pfdev); }
-const char *panfrost_exception_name(struct panfrost_device *pfdev, u32 exception_code) +const char *panfrost_exception_name(u32 exception_code) { switch (exception_code) { /* Non-Fault Status code */ diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 8b2cdb8c701d..2fe1550da7f8 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -173,6 +173,6 @@ void panfrost_device_reset(struct panfrost_device *pfdev); int panfrost_device_resume(struct device *dev); int panfrost_device_suspend(struct device *dev);
-const char *panfrost_exception_name(struct panfrost_device *pfdev, u32 exception_code); +const char *panfrost_exception_name(u32 exception_code);
#endif diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c index 0e70e27fd8c3..26e4196b6c90 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gpu.c +++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c @@ -33,7 +33,7 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data) address |= gpu_read(pfdev, GPU_FAULT_ADDRESS_LO);
dev_warn(pfdev->dev, "GPU Fault 0x%08x (%s) at 0x%016llx\n", - fault_status & 0xFF, panfrost_exception_name(pfdev, fault_status), + fault_status & 0xFF, panfrost_exception_name(fault_status), address);
if (state & GPU_IRQ_MULTIPLE_FAULT) diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 3757c6eb3023..1be80b3dd5d0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -500,7 +500,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", j, - panfrost_exception_name(pfdev, job_read(pfdev, JS_STATUS(j))), + panfrost_exception_name(job_read(pfdev, JS_STATUS(j))), job_read(pfdev, JS_HEAD_LO(j)), job_read(pfdev, JS_TAIL_LO(j)));
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 569509c2ba27..2a9bf30edc9d 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -675,7 +675,7 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) "TODO", fault_status, (fault_status & (1 << 10) ? "DECODER FAULT" : "SLAVE FAULT"), - exception_type, panfrost_exception_name(pfdev, exception_type), + exception_type, panfrost_exception_name(exception_type), access_type, access_type_name(pfdev, fault_status), source_id);
Job headers contain an exception type field which might be read and converted to a human readable string by tracing tools. Let's expose the exception type as an enum so we share the same definition.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- include/uapi/drm/panfrost_drm.h | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+)
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index 061e700dd06c..9a05d57d0118 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -224,6 +224,71 @@ struct drm_panfrost_madvise { __u32 retained; /* out, whether backing store still exists */ };
+/* The exception types */ + +enum drm_panfrost_exception_type { + DRM_PANFROST_EXCEPTION_OK = 0x00, + DRM_PANFROST_EXCEPTION_DONE = 0x01, + DRM_PANFROST_EXCEPTION_STOPPED = 0x03, + DRM_PANFROST_EXCEPTION_TERMINATED = 0x04, + DRM_PANFROST_EXCEPTION_KABOOM = 0x05, + DRM_PANFROST_EXCEPTION_EUREKA = 0x06, + DRM_PANFROST_EXCEPTION_ACTIVE = 0x08, + DRM_PANFROST_EXCEPTION_JOB_CONFIG_FAULT = 0x40, + DRM_PANFROST_EXCEPTION_JOB_POWER_FAULT = 0x41, + DRM_PANFROST_EXCEPTION_JOB_READ_FAULT = 0x42, + DRM_PANFROST_EXCEPTION_JOB_WRITE_FAULT = 0x43, + DRM_PANFROST_EXCEPTION_JOB_AFFINITY_FAULT = 0x44, + DRM_PANFROST_EXCEPTION_JOB_BUS_FAULT = 0x48, + DRM_PANFROST_EXCEPTION_INSTR_INVALID_PC = 0x50, + DRM_PANFROST_EXCEPTION_INSTR_INVALID_ENC = 0x51, + DRM_PANFROST_EXCEPTION_INSTR_BARRIER_FAULT = 0x55, + DRM_PANFROST_EXCEPTION_DATA_INVALID_FAULT = 0x58, + DRM_PANFROST_EXCEPTION_TILE_RANGE_FAULT = 0x59, + DRM_PANFROST_EXCEPTION_ADDR_RANGE_FAULT = 0x5a, + DRM_PANFROST_EXCEPTION_IMPRECISE_FAULT = 0x5b, + DRM_PANFROST_EXCEPTION_OOM = 0x60, + DRM_PANFROST_EXCEPTION_UNKNOWN = 0x7f, + DRM_PANFROST_EXCEPTION_DELAYED_BUS_FAULT = 0x80, + DRM_PANFROST_EXCEPTION_GPU_SHAREABILITY_FAULT = 0x88, + DRM_PANFROST_EXCEPTION_SYS_SHAREABILITY_FAULT = 0x89, + DRM_PANFROST_EXCEPTION_GPU_CACHEABILITY_FAULT = 0x8a, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_0 = 0xc0, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_1 = 0xc1, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_2 = 0xc2, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_3 = 0xc3, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_4 = 0xc4, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_IDENTITY = 0xc7, + DRM_PANFROST_EXCEPTION_PERM_FAULT_0 = 0xc8, + DRM_PANFROST_EXCEPTION_PERM_FAULT_1 = 0xc9, + DRM_PANFROST_EXCEPTION_PERM_FAULT_2 = 0xca, + DRM_PANFROST_EXCEPTION_PERM_FAULT_3 = 0xcb, + DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_0 = 0xd0, + DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_1 = 0xd1, + DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_2 = 0xd2, + DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_3 = 0xd3, + DRM_PANFROST_EXCEPTION_ACCESS_FLAG_0 = 0xd8, + DRM_PANFROST_EXCEPTION_ACCESS_FLAG_1 = 0xd9, + DRM_PANFROST_EXCEPTION_ACCESS_FLAG_2 = 0xda, + DRM_PANFROST_EXCEPTION_ACCESS_FLAG_3 = 0xdb, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN0 = 0xe0, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN1 = 0xe1, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN2 = 0xe2, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN3 = 0xe3, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT0 = 0xe4, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT1 = 0xe5, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT2 = 0xe6, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT3 = 0xe7, + DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_0 = 0xe8, + DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_1 = 0xe9, + DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_2 = 0xea, + DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_3 = 0xeb, + DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_0 = 0xec, + DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_1 = 0xed, + DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_2 = 0xee, + DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_3 = 0xef, +}; + #if defined(__cplusplus) } #endif
If we don't do that, we have to wait for the job timeout to expire before the fault jobs gets killed.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 2a9bf30edc9d..d5c624e776f1 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -661,7 +661,7 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) if ((status & mask) == BIT(as) && (exception_type & 0xF8) == 0xC0) ret = panfrost_mmu_map_fault_addr(pfdev, as, addr);
- if (ret) + if (ret) { /* terminal fault, print info about the fault */ dev_err(pfdev->dev, "Unhandled Page fault in AS%d at VA 0x%016llX\n" @@ -679,6 +679,10 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) access_type, access_type_name(pfdev, fault_status), source_id);
+ /* Disable the MMU to stop jobs on this AS immediately */ + panfrost_mmu_disable(pfdev, as); + } + status &= ~mask;
/* If we received new MMU interrupts, process them before returning. */
Expose a helper to trigger a GPU reset so we can easily trigger reset operations outside the job timeout handler.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.h | 8 ++++++++ drivers/gpu/drm/panfrost/panfrost_job.c | 4 +--- 2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 2fe1550da7f8..1c6a3597eba0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -175,4 +175,12 @@ int panfrost_device_suspend(struct device *dev);
const char *panfrost_exception_name(u32 exception_code);
+static inline void +panfrost_device_schedule_reset(struct panfrost_device *pfdev) +{ + /* Schedule a reset if there's no reset in progress. */ + if (!atomic_xchg(&pfdev->reset.pending, 1)) + schedule_work(&pfdev->reset.work); +} + #endif diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 1be80b3dd5d0..be5d3e4a1d0a 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -458,9 +458,7 @@ static enum drm_gpu_sched_stat panfrost_job_timedout(struct drm_sched_job if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job)) return DRM_GPU_SCHED_STAT_NOMINAL;
- /* Schedule a reset if there's no reset in progress. */ - if (!atomic_xchg(&pfdev->reset.pending, 1)) - schedule_work(&pfdev->reset.work); + panfrost_device_schedule_reset(pfdev);
return DRM_GPU_SCHED_STAT_NOMINAL; }
Things are unlikely to resolve until we reset the GPU. Let's not wait for other faults/timeout to happen to trigger this reset.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index d5c624e776f1..d20bcaecb78f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -36,8 +36,11 @@ static int wait_ready(struct panfrost_device *pfdev, u32 as_nr) ret = readl_relaxed_poll_timeout_atomic(pfdev->iomem + AS_STATUS(as_nr), val, !(val & AS_STATUS_AS_ACTIVE), 10, 1000);
- if (ret) + if (ret) { + /* The GPU hung, let's trigger a reset */ + panfrost_device_schedule_reset(pfdev); dev_err(pfdev->dev, "AS_ACTIVE bit stuck\n"); + }
return ret; }
Do the exception -> string translation using a table so we can add extra fields if we need to. While at it add an error field to ease the exception -> error conversion which we'll need if we want to set the fence error to something that reflects the exception code.
TODO: fix the error codes.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.c | 134 +++++++++++++-------- drivers/gpu/drm/panfrost/panfrost_device.h | 1 + 2 files changed, 88 insertions(+), 47 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index f7f5ca94f910..2de011cee258 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -292,55 +292,95 @@ void panfrost_device_fini(struct panfrost_device *pfdev) panfrost_clk_fini(pfdev); }
-const char *panfrost_exception_name(u32 exception_code) -{ - switch (exception_code) { - /* Non-Fault Status code */ - case 0x00: return "NOT_STARTED/IDLE/OK"; - case 0x01: return "DONE"; - case 0x02: return "INTERRUPTED"; - case 0x03: return "STOPPED"; - case 0x04: return "TERMINATED"; - case 0x08: return "ACTIVE"; - /* Job exceptions */ - case 0x40: return "JOB_CONFIG_FAULT"; - case 0x41: return "JOB_POWER_FAULT"; - case 0x42: return "JOB_READ_FAULT"; - case 0x43: return "JOB_WRITE_FAULT"; - case 0x44: return "JOB_AFFINITY_FAULT"; - case 0x48: return "JOB_BUS_FAULT"; - case 0x50: return "INSTR_INVALID_PC"; - case 0x51: return "INSTR_INVALID_ENC"; - case 0x52: return "INSTR_TYPE_MISMATCH"; - case 0x53: return "INSTR_OPERAND_FAULT"; - case 0x54: return "INSTR_TLS_FAULT"; - case 0x55: return "INSTR_BARRIER_FAULT"; - case 0x56: return "INSTR_ALIGN_FAULT"; - case 0x58: return "DATA_INVALID_FAULT"; - case 0x59: return "TILE_RANGE_FAULT"; - case 0x5A: return "ADDR_RANGE_FAULT"; - case 0x60: return "OUT_OF_MEMORY"; - /* GPU exceptions */ - case 0x80: return "DELAYED_BUS_FAULT"; - case 0x88: return "SHAREABILITY_FAULT"; - /* MMU exceptions */ - case 0xC1: return "TRANSLATION_FAULT_LEVEL1"; - case 0xC2: return "TRANSLATION_FAULT_LEVEL2"; - case 0xC3: return "TRANSLATION_FAULT_LEVEL3"; - case 0xC4: return "TRANSLATION_FAULT_LEVEL4"; - case 0xC8: return "PERMISSION_FAULT"; - case 0xC9 ... 0xCF: return "PERMISSION_FAULT"; - case 0xD1: return "TRANSTAB_BUS_FAULT_LEVEL1"; - case 0xD2: return "TRANSTAB_BUS_FAULT_LEVEL2"; - case 0xD3: return "TRANSTAB_BUS_FAULT_LEVEL3"; - case 0xD4: return "TRANSTAB_BUS_FAULT_LEVEL4"; - case 0xD8: return "ACCESS_FLAG"; - case 0xD9 ... 0xDF: return "ACCESS_FLAG"; - case 0xE0 ... 0xE7: return "ADDRESS_SIZE_FAULT"; - case 0xE8 ... 0xEF: return "MEMORY_ATTRIBUTES_FAULT"; +#define PANFROST_EXCEPTION(id, err) \ + [DRM_PANFROST_EXCEPTION_ ## id] = { \ + .name = #id, \ + .error = err, \ }
- return "UNKNOWN"; +struct panfrost_exception_info { + const char *name; + int error; +}; + +static const struct panfrost_exception_info panfrost_exception_infos[] = { + PANFROST_EXCEPTION(OK, 0), + PANFROST_EXCEPTION(DONE, 0), + PANFROST_EXCEPTION(STOPPED, 0), + PANFROST_EXCEPTION(TERMINATED, 0), + PANFROST_EXCEPTION(KABOOM, 0), + PANFROST_EXCEPTION(EUREKA, 0), + PANFROST_EXCEPTION(ACTIVE, 0), + PANFROST_EXCEPTION(JOB_CONFIG_FAULT, -EINVAL), + PANFROST_EXCEPTION(JOB_POWER_FAULT, -ECANCELED), + PANFROST_EXCEPTION(JOB_READ_FAULT, -EINVAL), + PANFROST_EXCEPTION(JOB_WRITE_FAULT, -EINVAL), + PANFROST_EXCEPTION(JOB_AFFINITY_FAULT, -EINVAL), + PANFROST_EXCEPTION(JOB_BUS_FAULT, -EINVAL), + PANFROST_EXCEPTION(INSTR_INVALID_PC, -EINVAL), + PANFROST_EXCEPTION(INSTR_INVALID_ENC, -EINVAL), + PANFROST_EXCEPTION(INSTR_BARRIER_FAULT, -EINVAL), + PANFROST_EXCEPTION(DATA_INVALID_FAULT, -EINVAL), + PANFROST_EXCEPTION(TILE_RANGE_FAULT, -EINVAL), + PANFROST_EXCEPTION(ADDR_RANGE_FAULT, -EINVAL), + PANFROST_EXCEPTION(IMPRECISE_FAULT, -EINVAL), + PANFROST_EXCEPTION(OOM, -ENOMEM), + PANFROST_EXCEPTION(UNKNOWN, -EINVAL), + PANFROST_EXCEPTION(DELAYED_BUS_FAULT, -EINVAL), + PANFROST_EXCEPTION(GPU_SHAREABILITY_FAULT, -ECANCELED), + PANFROST_EXCEPTION(SYS_SHAREABILITY_FAULT, -ECANCELED), + PANFROST_EXCEPTION(GPU_CACHEABILITY_FAULT, -ECANCELED), + PANFROST_EXCEPTION(TRANSLATION_FAULT_0, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_1, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_2, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_3, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_4, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_IDENTITY, -EINVAL), + PANFROST_EXCEPTION(PERM_FAULT_0, -EINVAL), + PANFROST_EXCEPTION(PERM_FAULT_1, -EINVAL), + PANFROST_EXCEPTION(PERM_FAULT_2, -EINVAL), + PANFROST_EXCEPTION(PERM_FAULT_3, -EINVAL), + PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_0, -EINVAL), + PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_1, -EINVAL), + PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_2, -EINVAL), + PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_3, -EINVAL), + PANFROST_EXCEPTION(ACCESS_FLAG_0, -EINVAL), + PANFROST_EXCEPTION(ACCESS_FLAG_1, -EINVAL), + PANFROST_EXCEPTION(ACCESS_FLAG_2, -EINVAL), + PANFROST_EXCEPTION(ACCESS_FLAG_3, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN0, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN1, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN2, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN3, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT0, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT1, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT2, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT3, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_FAULT_0, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_FAULT_1, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_FAULT_2, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_FAULT_3, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_0, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_1, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_2, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_3, -EINVAL), +}; + +const char *panfrost_exception_name(u32 exception_code) +{ + if (WARN_ON(exception_code >= ARRAY_SIZE(panfrost_exception_infos) || + !panfrost_exception_infos[exception_code].name)) + return "UNKNOWN"; + + return panfrost_exception_infos[exception_code].name; +} + +int panfrost_exception_to_error(u32 exception_code) +{ + if (WARN_ON(exception_code >= ARRAY_SIZE(panfrost_exception_infos))) + return 0; + + return panfrost_exception_infos[exception_code].error; }
void panfrost_device_reset(struct panfrost_device *pfdev) diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 1c6a3597eba0..498c7b5dccd0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -174,6 +174,7 @@ int panfrost_device_resume(struct device *dev); int panfrost_device_suspend(struct device *dev);
const char *panfrost_exception_name(u32 exception_code); +int panfrost_exception_to_error(u32 exception_code);
static inline void panfrost_device_schedule_reset(struct panfrost_device *pfdev)
If we can recover from a fault without a reset there's no reason to issue one.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.c | 9 ++++++ drivers/gpu/drm/panfrost/panfrost_device.h | 2 ++ drivers/gpu/drm/panfrost/panfrost_job.c | 35 ++++++++++++++-------- 3 files changed, 34 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index 2de011cee258..ac76e8646e97 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -383,6 +383,15 @@ int panfrost_exception_to_error(u32 exception_code) return panfrost_exception_infos[exception_code].error; }
+bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev, + u32 exception_code) +{ + /* Right now, none of the GPU we support need a reset, but this + * might change (e.g. Valhall GPUs require a when a BUS_FAULT occurs). + */ + return false; +} + void panfrost_device_reset(struct panfrost_device *pfdev) { panfrost_gpu_soft_reset(pfdev); diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 498c7b5dccd0..95e6044008d2 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -175,6 +175,8 @@ int panfrost_device_suspend(struct device *dev);
const char *panfrost_exception_name(u32 exception_code); int panfrost_exception_to_error(u32 exception_code); +bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev, + u32 exception_code);
static inline void panfrost_device_schedule_reset(struct panfrost_device *pfdev) diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index be5d3e4a1d0a..aedc604d331c 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -493,27 +493,38 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
if (status & JOB_INT_MASK_ERR(j)) { enum panfrost_queue_status old_status; + u32 js_status = job_read(pfdev, JS_STATUS(j));
job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);
dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", j, - panfrost_exception_name(job_read(pfdev, JS_STATUS(j))), + panfrost_exception_name(js_status), job_read(pfdev, JS_HEAD_LO(j)), job_read(pfdev, JS_TAIL_LO(j)));
- /* - * When the queue is being restarted we don't report - * faults directly to avoid races between the timeout - * and reset handlers. panfrost_scheduler_start() will - * call drm_sched_fault() after the queue has been - * started if status == FAULT_PENDING. + /* If we need a reset, signal it to the reset handler, + * otherwise, update the fence error field and signal + * the job fence. */ - old_status = atomic_cmpxchg(&pfdev->js->queue[j].status, - PANFROST_QUEUE_STATUS_STARTING, - PANFROST_QUEUE_STATUS_FAULT_PENDING); - if (old_status == PANFROST_QUEUE_STATUS_ACTIVE) - drm_sched_fault(&pfdev->js->queue[j].sched); + if (panfrost_exception_needs_reset(pfdev, js_status)) { + /* + * When the queue is being restarted we don't report + * faults directly to avoid races between the timeout + * and reset handlers. panfrost_scheduler_start() will + * call drm_sched_fault() after the queue has been + * started if status == FAULT_PENDING. + */ + old_status = atomic_cmpxchg(&pfdev->js->queue[j].status, + PANFROST_QUEUE_STATUS_STARTING, + PANFROST_QUEUE_STATUS_FAULT_PENDING); + if (old_status == PANFROST_QUEUE_STATUS_ACTIVE) + drm_sched_fault(&pfdev->js->queue[j].sched); + } else { + dma_fence_set_error(pfdev->jobs[j]->done_fence, + panfrost_exception_to_error(js_status)); + status |= JOB_INT_MASK_DONE(j); + } }
if (status & JOB_INT_MASK_DONE(j)) {
If the process who submitted these jobs decided to close the FD before the jobs are done it probably means it doesn't care about the result.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_job.c | 33 +++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index aedc604d331c..a51fa0a81367 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -494,14 +494,22 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data) if (status & JOB_INT_MASK_ERR(j)) { enum panfrost_queue_status old_status; u32 js_status = job_read(pfdev, JS_STATUS(j)); + int error = panfrost_exception_to_error(js_status); + const char *exception_name = panfrost_exception_name(js_status);
job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);
- dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", - j, - panfrost_exception_name(js_status), - job_read(pfdev, JS_HEAD_LO(j)), - job_read(pfdev, JS_TAIL_LO(j))); + if (!error) { + dev_dbg(pfdev->dev, "js interrupt, js=%d, status=%s, head=0x%x, tail=0x%x", + j, exception_name, + job_read(pfdev, JS_HEAD_LO(j)), + job_read(pfdev, JS_TAIL_LO(j))); + } else { + dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", + j, exception_name, + job_read(pfdev, JS_HEAD_LO(j)), + job_read(pfdev, JS_TAIL_LO(j))); + }
/* If we need a reset, signal it to the reset handler, * otherwise, update the fence error field and signal @@ -688,10 +696,25 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)
void panfrost_job_close(struct panfrost_file_priv *panfrost_priv) { + struct panfrost_device *pfdev = panfrost_priv->pfdev; + unsigned long flags; int i;
for (i = 0; i < NUM_JOB_SLOTS; i++) drm_sched_entity_destroy(&panfrost_priv->sched_entity[i]); + + /* Kill in-flight jobs */ + spin_lock_irqsave(&pfdev->js->job_lock, flags); + for (i = 0; i < NUM_JOB_SLOTS; i++) { + struct drm_sched_entity *entity = &panfrost_priv->sched_entity[i]; + struct panfrost_job *job = pfdev->jobs[i]; + + if (!job || job->base.entity != entity) + continue; + + job_write(pfdev, JS_COMMAND(i), JS_COMMAND_HARD_STOP); + } + spin_unlock_irqrestore(&pfdev->js->job_lock, flags); }
int panfrost_job_is_idle(struct panfrost_device *pfdev)
dri-devel@lists.freedesktop.org