[PATCH v2 00/12] drm/panfrost: Misc fixes/improvements

List overview All Threads
Download

newer

older

Re: [Linaro-mm-sig] [PATCH v3 1/2]...

[PULL] drm-misc-next-fixes

Boris Brezillon

21 Jun 2021 21 Jun '21

1:38 p.m.

Hello,

Sorry for the noise, but I forgot 2 fixes (one fixing the error set to the sched fence when the driver fence allocation fails, and the other one shrinking the dma signalling section to get rid of spurious lockdep warnings).

The rest of the series hasn't changed.

The first patch has been submitted a while ago but was lacking a way to kill in-flight jobs when a context is closed; which is now addressed in patch 10.

The rest of those patches are improving fault handling (with some code refactoring in the middle).

"drm/panfrost: Do the exception -> string translation using a table" still has a TODO. I basically mapped all exception types to EINVAL since most faults are triggered by invalid job/shaders, but there might be some exceptions that should be translated to something else. Any feedback on that aspect is welcome.

Regards,

Boris

Changes in v2: * Added patch 11 and 12

Boris Brezillon (12): drm/panfrost: Make sure MMU context lifetime is not bound to panfrost_priv drm/panfrost: Get rid of the unused JS_STATUS_EVENT_ACTIVE definition drm/panfrost: Drop the pfdev argument passed to panfrost_exception_name() drm/panfrost: Expose exception types to userspace drm/panfrost: Disable the AS on unhandled page faults drm/panfrost: Expose a helper to trigger a GPU reset drm/panfrost: Reset the GPU when the AS_ACTIVE bit is stuck drm/panfrost: Do the exception -> string translation using a table drm/panfrost: Don't reset the GPU on job faults unless we really have to drm/panfrost: Kill in-flight jobs on FD close drm/panfrost: Make ->run_job() return an ERR_PTR() when appropriate drm/panfrost: Shorten the fence signalling section

drivers/gpu/drm/panfrost/panfrost_device.c | 143 +++++++++++------ drivers/gpu/drm/panfrost/panfrost_device.h | 21 ++- drivers/gpu/drm/panfrost/panfrost_drv.c | 50 ++---- drivers/gpu/drm/panfrost/panfrost_gem.c | 20 ++- drivers/gpu/drm/panfrost/panfrost_gpu.c | 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 83 ++++++---- drivers/gpu/drm/panfrost/panfrost_mmu.c | 173 ++++++++++++++------- drivers/gpu/drm/panfrost/panfrost_mmu.h | 5 +- drivers/gpu/drm/panfrost/panfrost_regs.h | 3 - include/uapi/drm/panfrost_drm.h | 65 ++++++++ 10 files changed, 375 insertions(+), 190 deletions(-)

-- 2.31.1

Show replies by date

Boris Brezillon

21 Jun 21 Jun

1:38 p.m.

New subject: [PATCH v2 01/12] drm/panfrost: Make sure MMU context lifetime is not bound to panfrost_priv

Jobs can be in-flight when the file descriptor is closed (either because the process did not terminate properly, or because it didn't wait for all GPU jobs to be finished), and apparently panfrost_job_close() does not cancel already running jobs. Let's refcount the MMU context object so it's lifetime is no longer bound to the FD lifetime and running jobs can finish properly without generating spurious page faults.

Reported-by: Icecream95 ixn@keemail.me Fixes: 7282f7645d06 ("drm/panfrost: Implement per FD address spaces") Cc: stable@vger.kernel.org Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.h | 8 +- drivers/gpu/drm/panfrost/panfrost_drv.c | 50 ++----- drivers/gpu/drm/panfrost/panfrost_gem.c | 20 ++- drivers/gpu/drm/panfrost/panfrost_job.c | 4 +- drivers/gpu/drm/panfrost/panfrost_mmu.c | 160 ++++++++++++++------- drivers/gpu/drm/panfrost/panfrost_mmu.h | 5 +- 6 files changed, 136 insertions(+), 111 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index f614e98771e4..8b2cdb8c701d 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -121,8 +121,12 @@ struct panfrost_device { };

struct panfrost_mmu { + struct panfrost_device *pfdev; + struct kref refcount; struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; + struct drm_mm mm; + spinlock_t mm_lock; int as; atomic_t as_count; struct list_head list; @@ -133,9 +137,7 @@ struct panfrost_file_priv {

struct drm_sched_entity sched_entity[NUM_JOB_SLOTS];

- struct panfrost_mmu mmu; - struct drm_mm mm; - spinlock_t mm_lock; + struct panfrost_mmu *mmu; };

static inline struct panfrost_device *to_panfrost_device(struct drm_device *ddev) diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 075ec0ef746c..945133db1857 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -417,7 +417,7 @@ static int panfrost_ioctl_madvise(struct drm_device *dev, void *data, * anyway, so let's not bother. */ if (!list_is_singular(&bo->mappings.list) || - WARN_ON_ONCE(first->mmu != &priv->mmu)) { + WARN_ON_ONCE(first->mmu != priv->mmu)) { ret = -EINVAL; goto out_unlock_mappings; } @@ -449,32 +449,6 @@ int panfrost_unstable_ioctl_check(void) return 0; }

-#define PFN_4G (SZ_4G >> PAGE_SHIFT) -#define PFN_4G_MASK (PFN_4G - 1) -#define PFN_16M (SZ_16M >> PAGE_SHIFT) - -static void panfrost_drm_mm_color_adjust(const struct drm_mm_node *node, - unsigned long color, - u64 *start, u64 *end) -{ - /* Executable buffers can't start or end on a 4GB boundary */ - if (!(color & PANFROST_BO_NOEXEC)) { - u64 next_seg; - - if ((*start & PFN_4G_MASK) == 0) - (*start)++; - - if ((*end & PFN_4G_MASK) == 0) - (*end)--; - - next_seg = ALIGN(*start, PFN_4G); - if (next_seg - *start <= PFN_16M) - *start = next_seg + 1; - - *end = min(*end, ALIGN(*start, PFN_4G) - 1); - } -} - static int panfrost_open(struct drm_device *dev, struct drm_file *file) { @@ -489,15 +463,11 @@ panfrost_open(struct drm_device *dev, struct drm_file *file) panfrost_priv->pfdev = pfdev; file->driver_priv = panfrost_priv;

- spin_lock_init(&panfrost_priv->mm_lock); - - /* 4G enough for now. can be 48-bit */ - drm_mm_init(&panfrost_priv->mm, SZ_32M >> PAGE_SHIFT, (SZ_4G - SZ_32M) >> PAGE_SHIFT); - panfrost_priv->mm.color_adjust = panfrost_drm_mm_color_adjust; - - ret = panfrost_mmu_pgtable_alloc(panfrost_priv); - if (ret) - goto err_pgtable; + panfrost_priv->mmu = panfrost_mmu_ctx_create(pfdev); + if (IS_ERR(panfrost_priv->mmu)) { + ret = PTR_ERR(panfrost_priv->mmu); + goto err_free; + }

ret = panfrost_job_open(panfrost_priv); if (ret) @@ -506,9 +476,8 @@ panfrost_open(struct drm_device *dev, struct drm_file *file) return 0;

err_job: - panfrost_mmu_pgtable_free(panfrost_priv); -err_pgtable: - drm_mm_takedown(&panfrost_priv->mm); + panfrost_mmu_ctx_put(panfrost_priv->mmu); +err_free: kfree(panfrost_priv); return ret; } @@ -521,8 +490,7 @@ panfrost_postclose(struct drm_device *dev, struct drm_file *file) panfrost_perfcnt_close(file); panfrost_job_close(panfrost_priv);

- panfrost_mmu_pgtable_free(panfrost_priv); - drm_mm_takedown(&panfrost_priv->mm); + panfrost_mmu_ctx_put(panfrost_priv->mmu); kfree(panfrost_priv); }

diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c index 3e0723bc36bd..23377481f4e3 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c @@ -60,7 +60,7 @@ panfrost_gem_mapping_get(struct panfrost_gem_object *bo,

mutex_lock(&bo->mappings.lock); list_for_each_entry(iter, &bo->mappings.list, node) { - if (iter->mmu == &priv->mmu) { + if (iter->mmu == priv->mmu) { kref_get(&iter->refcount); mapping = iter; break; @@ -74,16 +74,13 @@ panfrost_gem_mapping_get(struct panfrost_gem_object *bo, static void panfrost_gem_teardown_mapping(struct panfrost_gem_mapping *mapping) { - struct panfrost_file_priv *priv; - if (mapping->active) panfrost_mmu_unmap(mapping);

- priv = container_of(mapping->mmu, struct panfrost_file_priv, mmu); - spin_lock(&priv->mm_lock); + spin_lock(&mapping->mmu->mm_lock); if (drm_mm_node_allocated(&mapping->mmnode)) drm_mm_remove_node(&mapping->mmnode); - spin_unlock(&priv->mm_lock); + spin_unlock(&mapping->mmu->mm_lock); }

static void panfrost_gem_mapping_release(struct kref *kref) @@ -94,6 +91,7 @@ static void panfrost_gem_mapping_release(struct kref *kref)

panfrost_gem_teardown_mapping(mapping); drm_gem_object_put(&mapping->obj->base.base); + panfrost_mmu_ctx_put(mapping->mmu); kfree(mapping); }

@@ -143,11 +141,11 @@ int panfrost_gem_open(struct drm_gem_object *obj, struct drm_file *file_priv) else align = size >= SZ_2M ? SZ_2M >> PAGE_SHIFT : 0;

- mapping->mmu = &priv->mmu; - spin_lock(&priv->mm_lock); - ret = drm_mm_insert_node_generic(&priv->mm, &mapping->mmnode, + mapping->mmu = panfrost_mmu_ctx_get(priv->mmu); + spin_lock(&mapping->mmu->mm_lock); + ret = drm_mm_insert_node_generic(&mapping->mmu->mm, &mapping->mmnode, size >> PAGE_SHIFT, align, color, 0); - spin_unlock(&priv->mm_lock); + spin_unlock(&mapping->mmu->mm_lock); if (ret) goto err;

@@ -176,7 +174,7 @@ void panfrost_gem_close(struct drm_gem_object *obj, struct drm_file *file_priv)

mutex_lock(&bo->mappings.lock); list_for_each_entry(iter, &bo->mappings.list, node) { - if (iter->mmu == &priv->mmu) { + if (iter->mmu == priv->mmu) { mapping = iter; list_del(&iter->node); break; diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 2df3e999a38d..3757c6eb3023 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -165,7 +165,7 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js) return; }

- cfg = panfrost_mmu_as_get(pfdev, &job->file_priv->mmu); + cfg = panfrost_mmu_as_get(pfdev, job->file_priv->mmu);

job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF); job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32); @@ -527,7 +527,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data) if (job) { pfdev->jobs[j] = NULL;

- panfrost_mmu_as_put(pfdev, &job->file_priv->mmu); + panfrost_mmu_as_put(pfdev, job->file_priv->mmu); panfrost_devfreq_record_idle(&pfdev->pfdevfreq);

dma_fence_signal_locked(job->done_fence); diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 0581186ebfb3..569509c2ba27 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright 2019 Linaro, Ltd, Rob Herring robh@kernel.org */ + +#include <drm/panfrost_drm.h> + #include <linux/atomic.h> #include <linux/bitfield.h> #include <linux/delay.h> @@ -337,7 +340,7 @@ static void mmu_tlb_inv_context_s1(void *cookie)

static void mmu_tlb_sync_context(void *cookie) { - //struct panfrost_device *pfdev = cookie; + //struct panfrost_mmu *mmu = cookie; // TODO: Wait 1000 GPU cycles for HW_ISSUE_6367/T60X }

@@ -352,57 +355,10 @@ static const struct iommu_flush_ops mmu_tlb_ops = { .tlb_flush_walk = mmu_tlb_flush_walk, };

-int panfrost_mmu_pgtable_alloc(struct panfrost_file_priv *priv) -{ - struct panfrost_mmu *mmu = &priv->mmu; - struct panfrost_device *pfdev = priv->pfdev; - - INIT_LIST_HEAD(&mmu->list); - mmu->as = -1; - - mmu->pgtbl_cfg = (struct io_pgtable_cfg) { - .pgsize_bitmap = SZ_4K | SZ_2M, - .ias = FIELD_GET(0xff, pfdev->features.mmu_features), - .oas = FIELD_GET(0xff00, pfdev->features.mmu_features), - .coherent_walk = pfdev->coherent, - .tlb = &mmu_tlb_ops, - .iommu_dev = pfdev->dev, - }; - - mmu->pgtbl_ops = alloc_io_pgtable_ops(ARM_MALI_LPAE, &mmu->pgtbl_cfg, - priv); - if (!mmu->pgtbl_ops) - return -EINVAL; - - return 0; -} - -void panfrost_mmu_pgtable_free(struct panfrost_file_priv *priv) -{ - struct panfrost_device *pfdev = priv->pfdev; - struct panfrost_mmu *mmu = &priv->mmu; - - spin_lock(&pfdev->as_lock); - if (mmu->as >= 0) { - pm_runtime_get_noresume(pfdev->dev); - if (pm_runtime_active(pfdev->dev)) - panfrost_mmu_disable(pfdev, mmu->as); - pm_runtime_put_autosuspend(pfdev->dev); - - clear_bit(mmu->as, &pfdev->as_alloc_mask); - clear_bit(mmu->as, &pfdev->as_in_use_mask); - list_del(&mmu->list); - } - spin_unlock(&pfdev->as_lock); - - free_io_pgtable_ops(mmu->pgtbl_ops); -} - static struct panfrost_gem_mapping * addr_to_mapping(struct panfrost_device *pfdev, int as, u64 addr) { struct panfrost_gem_mapping *mapping = NULL; - struct panfrost_file_priv *priv; struct drm_mm_node *node; u64 offset = addr >> PAGE_SHIFT; struct panfrost_mmu *mmu; @@ -415,11 +371,10 @@ addr_to_mapping(struct panfrost_device *pfdev, int as, u64 addr) goto out;

found_mmu: - priv = container_of(mmu, struct panfrost_file_priv, mmu);

- spin_lock(&priv->mm_lock); + spin_lock(&mmu->mm_lock);

- drm_mm_for_each_node(node, &priv->mm) { + drm_mm_for_each_node(node, &mmu->mm) { if (offset >= node->start && offset < (node->start + node->size)) { mapping = drm_mm_node_to_panfrost_mapping(node); @@ -429,7 +384,7 @@ addr_to_mapping(struct panfrost_device *pfdev, int as, u64 addr) } }

- spin_unlock(&priv->mm_lock); + spin_unlock(&mmu->mm_lock); out: spin_unlock(&pfdev->as_lock); return mapping; @@ -542,6 +497,107 @@ static int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, return ret; }

+static void panfrost_mmu_release_ctx(struct kref *kref) +{ + struct panfrost_mmu *mmu = container_of(kref, struct panfrost_mmu, + refcount); + struct panfrost_device *pfdev = mmu->pfdev; + + spin_lock(&pfdev->as_lock); + if (mmu->as >= 0) { + pm_runtime_get_noresume(pfdev->dev); + if (pm_runtime_active(pfdev->dev)) + panfrost_mmu_disable(pfdev, mmu->as); + pm_runtime_put_autosuspend(pfdev->dev); + + clear_bit(mmu->as, &pfdev->as_alloc_mask); + clear_bit(mmu->as, &pfdev->as_in_use_mask); + list_del(&mmu->list); + } + spin_unlock(&pfdev->as_lock); + + free_io_pgtable_ops(mmu->pgtbl_ops); + drm_mm_takedown(&mmu->mm); + kfree(mmu); +} + +void panfrost_mmu_ctx_put(struct panfrost_mmu *mmu) +{ + kref_put(&mmu->refcount, panfrost_mmu_release_ctx); +} + +struct panfrost_mmu *panfrost_mmu_ctx_get(struct panfrost_mmu *mmu) +{ + kref_get(&mmu->refcount); + + return mmu; +} + +#define PFN_4G (SZ_4G >> PAGE_SHIFT) +#define PFN_4G_MASK (PFN_4G - 1) +#define PFN_16M (SZ_16M >> PAGE_SHIFT) + +static void panfrost_drm_mm_color_adjust(const struct drm_mm_node *node, + unsigned long color, + u64 *start, u64 *end) +{ + /* Executable buffers can't start or end on a 4GB boundary */ + if (!(color & PANFROST_BO_NOEXEC)) { + u64 next_seg; + + if ((*start & PFN_4G_MASK) == 0) + (*start)++; + + if ((*end & PFN_4G_MASK) == 0) + (*end)--; + + next_seg = ALIGN(*start, PFN_4G); + if (next_seg - *start <= PFN_16M) + *start = next_seg + 1; + + *end = min(*end, ALIGN(*start, PFN_4G) - 1); + } +} + +struct panfrost_mmu *panfrost_mmu_ctx_create(struct panfrost_device *pfdev) +{ + struct panfrost_mmu *mmu; + + mmu = kzalloc(sizeof(*mmu), GFP_KERNEL); + if (!mmu) + return ERR_PTR(-ENOMEM); + + mmu->pfdev = pfdev; + spin_lock_init(&mmu->mm_lock); + + /* 4G enough for now. can be 48-bit */ + drm_mm_init(&mmu->mm, SZ_32M >> PAGE_SHIFT, (SZ_4G - SZ_32M) >> PAGE_SHIFT); + mmu->mm.color_adjust = panfrost_drm_mm_color_adjust; + + INIT_LIST_HEAD(&mmu->list); + mmu->as = -1; + + mmu->pgtbl_cfg = (struct io_pgtable_cfg) { + .pgsize_bitmap = SZ_4K | SZ_2M, + .ias = FIELD_GET(0xff, pfdev->features.mmu_features), + .oas = FIELD_GET(0xff00, pfdev->features.mmu_features), + .coherent_walk = pfdev->coherent, + .tlb = &mmu_tlb_ops, + .iommu_dev = pfdev->dev, + }; + + mmu->pgtbl_ops = alloc_io_pgtable_ops(ARM_MALI_LPAE, &mmu->pgtbl_cfg, + mmu); + if (!mmu->pgtbl_ops) { + kfree(mmu); + return ERR_PTR(-EINVAL); + } + + kref_init(&mmu->refcount); + + return mmu; +} + static const char *access_type_name(struct panfrost_device *pfdev, u32 fault_status) { diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.h b/drivers/gpu/drm/panfrost/panfrost_mmu.h index 44fc2edf63ce..cc2a0d307feb 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.h +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.h @@ -18,7 +18,8 @@ void panfrost_mmu_reset(struct panfrost_device *pfdev); u32 panfrost_mmu_as_get(struct panfrost_device *pfdev, struct panfrost_mmu *mmu); void panfrost_mmu_as_put(struct panfrost_device *pfdev, struct panfrost_mmu *mmu);

-int panfrost_mmu_pgtable_alloc(struct panfrost_file_priv *priv); -void panfrost_mmu_pgtable_free(struct panfrost_file_priv *priv); +struct panfrost_mmu *panfrost_mmu_ctx_get(struct panfrost_mmu *mmu); +void panfrost_mmu_ctx_put(struct panfrost_mmu *mmu); +struct panfrost_mmu *panfrost_mmu_ctx_create(struct panfrost_device *pfdev);

#endif

-- 2.31.1

Alyssa Rosenzweig

1:57 p.m.

New subject: [PATCH v2 01/12] drm/panfrost: Make sure MMU context lifetime is not bound to panfrost_priv

...

Jobs can be in-flight when the file descriptor is closed (either because the process did not terminate properly, or because it didn't wait for all GPU jobs to be finished), and apparently panfrost_job_close() does not cancel already running jobs. Let's refcount the MMU context object so it's lifetime is no longer bound to the FD lifetime and running jobs can finish properly without generating spurious page faults.

Remind me - why can't we hard stop in-flight jobs when the fd is closed? I've seen cases where kill -9'ing a badly behaved process doesn't end the fault storm, or unfreeze the desktop.

Steven Price

2:29 p.m.

New subject: [PATCH v2 01/12] drm/panfrost: Make sure MMU context lifetime is not bound to panfrost_priv

On 21/06/2021 14:57, Alyssa Rosenzweig wrote:

...

...
Jobs can be in-flight when the file descriptor is closed (either because the process did not terminate properly, or because it didn't wait for all GPU jobs to be finished), and apparently panfrost_job_close() does not cancel already running jobs. Let's refcount the MMU context object so it's lifetime is no longer bound to the FD lifetime and running jobs can finish properly without generating spurious page faults.

Remind me - why can't we hard stop in-flight jobs when the fd is closed? I've seen cases where kill -9'ing a badly behaved process doesn't end the fault storm, or unfreeze the desktop.

Hard-stopping the in-flight jobs would also make sense. But unless we want to actually hang the close() then there will be a period between issuing the hard-stop and actually having completed all jobs in the context.

But equally to be fair I've been cherry-picking this patch myself for quite some time, so we should just merge it and improve from there. So you can have my:

Reviewed-by: Steven Price steven.price@arm.com

Boris Brezillon

2:44 p.m.

New subject: [PATCH v2 01/12] drm/panfrost: Make sure MMU context lifetime is not bound to panfrost_priv

On Mon, 21 Jun 2021 15:29:55 +0100 Steven Price steven.price@arm.com wrote:

...

On 21/06/2021 14:57, Alyssa Rosenzweig wrote:

...
...
Jobs can be in-flight when the file descriptor is closed (either because the process did not terminate properly, or because it didn't wait for all GPU jobs to be finished), and apparently panfrost_job_close() does not cancel already running jobs. Let's refcount the MMU context object so it's lifetime is no longer bound to the FD lifetime and running jobs can finish properly without generating spurious page faults.

Remind me - why can't we hard stop in-flight jobs when the fd is closed? I've seen cases where kill -9'ing a badly behaved process doesn't end the fault storm, or unfreeze the desktop.

Hard-stopping the in-flight jobs would also make sense. But unless we want to actually hang the close() then there will be a period between issuing the hard-stop and actually having completed all jobs in the context.

Patch 10 is doing that, I just didn't want to backport all the dependencies, so I kept it split in 2 halves: one patch fixing the use-after-free bug, and the other part killing in-flight jobs.

...

But equally to be fair I've been cherry-picking this patch myself for quite some time, so we should just merge it and improve from there. So you can have my:

Reviewed-by: Steven Price steven.price@arm.com

Boris Brezillon

24 Jun 24 Jun

8:03 a.m.

New subject: [PATCH v2 01/12] drm/panfrost: Make sure MMU context lifetime is not bound to panfrost_priv

On Mon, 21 Jun 2021 15:38:56 +0200 Boris Brezillon boris.brezillon@collabora.com wrote:

...

Jobs can be in-flight when the file descriptor is closed (either because the process did not terminate properly, or because it didn't wait for all GPU jobs to be finished), and apparently panfrost_job_close() does not cancel already running jobs. Let's refcount the MMU context object so it's lifetime is no longer bound to the FD lifetime and running jobs can finish properly without generating spurious page faults.

Reported-by: Icecream95 ixn@keemail.me Fixes: 7282f7645d06 ("drm/panfrost: Implement per FD address spaces") Cc: stable@vger.kernel.org Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

Queued this patch to drm-misc-next. I'll respin the rest of this series.

Boris Brezillon

21 Jun 21 Jun

1:38 p.m.

New subject: [PATCH v2 02/12] drm/panfrost: Get rid of the unused JS_STATUS_EVENT_ACTIVE definition

Exception types will be defined as an enum in panfrost_drm.h so userspace and use the same definitions if needed.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_regs.h | 3 --- 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h index dc9df5457f1c..1940ff86e49a 100644 --- a/drivers/gpu/drm/panfrost/panfrost_regs.h +++ b/drivers/gpu/drm/panfrost/panfrost_regs.h @@ -262,9 +262,6 @@ #define JS_COMMAND_SOFT_STOP_1 0x06 /* Execute SOFT_STOP if JOB_CHAIN_FLAG is 1 */ #define JS_COMMAND_HARD_STOP_1 0x07 /* Execute HARD_STOP if JOB_CHAIN_FLAG is 1 */

-#define JS_STATUS_EVENT_ACTIVE 0x08 - - /* MMU regs */ #define MMU_INT_RAWSTAT 0x2000 #define MMU_INT_CLEAR 0x2004

-- 2.31.1

Steven Price

2:34 p.m.

New subject: [PATCH v2 02/12] drm/panfrost: Get rid of the unused JS_STATUS_EVENT_ACTIVE definition

On 21/06/2021 14:38, Boris Brezillon wrote:

...

Exception types will be defined as an enum in panfrost_drm.h so userspace and use the same definitions if needed.

s/and/can/ ?

While it is (currently) unused in the kernel, this is a hardware value so I'm not sure why it's worth removing this and not the other (currently) unused values here. This is the value returned from the JS_STATUS register when the slot is actively processing a job.

Steve

...

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

drivers/gpu/drm/panfrost/panfrost_regs.h | 3 --- 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h index dc9df5457f1c..1940ff86e49a 100644 --- a/drivers/gpu/drm/panfrost/panfrost_regs.h +++ b/drivers/gpu/drm/panfrost/panfrost_regs.h @@ -262,9 +262,6 @@ #define JS_COMMAND_SOFT_STOP_1 0x06 /* Execute SOFT_STOP if JOB_CHAIN_FLAG is 1 */ #define JS_COMMAND_HARD_STOP_1 0x07 /* Execute HARD_STOP if JOB_CHAIN_FLAG is 1 */

-#define JS_STATUS_EVENT_ACTIVE 0x08

/* MMU regs */ #define MMU_INT_RAWSTAT 0x2000 #define MMU_INT_CLEAR 0x2004

Boris Brezillon

2:49 p.m.

New subject: [PATCH v2 02/12] drm/panfrost: Get rid of the unused JS_STATUS_EVENT_ACTIVE definition

On Mon, 21 Jun 2021 15:34:35 +0100 Steven Price steven.price@arm.com wrote:

...

On 21/06/2021 14:38, Boris Brezillon wrote:

...
Exception types will be defined as an enum in panfrost_drm.h so userspace and use the same definitions if needed.

s/and/can/ ?

While it is (currently) unused in the kernel, this is a hardware value so I'm not sure why it's worth removing this and not the other (currently) unused values here. This is the value returned from the JS_STATUS register when the slot is actively processing a job.

Hm, what's the point of having the same value defined in 2 places (DRM_PANFROST_EXCEPTION_ACTIVE defined in patch 3 vs JS_STATUS_EVENT_ACTIVE here)? I mean, values defined in the drm_panfrost_exception_type enum apply to the JS_STATUS registers too, right?

...

Steve

...
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

drivers/gpu/drm/panfrost/panfrost_regs.h | 3 --- 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h index dc9df5457f1c..1940ff86e49a 100644 --- a/drivers/gpu/drm/panfrost/panfrost_regs.h +++ b/drivers/gpu/drm/panfrost/panfrost_regs.h @@ -262,9 +262,6 @@ #define JS_COMMAND_SOFT_STOP_1 0x06 /* Execute SOFT_STOP if JOB_CHAIN_FLAG is 1 */ #define JS_COMMAND_HARD_STOP_1 0x07 /* Execute HARD_STOP if JOB_CHAIN_FLAG is 1 */

-#define JS_STATUS_EVENT_ACTIVE 0x08

/* MMU regs */ #define MMU_INT_RAWSTAT 0x2000 #define MMU_INT_CLEAR 0x2004

Steven Price

2:54 p.m.

New subject: [PATCH v2 02/12] drm/panfrost: Get rid of the unused JS_STATUS_EVENT_ACTIVE definition

On 21/06/2021 15:49, Boris Brezillon wrote:

...

On Mon, 21 Jun 2021 15:34:35 +0100 Steven Price steven.price@arm.com wrote:

...
On 21/06/2021 14:38, Boris Brezillon wrote:

...
Exception types will be defined as an enum in panfrost_drm.h so userspace and use the same definitions if needed.

s/and/can/ ?

While it is (currently) unused in the kernel, this is a hardware value so I'm not sure why it's worth removing this and not the other (currently) unused values here. This is the value returned from the JS_STATUS register when the slot is actively processing a job.

Hm, what's the point of having the same value defined in 2 places (DRM_PANFROST_EXCEPTION_ACTIVE defined in patch 3 vs JS_STATUS_EVENT_ACTIVE here)? I mean, values defined in the drm_panfrost_exception_type enum apply to the JS_STATUS registers too, right?

Thinking about this more I guess I agree with you: this is an oddity and your following patch adds a (more) complete list. You've convinced me - with my nit above fixed:

Reviewed-by: Steven Price steven.price@arm.com

Boris Brezillon

1:38 p.m.

New subject: [PATCH v2 03/12] drm/panfrost: Drop the pfdev argument passed to panfrost_exception_name()

Currently unused. We'll add it back if we need per-GPU definitions.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.c | 2 +- drivers/gpu/drm/panfrost/panfrost_device.h | 2 +- drivers/gpu/drm/panfrost/panfrost_gpu.c | 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- drivers/gpu/drm/panfrost/panfrost_mmu.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index a2a09c51eed7..f7f5ca94f910 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -292,7 +292,7 @@ void panfrost_device_fini(struct panfrost_device *pfdev) panfrost_clk_fini(pfdev); }

-const char *panfrost_exception_name(struct panfrost_device *pfdev, u32 exception_code) +const char *panfrost_exception_name(u32 exception_code) { switch (exception_code) { /* Non-Fault Status code */ diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 8b2cdb8c701d..2fe1550da7f8 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -173,6 +173,6 @@ void panfrost_device_reset(struct panfrost_device *pfdev); int panfrost_device_resume(struct device *dev); int panfrost_device_suspend(struct device *dev);

-const char *panfrost_exception_name(struct panfrost_device *pfdev, u32 exception_code); +const char *panfrost_exception_name(u32 exception_code);

#endif diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c index 0e70e27fd8c3..26e4196b6c90 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gpu.c +++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c @@ -33,7 +33,7 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data) address |= gpu_read(pfdev, GPU_FAULT_ADDRESS_LO);

dev_warn(pfdev->dev, "GPU Fault 0x%08x (%s) at 0x%016llx\n", - fault_status & 0xFF, panfrost_exception_name(pfdev, fault_status), + fault_status & 0xFF, panfrost_exception_name(fault_status), address);

if (state & GPU_IRQ_MULTIPLE_FAULT) diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 3757c6eb3023..1be80b3dd5d0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -500,7 +500,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)

dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", j, - panfrost_exception_name(pfdev, job_read(pfdev, JS_STATUS(j))), + panfrost_exception_name(job_read(pfdev, JS_STATUS(j))), job_read(pfdev, JS_HEAD_LO(j)), job_read(pfdev, JS_TAIL_LO(j)));

diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 569509c2ba27..2a9bf30edc9d 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -675,7 +675,7 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) "TODO", fault_status, (fault_status & (1 << 10) ? "DECODER FAULT" : "SLAVE FAULT"), - exception_type, panfrost_exception_name(pfdev, exception_type), + exception_type, panfrost_exception_name(exception_type), access_type, access_type_name(pfdev, fault_status), source_id);

-- 2.31.1

Steven Price

2:36 p.m.

New subject: [PATCH v2 03/12] drm/panfrost: Drop the pfdev argument passed to panfrost_exception_name()

On 21/06/2021 14:38, Boris Brezillon wrote:

...

Currently unused. We'll add it back if we need per-GPU definitions.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

Reviewed-by: Steven Price steven.price@arm.com

Side note: we could use the definitions such as JS_STATUS_EVENT_ACTIVE in panfrost_exception_name() rather than magic numbers. Although I do find this function a handy mapping between codes and names to refer to! ;)

Steve

...

drivers/gpu/drm/panfrost/panfrost_device.c | 2 +- drivers/gpu/drm/panfrost/panfrost_device.h | 2 +- drivers/gpu/drm/panfrost/panfrost_gpu.c | 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- drivers/gpu/drm/panfrost/panfrost_mmu.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index a2a09c51eed7..f7f5ca94f910 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -292,7 +292,7 @@ void panfrost_device_fini(struct panfrost_device *pfdev) panfrost_clk_fini(pfdev); }

-const char *panfrost_exception_name(struct panfrost_device *pfdev, u32 exception_code) +const char *panfrost_exception_name(u32 exception_code) { switch (exception_code) { /* Non-Fault Status code */ diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 8b2cdb8c701d..2fe1550da7f8 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -173,6 +173,6 @@ void panfrost_device_reset(struct panfrost_device *pfdev); int panfrost_device_resume(struct device *dev); int panfrost_device_suspend(struct device *dev);

-const char *panfrost_exception_name(struct panfrost_device *pfdev, u32 exception_code); +const char *panfrost_exception_name(u32 exception_code);

#endif diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c index 0e70e27fd8c3..26e4196b6c90 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gpu.c +++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c @@ -33,7 +33,7 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data) address |= gpu_read(pfdev, GPU_FAULT_ADDRESS_LO);
dev_warn(pfdev->dev, "GPU Fault 0x%08x (%s) at 0x%016llx\n",
	 fault_status & 0xFF, panfrost_exception_name(pfdev, fault_status),
	 fault_status & 0xFF, panfrost_exception_name(fault_status),
 address);
if (state & GPU_IRQ_MULTIPLE_FAULT)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 3757c6eb3023..1be80b3dd5d0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -500,7 +500,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
	dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x",
		j,
		panfrost_exception_name(pfdev, job_read(pfdev, JS_STATUS(j))),
		panfrost_exception_name(job_read(pfdev, JS_STATUS(j))),
	job_read(pfdev, JS_HEAD_LO(j)),
	job_read(pfdev, JS_TAIL_LO(j)));
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 569509c2ba27..2a9bf30edc9d 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -675,7 +675,7 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) "TODO", fault_status, (fault_status & (1 << 10) ? "DECODER FAULT" : "SLAVE FAULT"),
		exception_type, panfrost_exception_name(pfdev, exception_type),
		exception_type, panfrost_exception_name(exception_type),
	access_type, access_type_name(pfdev, fault_status),
	source_id);

Boris Brezillon

1:38 p.m.

New subject: [PATCH v2 04/12] drm/panfrost: Expose exception types to userspace

Job headers contain an exception type field which might be read and converted to a human readable string by tracing tools. Let's expose the exception type as an enum so we share the same definition.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- include/uapi/drm/panfrost_drm.h | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+)

diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index 061e700dd06c..9a05d57d0118 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -224,6 +224,71 @@ struct drm_panfrost_madvise { __u32 retained; /* out, whether backing store still exists */ };

+/* The exception types */ + +enum drm_panfrost_exception_type { + DRM_PANFROST_EXCEPTION_OK = 0x00, + DRM_PANFROST_EXCEPTION_DONE = 0x01, + DRM_PANFROST_EXCEPTION_STOPPED = 0x03, + DRM_PANFROST_EXCEPTION_TERMINATED = 0x04, + DRM_PANFROST_EXCEPTION_KABOOM = 0x05, + DRM_PANFROST_EXCEPTION_EUREKA = 0x06, + DRM_PANFROST_EXCEPTION_ACTIVE = 0x08, + DRM_PANFROST_EXCEPTION_JOB_CONFIG_FAULT = 0x40, + DRM_PANFROST_EXCEPTION_JOB_POWER_FAULT = 0x41, + DRM_PANFROST_EXCEPTION_JOB_READ_FAULT = 0x42, + DRM_PANFROST_EXCEPTION_JOB_WRITE_FAULT = 0x43, + DRM_PANFROST_EXCEPTION_JOB_AFFINITY_FAULT = 0x44, + DRM_PANFROST_EXCEPTION_JOB_BUS_FAULT = 0x48, + DRM_PANFROST_EXCEPTION_INSTR_INVALID_PC = 0x50, + DRM_PANFROST_EXCEPTION_INSTR_INVALID_ENC = 0x51, + DRM_PANFROST_EXCEPTION_INSTR_BARRIER_FAULT = 0x55, + DRM_PANFROST_EXCEPTION_DATA_INVALID_FAULT = 0x58, + DRM_PANFROST_EXCEPTION_TILE_RANGE_FAULT = 0x59, + DRM_PANFROST_EXCEPTION_ADDR_RANGE_FAULT = 0x5a, + DRM_PANFROST_EXCEPTION_IMPRECISE_FAULT = 0x5b, + DRM_PANFROST_EXCEPTION_OOM = 0x60, + DRM_PANFROST_EXCEPTION_UNKNOWN = 0x7f, + DRM_PANFROST_EXCEPTION_DELAYED_BUS_FAULT = 0x80, + DRM_PANFROST_EXCEPTION_GPU_SHAREABILITY_FAULT = 0x88, + DRM_PANFROST_EXCEPTION_SYS_SHAREABILITY_FAULT = 0x89, + DRM_PANFROST_EXCEPTION_GPU_CACHEABILITY_FAULT = 0x8a, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_0 = 0xc0, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_1 = 0xc1, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_2 = 0xc2, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_3 = 0xc3, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_4 = 0xc4, + DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_IDENTITY = 0xc7, + DRM_PANFROST_EXCEPTION_PERM_FAULT_0 = 0xc8, + DRM_PANFROST_EXCEPTION_PERM_FAULT_1 = 0xc9, + DRM_PANFROST_EXCEPTION_PERM_FAULT_2 = 0xca, + DRM_PANFROST_EXCEPTION_PERM_FAULT_3 = 0xcb, + DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_0 = 0xd0, + DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_1 = 0xd1, + DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_2 = 0xd2, + DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_3 = 0xd3, + DRM_PANFROST_EXCEPTION_ACCESS_FLAG_0 = 0xd8, + DRM_PANFROST_EXCEPTION_ACCESS_FLAG_1 = 0xd9, + DRM_PANFROST_EXCEPTION_ACCESS_FLAG_2 = 0xda, + DRM_PANFROST_EXCEPTION_ACCESS_FLAG_3 = 0xdb, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN0 = 0xe0, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN1 = 0xe1, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN2 = 0xe2, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN3 = 0xe3, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT0 = 0xe4, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT1 = 0xe5, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT2 = 0xe6, + DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT3 = 0xe7, + DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_0 = 0xe8, + DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_1 = 0xe9, + DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_2 = 0xea, + DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_3 = 0xeb, + DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_0 = 0xec, + DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_1 = 0xed, + DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_2 = 0xee, + DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_3 = 0xef, +}; + #if defined(__cplusplus) } #endif

-- 2.31.1

Steven Price

2:49 p.m.

New subject: [PATCH v2 04/12] drm/panfrost: Expose exception types to userspace

On 21/06/2021 14:38, Boris Brezillon wrote:

...

Job headers contain an exception type field which might be read and converted to a human readable string by tracing tools. Let's expose the exception type as an enum so we share the same definition.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

include/uapi/drm/panfrost_drm.h | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+)

diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index 061e700dd06c..9a05d57d0118 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -224,6 +224,71 @@ struct drm_panfrost_madvise { __u32 retained; /* out, whether backing store still exists */ };

+/* The exception types */

+enum drm_panfrost_exception_type {

DRM_PANFROST_EXCEPTION_OK = 0x00,

DRM_PANFROST_EXCEPTION_DONE = 0x01,

Any reason to miss INTERRUPTED? Although I don't think you'll ever see it.

...

DRM_PANFROST_EXCEPTION_STOPPED = 0x03,

DRM_PANFROST_EXCEPTION_TERMINATED = 0x04,

DRM_PANFROST_EXCEPTION_KABOOM = 0x05,

DRM_PANFROST_EXCEPTION_EUREKA = 0x06,

Interestingly KABOOM/EUREKA are missing from panfrost_exception_name()

...

DRM_PANFROST_EXCEPTION_ACTIVE = 0x08,

DRM_PANFROST_EXCEPTION_JOB_CONFIG_FAULT = 0x40,

DRM_PANFROST_EXCEPTION_JOB_POWER_FAULT = 0x41,

DRM_PANFROST_EXCEPTION_JOB_READ_FAULT = 0x42,

DRM_PANFROST_EXCEPTION_JOB_WRITE_FAULT = 0x43,

DRM_PANFROST_EXCEPTION_JOB_AFFINITY_FAULT = 0x44,

DRM_PANFROST_EXCEPTION_JOB_BUS_FAULT = 0x48,

DRM_PANFROST_EXCEPTION_INSTR_INVALID_PC = 0x50,

DRM_PANFROST_EXCEPTION_INSTR_INVALID_ENC = 0x51,

0x52: INSTR_TYPE_MISMATCH 0x53: INSTR_OPERAND_FAULT 0x54: INSTR_TLS_FAULT

...

DRM_PANFROST_EXCEPTION_INSTR_BARRIER_FAULT = 0x55,

0x56: INSTR_ALIGN_FAULT

By the looks of it this is probably the Bifrost list and missing those codes which are Midgard only, whereas panfrost_exception_name() looks like it's missing some Bifrost status codes.

Given this is UAPI there is some argument for missing e.g. INTERRUPTED (I'm not sure it was ever actually implemented in hardware and the term INTERRUPTED might be reused in future), but it seems a bit wrong just to have Bifrost values here.

Steve

...

DRM_PANFROST_EXCEPTION_DATA_INVALID_FAULT = 0x58,

DRM_PANFROST_EXCEPTION_TILE_RANGE_FAULT = 0x59,

DRM_PANFROST_EXCEPTION_ADDR_RANGE_FAULT = 0x5a,

DRM_PANFROST_EXCEPTION_IMPRECISE_FAULT = 0x5b,

DRM_PANFROST_EXCEPTION_OOM = 0x60,

DRM_PANFROST_EXCEPTION_UNKNOWN = 0x7f,

DRM_PANFROST_EXCEPTION_DELAYED_BUS_FAULT = 0x80,

DRM_PANFROST_EXCEPTION_GPU_SHAREABILITY_FAULT = 0x88,

DRM_PANFROST_EXCEPTION_SYS_SHAREABILITY_FAULT = 0x89,

DRM_PANFROST_EXCEPTION_GPU_CACHEABILITY_FAULT = 0x8a,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_0 = 0xc0,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_1 = 0xc1,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_2 = 0xc2,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_3 = 0xc3,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_4 = 0xc4,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_IDENTITY = 0xc7,

DRM_PANFROST_EXCEPTION_PERM_FAULT_0 = 0xc8,

DRM_PANFROST_EXCEPTION_PERM_FAULT_1 = 0xc9,

DRM_PANFROST_EXCEPTION_PERM_FAULT_2 = 0xca,

DRM_PANFROST_EXCEPTION_PERM_FAULT_3 = 0xcb,

DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_0 = 0xd0,

DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_1 = 0xd1,

DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_2 = 0xd2,

DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_3 = 0xd3,

DRM_PANFROST_EXCEPTION_ACCESS_FLAG_0 = 0xd8,

DRM_PANFROST_EXCEPTION_ACCESS_FLAG_1 = 0xd9,

DRM_PANFROST_EXCEPTION_ACCESS_FLAG_2 = 0xda,

DRM_PANFROST_EXCEPTION_ACCESS_FLAG_3 = 0xdb,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN0 = 0xe0,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN1 = 0xe1,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN2 = 0xe2,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN3 = 0xe3,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT0 = 0xe4,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT1 = 0xe5,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT2 = 0xe6,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT3 = 0xe7,

DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_0 = 0xe8,

DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_1 = 0xe9,

DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_2 = 0xea,

DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_3 = 0xeb,

DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_0 = 0xec,

DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_1 = 0xed,

DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_2 = 0xee,

DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_3 = 0xef,

+};

#if defined(__cplusplus) } #endif

Boris Brezillon

2:55 p.m.

New subject: [PATCH v2 04/12] drm/panfrost: Expose exception types to userspace

On Mon, 21 Jun 2021 15:49:14 +0100 Steven Price steven.price@arm.com wrote:

...

On 21/06/2021 14:38, Boris Brezillon wrote:

...
Job headers contain an exception type field which might be read and converted to a human readable string by tracing tools. Let's expose the exception type as an enum so we share the same definition.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

include/uapi/drm/panfrost_drm.h | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+)

diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h index 061e700dd06c..9a05d57d0118 100644 --- a/include/uapi/drm/panfrost_drm.h +++ b/include/uapi/drm/panfrost_drm.h @@ -224,6 +224,71 @@ struct drm_panfrost_madvise { __u32 retained; /* out, whether backing store still exists */ };

+/* The exception types */

+enum drm_panfrost_exception_type {

DRM_PANFROST_EXCEPTION_OK = 0x00,

DRM_PANFROST_EXCEPTION_DONE = 0x01,

Any reason to miss INTERRUPTED? Although I don't think you'll ever see it.

Oops, that one is marked 'reserved' on Bifrost. I'll add it.

...

...

DRM_PANFROST_EXCEPTION_STOPPED = 0x03,

DRM_PANFROST_EXCEPTION_TERMINATED = 0x04,

DRM_PANFROST_EXCEPTION_KABOOM = 0x05,

DRM_PANFROST_EXCEPTION_EUREKA = 0x06,

Interestingly KABOOM/EUREKA are missing from panfrost_exception_name()

Addressed in patch 8.

...

...

DRM_PANFROST_EXCEPTION_ACTIVE = 0x08,

DRM_PANFROST_EXCEPTION_JOB_CONFIG_FAULT = 0x40,

DRM_PANFROST_EXCEPTION_JOB_POWER_FAULT = 0x41,

DRM_PANFROST_EXCEPTION_JOB_READ_FAULT = 0x42,

DRM_PANFROST_EXCEPTION_JOB_WRITE_FAULT = 0x43,

DRM_PANFROST_EXCEPTION_JOB_AFFINITY_FAULT = 0x44,

DRM_PANFROST_EXCEPTION_JOB_BUS_FAULT = 0x48,

DRM_PANFROST_EXCEPTION_INSTR_INVALID_PC = 0x50,

DRM_PANFROST_EXCEPTION_INSTR_INVALID_ENC = 0x51,

0x52: INSTR_TYPE_MISMATCH 0x53: INSTR_OPERAND_FAULT 0x54: INSTR_TLS_FAULT

...

DRM_PANFROST_EXCEPTION_INSTR_BARRIER_FAULT = 0x55,

0x56: INSTR_ALIGN_FAULT

By the looks of it this is probably the Bifrost list and missing those codes which are Midgard only, whereas panfrost_exception_name() looks like it's missing some Bifrost status codes.

Yep, I'll add the missing ones.

...

Given this is UAPI there is some argument for missing e.g. INTERRUPTED (I'm not sure it was ever actually implemented in hardware and the term INTERRUPTED might be reused in future), but it seems a bit wrong just to have Bifrost values here.

Definitely, I just didn't notice Midgard and Bifrost had different set of exceptions.

...

Steve

...

DRM_PANFROST_EXCEPTION_DATA_INVALID_FAULT = 0x58,

DRM_PANFROST_EXCEPTION_TILE_RANGE_FAULT = 0x59,

DRM_PANFROST_EXCEPTION_ADDR_RANGE_FAULT = 0x5a,

DRM_PANFROST_EXCEPTION_IMPRECISE_FAULT = 0x5b,

DRM_PANFROST_EXCEPTION_OOM = 0x60,

DRM_PANFROST_EXCEPTION_UNKNOWN = 0x7f,

DRM_PANFROST_EXCEPTION_DELAYED_BUS_FAULT = 0x80,

DRM_PANFROST_EXCEPTION_GPU_SHAREABILITY_FAULT = 0x88,

DRM_PANFROST_EXCEPTION_SYS_SHAREABILITY_FAULT = 0x89,

DRM_PANFROST_EXCEPTION_GPU_CACHEABILITY_FAULT = 0x8a,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_0 = 0xc0,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_1 = 0xc1,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_2 = 0xc2,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_3 = 0xc3,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_4 = 0xc4,

DRM_PANFROST_EXCEPTION_TRANSLATION_FAULT_IDENTITY = 0xc7,

DRM_PANFROST_EXCEPTION_PERM_FAULT_0 = 0xc8,

DRM_PANFROST_EXCEPTION_PERM_FAULT_1 = 0xc9,

DRM_PANFROST_EXCEPTION_PERM_FAULT_2 = 0xca,

DRM_PANFROST_EXCEPTION_PERM_FAULT_3 = 0xcb,

DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_0 = 0xd0,

DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_1 = 0xd1,

DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_2 = 0xd2,

DRM_PANFROST_EXCEPTION_TRANSTAB_BUS_FAULT_3 = 0xd3,

DRM_PANFROST_EXCEPTION_ACCESS_FLAG_0 = 0xd8,

DRM_PANFROST_EXCEPTION_ACCESS_FLAG_1 = 0xd9,

DRM_PANFROST_EXCEPTION_ACCESS_FLAG_2 = 0xda,

DRM_PANFROST_EXCEPTION_ACCESS_FLAG_3 = 0xdb,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN0 = 0xe0,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN1 = 0xe1,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN2 = 0xe2,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_IN3 = 0xe3,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT0 = 0xe4,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT1 = 0xe5,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT2 = 0xe6,

DRM_PANFROST_EXCEPTION_ADDR_SIZE_FAULT_OUT3 = 0xe7,

DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_0 = 0xe8,

DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_1 = 0xe9,

DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_2 = 0xea,

DRM_PANFROST_EXCEPTION_MEM_ATTR_FAULT_3 = 0xeb,

DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_0 = 0xec,

DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_1 = 0xed,

DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_2 = 0xee,

DRM_PANFROST_EXCEPTION_MEM_ATTR_NONCACHE_3 = 0xef,

+};

#if defined(__cplusplus) } #endif

Boris Brezillon

1:39 p.m.

New subject: [PATCH v2 05/12] drm/panfrost: Disable the AS on unhandled page faults

If we don't do that, we have to wait for the job timeout to expire before the fault jobs gets killed.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 2a9bf30edc9d..d5c624e776f1 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -661,7 +661,7 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) if ((status & mask) == BIT(as) && (exception_type & 0xF8) == 0xC0) ret = panfrost_mmu_map_fault_addr(pfdev, as, addr);

- if (ret) + if (ret) { /* terminal fault, print info about the fault */ dev_err(pfdev->dev, "Unhandled Page fault in AS%d at VA 0x%016llX\n" @@ -679,6 +679,10 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) access_type, access_type_name(pfdev, fault_status), source_id);

+ /* Disable the MMU to stop jobs on this AS immediately */ + panfrost_mmu_disable(pfdev, as); + } + status &= ~mask;

/* If we received new MMU interrupts, process them before returning. */

-- 2.31.1

Boris Brezillon

3:08 p.m.

New subject: [PATCH v2 05/12] drm/panfrost: Disable the AS on unhandled page faults

On Mon, 21 Jun 2021 15:39:00 +0200 Boris Brezillon boris.brezillon@collabora.com wrote:

...

If we don't do that, we have to wait for the job timeout to expire before the fault jobs gets killed.

^ faulty

...

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

drivers/gpu/drm/panfrost/panfrost_mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 2a9bf30edc9d..d5c624e776f1 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -661,7 +661,7 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) if ((status & mask) == BIT(as) && (exception_type & 0xF8) == 0xC0) ret = panfrost_mmu_map_fault_addr(pfdev, as, addr);
if (ret)
if (ret) {
/* terminal fault, print info about the fault */
dev_err(pfdev->dev,
	"Unhandled Page fault in AS%d at VA 0x%016llX\n"
@@ -679,6 +679,10 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) access_type, access_type_name(pfdev, fault_status), source_id);
	/* Disable the MMU to stop jobs on this AS immediately */
	panfrost_mmu_disable(pfdev, as);
}
status &= ~mask;

/* If we received new MMU interrupts, process them before returning. */

Steven Price

3:09 p.m.

New subject: [PATCH v2 05/12] drm/panfrost: Disable the AS on unhandled page faults

On 21/06/2021 14:39, Boris Brezillon wrote:

...

If we don't do that, we have to wait for the job timeout to expire before the fault jobs gets killed.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

Don't we need to do something here to allow recovery of the MMU context in the future? panfrost_mmu_disable() will zero out the MMU registers on the hardware, but AFAICS panfrost_mmu_enable() won't be called to restore the values until something evicts the address space (GPU power down/reset or just too many other processes).

The ideal would be to block submission of new jobs from this context and then wait until existing jobs have completed at which point the MMU state can be restored and jobs allowed again.

But at a minimum I think we should have something like an 'MMU poisoned' bit that panfrost_mmu_as_get() can check.

Steve

...

drivers/gpu/drm/panfrost/panfrost_mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 2a9bf30edc9d..d5c624e776f1 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -661,7 +661,7 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) if ((status & mask) == BIT(as) && (exception_type & 0xF8) == 0xC0) ret = panfrost_mmu_map_fault_addr(pfdev, as, addr);
if (ret)
if (ret) {
/* terminal fault, print info about the fault */
dev_err(pfdev->dev,
	"Unhandled Page fault in AS%d at VA 0x%016llX\n"
@@ -679,6 +679,10 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) access_type, access_type_name(pfdev, fault_status), source_id);
	/* Disable the MMU to stop jobs on this AS immediately */
	panfrost_mmu_disable(pfdev, as);
}
status &= ~mask;

/* If we received new MMU interrupts, process them before returning. */

Boris Brezillon

3:32 p.m.

New subject: [PATCH v2 05/12] drm/panfrost: Disable the AS on unhandled page faults

On Mon, 21 Jun 2021 16:09:32 +0100 Steven Price steven.price@arm.com wrote:

...

On 21/06/2021 14:39, Boris Brezillon wrote:

...
If we don't do that, we have to wait for the job timeout to expire before the fault jobs gets killed.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

Don't we need to do something here to allow recovery of the MMU context in the future? panfrost_mmu_disable() will zero out the MMU registers on the hardware, but AFAICS panfrost_mmu_enable() won't be called to restore the values until something evicts the address space (GPU power down/reset or just too many other processes).

The ideal would be to block submission of new jobs from this context and then wait until existing jobs have completed at which point the MMU state can be restored and jobs allowed again.

Uh, I assumed it'd be okay to have subsequent jobs coming from this context to fail with a BUS_FAULT until the context is closed. But what you suggest seems more robust.

...

But at a minimum I think we should have something like an 'MMU poisoned' bit that panfrost_mmu_as_get() can check.

Steve

...
drivers/gpu/drm/panfrost/panfrost_mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 2a9bf30edc9d..d5c624e776f1 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -661,7 +661,7 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) if ((status & mask) == BIT(as) && (exception_type & 0xF8) == 0xC0) ret = panfrost_mmu_map_fault_addr(pfdev, as, addr);
if (ret)
if (ret) {
/* terminal fault, print info about the fault */
dev_err(pfdev->dev,
	"Unhandled Page fault in AS%d at VA 0x%016llX\n"
@@ -679,6 +679,10 @@ static irqreturn_t panfrost_mmu_irq_handler_thread(int irq, void *data) access_type, access_type_name(pfdev, fault_status), source_id);
	/* Disable the MMU to stop jobs on this AS immediately */
	panfrost_mmu_disable(pfdev, as);
}
status &= ~mask;

/* If we received new MMU interrupts, process them before returning. */

Boris Brezillon

1:39 p.m.

New subject: [PATCH v2 06/12] drm/panfrost: Expose a helper to trigger a GPU reset

Expose a helper to trigger a GPU reset so we can easily trigger reset operations outside the job timeout handler.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.h | 8 ++++++++ drivers/gpu/drm/panfrost/panfrost_job.c | 4 +--- 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 2fe1550da7f8..1c6a3597eba0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -175,4 +175,12 @@ int panfrost_device_suspend(struct device *dev);

const char *panfrost_exception_name(u32 exception_code);

+static inline void +panfrost_device_schedule_reset(struct panfrost_device *pfdev) +{ + /* Schedule a reset if there's no reset in progress. */ + if (!atomic_xchg(&pfdev->reset.pending, 1)) + schedule_work(&pfdev->reset.work); +} + #endif diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 1be80b3dd5d0..be5d3e4a1d0a 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -458,9 +458,7 @@ static enum drm_gpu_sched_stat panfrost_job_timedout(struct drm_sched_job if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job)) return DRM_GPU_SCHED_STAT_NOMINAL;

- /* Schedule a reset if there's no reset in progress. */ - if (!atomic_xchg(&pfdev->reset.pending, 1)) - schedule_work(&pfdev->reset.work); + panfrost_device_schedule_reset(pfdev);

return DRM_GPU_SCHED_STAT_NOMINAL; }

-- 2.31.1

Steven Price

3:10 p.m.

New subject: [PATCH v2 06/12] drm/panfrost: Expose a helper to trigger a GPU reset

On 21/06/2021 14:39, Boris Brezillon wrote:

...

Expose a helper to trigger a GPU reset so we can easily trigger reset operations outside the job timeout handler.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

Reviewed-by: Steven Price steven.price@arm.com

...

drivers/gpu/drm/panfrost/panfrost_device.h | 8 ++++++++ drivers/gpu/drm/panfrost/panfrost_job.c | 4 +--- 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 2fe1550da7f8..1c6a3597eba0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -175,4 +175,12 @@ int panfrost_device_suspend(struct device *dev);

const char *panfrost_exception_name(u32 exception_code);

+static inline void +panfrost_device_schedule_reset(struct panfrost_device *pfdev) +{
/* Schedule a reset if there's no reset in progress. */

if (!atomic_xchg(&pfdev->reset.pending, 1))
schedule_work(&pfdev->reset.work);
+}

#endif diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 1be80b3dd5d0..be5d3e4a1d0a 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -458,9 +458,7 @@ static enum drm_gpu_sched_stat panfrost_job_timedout(struct drm_sched_job if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job)) return DRM_GPU_SCHED_STAT_NOMINAL;
/* Schedule a reset if there's no reset in progress. */

if (!atomic_xchg(&pfdev->reset.pending, 1))
schedule_work(&pfdev->reset.work);
panfrost_device_schedule_reset(pfdev);

return DRM_GPU_SCHED_STAT_NOMINAL;

}

Boris Brezillon

1:39 p.m.

New subject: [PATCH v2 07/12] drm/panfrost: Reset the GPU when the AS_ACTIVE bit is stuck

Things are unlikely to resolve until we reset the GPU. Let's not wait for other faults/timeout to happen to trigger this reset.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index d5c624e776f1..d20bcaecb78f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -36,8 +36,11 @@ static int wait_ready(struct panfrost_device *pfdev, u32 as_nr) ret = readl_relaxed_poll_timeout_atomic(pfdev->iomem + AS_STATUS(as_nr), val, !(val & AS_STATUS_AS_ACTIVE), 10, 1000);

- if (ret) + if (ret) { + /* The GPU hung, let's trigger a reset */ + panfrost_device_schedule_reset(pfdev); dev_err(pfdev->dev, "AS_ACTIVE bit stuck\n"); + }

return ret; }

-- 2.31.1

Steven Price

3:11 p.m.

New subject: [PATCH v2 07/12] drm/panfrost: Reset the GPU when the AS_ACTIVE bit is stuck

On 21/06/2021 14:39, Boris Brezillon wrote:

...

Things are unlikely to resolve until we reset the GPU. Let's not wait for other faults/timeout to happen to trigger this reset.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

This one still haunts me... ;)

Reviewed-by: Steven Price steven.price@arm.com

...

drivers/gpu/drm/panfrost/panfrost_mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index d5c624e776f1..d20bcaecb78f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -36,8 +36,11 @@ static int wait_ready(struct panfrost_device *pfdev, u32 as_nr) ret = readl_relaxed_poll_timeout_atomic(pfdev->iomem + AS_STATUS(as_nr), val, !(val & AS_STATUS_AS_ACTIVE), 10, 1000);

if (ret)
if (ret) {
/* The GPU hung, let's trigger a reset */
panfrost_device_schedule_reset(pfdev);
dev_err(pfdev->dev, "AS_ACTIVE bit stuck\n");
}

return ret;
}

Boris Brezillon

1:39 p.m.

New subject: [PATCH v2 08/12] drm/panfrost: Do the exception -> string translation using a table

Do the exception -> string translation using a table so we can add extra fields if we need to. While at it add an error field to ease the exception -> error conversion which we'll need if we want to set the fence error to something that reflects the exception code.

TODO: fix the error codes.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.c | 134 +++++++++++++-------- drivers/gpu/drm/panfrost/panfrost_device.h | 1 + 2 files changed, 88 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index f7f5ca94f910..2de011cee258 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -292,55 +292,95 @@ void panfrost_device_fini(struct panfrost_device *pfdev) panfrost_clk_fini(pfdev); }

-const char *panfrost_exception_name(u32 exception_code) -{ - switch (exception_code) { - /* Non-Fault Status code */ - case 0x00: return "NOT_STARTED/IDLE/OK"; - case 0x01: return "DONE"; - case 0x02: return "INTERRUPTED"; - case 0x03: return "STOPPED"; - case 0x04: return "TERMINATED"; - case 0x08: return "ACTIVE"; - /* Job exceptions */ - case 0x40: return "JOB_CONFIG_FAULT"; - case 0x41: return "JOB_POWER_FAULT"; - case 0x42: return "JOB_READ_FAULT"; - case 0x43: return "JOB_WRITE_FAULT"; - case 0x44: return "JOB_AFFINITY_FAULT"; - case 0x48: return "JOB_BUS_FAULT"; - case 0x50: return "INSTR_INVALID_PC"; - case 0x51: return "INSTR_INVALID_ENC"; - case 0x52: return "INSTR_TYPE_MISMATCH"; - case 0x53: return "INSTR_OPERAND_FAULT"; - case 0x54: return "INSTR_TLS_FAULT"; - case 0x55: return "INSTR_BARRIER_FAULT"; - case 0x56: return "INSTR_ALIGN_FAULT"; - case 0x58: return "DATA_INVALID_FAULT"; - case 0x59: return "TILE_RANGE_FAULT"; - case 0x5A: return "ADDR_RANGE_FAULT"; - case 0x60: return "OUT_OF_MEMORY"; - /* GPU exceptions */ - case 0x80: return "DELAYED_BUS_FAULT"; - case 0x88: return "SHAREABILITY_FAULT"; - /* MMU exceptions */ - case 0xC1: return "TRANSLATION_FAULT_LEVEL1"; - case 0xC2: return "TRANSLATION_FAULT_LEVEL2"; - case 0xC3: return "TRANSLATION_FAULT_LEVEL3"; - case 0xC4: return "TRANSLATION_FAULT_LEVEL4"; - case 0xC8: return "PERMISSION_FAULT"; - case 0xC9 ... 0xCF: return "PERMISSION_FAULT"; - case 0xD1: return "TRANSTAB_BUS_FAULT_LEVEL1"; - case 0xD2: return "TRANSTAB_BUS_FAULT_LEVEL2"; - case 0xD3: return "TRANSTAB_BUS_FAULT_LEVEL3"; - case 0xD4: return "TRANSTAB_BUS_FAULT_LEVEL4"; - case 0xD8: return "ACCESS_FLAG"; - case 0xD9 ... 0xDF: return "ACCESS_FLAG"; - case 0xE0 ... 0xE7: return "ADDRESS_SIZE_FAULT"; - case 0xE8 ... 0xEF: return "MEMORY_ATTRIBUTES_FAULT"; +#define PANFROST_EXCEPTION(id, err) \ + [DRM_PANFROST_EXCEPTION_ ## id] = { \ + .name = #id, \ + .error = err, \ }

- return "UNKNOWN"; +struct panfrost_exception_info { + const char *name; + int error; +}; + +static const struct panfrost_exception_info panfrost_exception_infos[] = { + PANFROST_EXCEPTION(OK, 0), + PANFROST_EXCEPTION(DONE, 0), + PANFROST_EXCEPTION(STOPPED, 0), + PANFROST_EXCEPTION(TERMINATED, 0), + PANFROST_EXCEPTION(KABOOM, 0), + PANFROST_EXCEPTION(EUREKA, 0), + PANFROST_EXCEPTION(ACTIVE, 0), + PANFROST_EXCEPTION(JOB_CONFIG_FAULT, -EINVAL), + PANFROST_EXCEPTION(JOB_POWER_FAULT, -ECANCELED), + PANFROST_EXCEPTION(JOB_READ_FAULT, -EINVAL), + PANFROST_EXCEPTION(JOB_WRITE_FAULT, -EINVAL), + PANFROST_EXCEPTION(JOB_AFFINITY_FAULT, -EINVAL), + PANFROST_EXCEPTION(JOB_BUS_FAULT, -EINVAL), + PANFROST_EXCEPTION(INSTR_INVALID_PC, -EINVAL), + PANFROST_EXCEPTION(INSTR_INVALID_ENC, -EINVAL), + PANFROST_EXCEPTION(INSTR_BARRIER_FAULT, -EINVAL), + PANFROST_EXCEPTION(DATA_INVALID_FAULT, -EINVAL), + PANFROST_EXCEPTION(TILE_RANGE_FAULT, -EINVAL), + PANFROST_EXCEPTION(ADDR_RANGE_FAULT, -EINVAL), + PANFROST_EXCEPTION(IMPRECISE_FAULT, -EINVAL), + PANFROST_EXCEPTION(OOM, -ENOMEM), + PANFROST_EXCEPTION(UNKNOWN, -EINVAL), + PANFROST_EXCEPTION(DELAYED_BUS_FAULT, -EINVAL), + PANFROST_EXCEPTION(GPU_SHAREABILITY_FAULT, -ECANCELED), + PANFROST_EXCEPTION(SYS_SHAREABILITY_FAULT, -ECANCELED), + PANFROST_EXCEPTION(GPU_CACHEABILITY_FAULT, -ECANCELED), + PANFROST_EXCEPTION(TRANSLATION_FAULT_0, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_1, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_2, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_3, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_4, -EINVAL), + PANFROST_EXCEPTION(TRANSLATION_FAULT_IDENTITY, -EINVAL), + PANFROST_EXCEPTION(PERM_FAULT_0, -EINVAL), + PANFROST_EXCEPTION(PERM_FAULT_1, -EINVAL), + PANFROST_EXCEPTION(PERM_FAULT_2, -EINVAL), + PANFROST_EXCEPTION(PERM_FAULT_3, -EINVAL), + PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_0, -EINVAL), + PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_1, -EINVAL), + PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_2, -EINVAL), + PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_3, -EINVAL), + PANFROST_EXCEPTION(ACCESS_FLAG_0, -EINVAL), + PANFROST_EXCEPTION(ACCESS_FLAG_1, -EINVAL), + PANFROST_EXCEPTION(ACCESS_FLAG_2, -EINVAL), + PANFROST_EXCEPTION(ACCESS_FLAG_3, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN0, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN1, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN2, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN3, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT0, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT1, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT2, -EINVAL), + PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT3, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_FAULT_0, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_FAULT_1, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_FAULT_2, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_FAULT_3, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_0, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_1, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_2, -EINVAL), + PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_3, -EINVAL), +}; + +const char *panfrost_exception_name(u32 exception_code) +{ + if (WARN_ON(exception_code >= ARRAY_SIZE(panfrost_exception_infos) || + !panfrost_exception_infos[exception_code].name)) + return "UNKNOWN"; + + return panfrost_exception_infos[exception_code].name; +} + +int panfrost_exception_to_error(u32 exception_code) +{ + if (WARN_ON(exception_code >= ARRAY_SIZE(panfrost_exception_infos))) + return 0; + + return panfrost_exception_infos[exception_code].error; }

void panfrost_device_reset(struct panfrost_device *pfdev) diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 1c6a3597eba0..498c7b5dccd0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -174,6 +174,7 @@ int panfrost_device_resume(struct device *dev); int panfrost_device_suspend(struct device *dev);

const char *panfrost_exception_name(u32 exception_code); +int panfrost_exception_to_error(u32 exception_code);

static inline void panfrost_device_schedule_reset(struct panfrost_device *pfdev)

-- 2.31.1

Steven Price

3:19 p.m.

New subject: [PATCH v2 08/12] drm/panfrost: Do the exception -> string translation using a table

On 21/06/2021 14:39, Boris Brezillon wrote:

...

Do the exception -> string translation using a table so we can add extra fields if we need to. While at it add an error field to ease the exception -> error conversion which we'll need if we want to set the fence error to something that reflects the exception code.

TODO: fix the error codes.

TODO: Do the TODO ;)

I'm not sure how useful translating the hardware error codes to Linux ones are. E.g. 'OOM' means something quite different from a normal -ENOMEM. One is running out of a space in a predefined buffer, the other is Linux not able to allocate memory.

...

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

drivers/gpu/drm/panfrost/panfrost_device.c | 134 +++++++++++++-------- drivers/gpu/drm/panfrost/panfrost_device.h | 1 + 2 files changed, 88 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index f7f5ca94f910..2de011cee258 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -292,55 +292,95 @@ void panfrost_device_fini(struct panfrost_device *pfdev) panfrost_clk_fini(pfdev); }

-const char *panfrost_exception_name(u32 exception_code) -{
switch (exception_code) {
/* Non-Fault Status code */
case 0x00: return "NOT_STARTED/IDLE/OK";

case 0x01: return "DONE";

case 0x02: return "INTERRUPTED";

case 0x03: return "STOPPED";

case 0x04: return "TERMINATED";

case 0x08: return "ACTIVE";
/* Job exceptions */
case 0x40: return "JOB_CONFIG_FAULT";

case 0x41: return "JOB_POWER_FAULT";

case 0x42: return "JOB_READ_FAULT";

case 0x43: return "JOB_WRITE_FAULT";

case 0x44: return "JOB_AFFINITY_FAULT";

case 0x48: return "JOB_BUS_FAULT";

case 0x50: return "INSTR_INVALID_PC";

case 0x51: return "INSTR_INVALID_ENC";

case 0x52: return "INSTR_TYPE_MISMATCH";

case 0x53: return "INSTR_OPERAND_FAULT";

case 0x54: return "INSTR_TLS_FAULT";

case 0x55: return "INSTR_BARRIER_FAULT";

case 0x56: return "INSTR_ALIGN_FAULT";

case 0x58: return "DATA_INVALID_FAULT";

case 0x59: return "TILE_RANGE_FAULT";

case 0x5A: return "ADDR_RANGE_FAULT";

case 0x60: return "OUT_OF_MEMORY";
/* GPU exceptions */
case 0x80: return "DELAYED_BUS_FAULT";

case 0x88: return "SHAREABILITY_FAULT";
/* MMU exceptions */
case 0xC1: return "TRANSLATION_FAULT_LEVEL1";

case 0xC2: return "TRANSLATION_FAULT_LEVEL2";

case 0xC3: return "TRANSLATION_FAULT_LEVEL3";

case 0xC4: return "TRANSLATION_FAULT_LEVEL4";

case 0xC8: return "PERMISSION_FAULT";

case 0xC9 ... 0xCF: return "PERMISSION_FAULT";

case 0xD1: return "TRANSTAB_BUS_FAULT_LEVEL1";

case 0xD2: return "TRANSTAB_BUS_FAULT_LEVEL2";

case 0xD3: return "TRANSTAB_BUS_FAULT_LEVEL3";

case 0xD4: return "TRANSTAB_BUS_FAULT_LEVEL4";

case 0xD8: return "ACCESS_FLAG";

case 0xD9 ... 0xDF: return "ACCESS_FLAG";

case 0xE0 ... 0xE7: return "ADDRESS_SIZE_FAULT";

case 0xE8 ... 0xEF: return "MEMORY_ATTRIBUTES_FAULT";
+#define PANFROST_EXCEPTION(id, err) \
[DRM_PANFROST_EXCEPTION_ ## id] = { \
.name = #id, \
.error = err, \
}
return "UNKNOWN";

+struct panfrost_exception_info {

const char *name;

int error;

+};

+static const struct panfrost_exception_info panfrost_exception_infos[] = {

PANFROST_EXCEPTION(OK, 0),

PANFROST_EXCEPTION(DONE, 0),

PANFROST_EXCEPTION(STOPPED, 0),

PANFROST_EXCEPTION(TERMINATED, 0),

STOPPED/TERMINATED are not really 'success' from an application perspective. But equally they are ones that need special handling from the kernel.

...

PANFROST_EXCEPTION(KABOOM, 0),

PANFROST_EXCEPTION(EUREKA, 0),

PANFROST_EXCEPTION(ACTIVE, 0),

PANFROST_EXCEPTION(JOB_CONFIG_FAULT, -EINVAL),

PANFROST_EXCEPTION(JOB_POWER_FAULT, -ECANCELED),

PANFROST_EXCEPTION(JOB_READ_FAULT, -EINVAL),

PANFROST_EXCEPTION(JOB_WRITE_FAULT, -EINVAL),

PANFROST_EXCEPTION(JOB_AFFINITY_FAULT, -EINVAL),

PANFROST_EXCEPTION(JOB_BUS_FAULT, -EINVAL),

PANFROST_EXCEPTION(INSTR_INVALID_PC, -EINVAL),

PANFROST_EXCEPTION(INSTR_INVALID_ENC, -EINVAL),

PANFROST_EXCEPTION(INSTR_BARRIER_FAULT, -EINVAL),

PANFROST_EXCEPTION(DATA_INVALID_FAULT, -EINVAL),

PANFROST_EXCEPTION(TILE_RANGE_FAULT, -EINVAL),

PANFROST_EXCEPTION(ADDR_RANGE_FAULT, -EINVAL),

PANFROST_EXCEPTION(IMPRECISE_FAULT, -EINVAL),

PANFROST_EXCEPTION(OOM, -ENOMEM),

PANFROST_EXCEPTION(UNKNOWN, -EINVAL),

We should probably make a distinction between this 'special' UNKNOWN that the hardware can report...

...

PANFROST_EXCEPTION(DELAYED_BUS_FAULT, -EINVAL),

PANFROST_EXCEPTION(GPU_SHAREABILITY_FAULT, -ECANCELED),

PANFROST_EXCEPTION(SYS_SHAREABILITY_FAULT, -ECANCELED),

PANFROST_EXCEPTION(GPU_CACHEABILITY_FAULT, -ECANCELED),

PANFROST_EXCEPTION(TRANSLATION_FAULT_0, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_1, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_2, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_3, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_4, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_IDENTITY, -EINVAL),

PANFROST_EXCEPTION(PERM_FAULT_0, -EINVAL),

PANFROST_EXCEPTION(PERM_FAULT_1, -EINVAL),

PANFROST_EXCEPTION(PERM_FAULT_2, -EINVAL),

PANFROST_EXCEPTION(PERM_FAULT_3, -EINVAL),

PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_0, -EINVAL),

PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_1, -EINVAL),

PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_2, -EINVAL),

PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_3, -EINVAL),

PANFROST_EXCEPTION(ACCESS_FLAG_0, -EINVAL),

PANFROST_EXCEPTION(ACCESS_FLAG_1, -EINVAL),

PANFROST_EXCEPTION(ACCESS_FLAG_2, -EINVAL),

PANFROST_EXCEPTION(ACCESS_FLAG_3, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN0, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN1, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN2, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN3, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT0, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT1, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT2, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT3, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_FAULT_0, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_FAULT_1, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_FAULT_2, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_FAULT_3, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_0, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_1, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_2, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_3, -EINVAL),

+};

+const char *panfrost_exception_name(u32 exception_code) +{
if (WARN_ON(exception_code >= ARRAY_SIZE(panfrost_exception_infos) ||
    !panfrost_exception_infos[exception_code].name))
return "UNKNOWN";

...and this UNKNOWN that just means we don't have a clue what the magic number is.

Steve

...

return panfrost_exception_infos[exception_code].name;

+}

+int panfrost_exception_to_error(u32 exception_code) +{
if (WARN_ON(exception_code >= ARRAY_SIZE(panfrost_exception_infos)))
return 0;
return panfrost_exception_infos[exception_code].error;
}

void panfrost_device_reset(struct panfrost_device *pfdev) diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 1c6a3597eba0..498c7b5dccd0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -174,6 +174,7 @@ int panfrost_device_resume(struct device *dev); int panfrost_device_suspend(struct device *dev);

const char *panfrost_exception_name(u32 exception_code); +int panfrost_exception_to_error(u32 exception_code);

static inline void panfrost_device_schedule_reset(struct panfrost_device *pfdev)

Boris Brezillon

3:46 p.m.

New subject: [PATCH v2 08/12] drm/panfrost: Do the exception -> string translation using a table

On Mon, 21 Jun 2021 16:19:38 +0100 Steven Price steven.price@arm.com wrote:

...

On 21/06/2021 14:39, Boris Brezillon wrote:

...
Do the exception -> string translation using a table so we can add extra fields if we need to. While at it add an error field to ease the exception -> error conversion which we'll need if we want to set the fence error to something that reflects the exception code.

TODO: fix the error codes.

TODO: Do the TODO ;)

Yeah, I was kinda expecting help with that :-).

...

I'm not sure how useful translating the hardware error codes to Linux ones are. E.g. 'OOM' means something quite different from a normal -ENOMEM. One is running out of a space in a predefined buffer, the other is Linux not able to allocate memory.

Okay, then I can just unconditionally set the fence error to -EINVAL and drop this error field.

...

...
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

drivers/gpu/drm/panfrost/panfrost_device.c | 134 +++++++++++++-------- drivers/gpu/drm/panfrost/panfrost_device.h | 1 + 2 files changed, 88 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index f7f5ca94f910..2de011cee258 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -292,55 +292,95 @@ void panfrost_device_fini(struct panfrost_device *pfdev) panfrost_clk_fini(pfdev); }

-const char *panfrost_exception_name(u32 exception_code) -{
switch (exception_code) {
/* Non-Fault Status code */
case 0x00: return "NOT_STARTED/IDLE/OK";

case 0x01: return "DONE";

case 0x02: return "INTERRUPTED";

case 0x03: return "STOPPED";

case 0x04: return "TERMINATED";

case 0x08: return "ACTIVE";
/* Job exceptions */
case 0x40: return "JOB_CONFIG_FAULT";

case 0x41: return "JOB_POWER_FAULT";

case 0x42: return "JOB_READ_FAULT";

case 0x43: return "JOB_WRITE_FAULT";

case 0x44: return "JOB_AFFINITY_FAULT";

case 0x48: return "JOB_BUS_FAULT";

case 0x50: return "INSTR_INVALID_PC";

case 0x51: return "INSTR_INVALID_ENC";

case 0x52: return "INSTR_TYPE_MISMATCH";

case 0x53: return "INSTR_OPERAND_FAULT";

case 0x54: return "INSTR_TLS_FAULT";

case 0x55: return "INSTR_BARRIER_FAULT";

case 0x56: return "INSTR_ALIGN_FAULT";

case 0x58: return "DATA_INVALID_FAULT";

case 0x59: return "TILE_RANGE_FAULT";

case 0x5A: return "ADDR_RANGE_FAULT";

case 0x60: return "OUT_OF_MEMORY";
/* GPU exceptions */
case 0x80: return "DELAYED_BUS_FAULT";

case 0x88: return "SHAREABILITY_FAULT";
/* MMU exceptions */
case 0xC1: return "TRANSLATION_FAULT_LEVEL1";

case 0xC2: return "TRANSLATION_FAULT_LEVEL2";

case 0xC3: return "TRANSLATION_FAULT_LEVEL3";

case 0xC4: return "TRANSLATION_FAULT_LEVEL4";

case 0xC8: return "PERMISSION_FAULT";

case 0xC9 ... 0xCF: return "PERMISSION_FAULT";

case 0xD1: return "TRANSTAB_BUS_FAULT_LEVEL1";

case 0xD2: return "TRANSTAB_BUS_FAULT_LEVEL2";

case 0xD3: return "TRANSTAB_BUS_FAULT_LEVEL3";

case 0xD4: return "TRANSTAB_BUS_FAULT_LEVEL4";

case 0xD8: return "ACCESS_FLAG";

case 0xD9 ... 0xDF: return "ACCESS_FLAG";

case 0xE0 ... 0xE7: return "ADDRESS_SIZE_FAULT";

case 0xE8 ... 0xEF: return "MEMORY_ATTRIBUTES_FAULT";
+#define PANFROST_EXCEPTION(id, err) \
[DRM_PANFROST_EXCEPTION_ ## id] = { \
.name = #id, \
.error = err, \
}
return "UNKNOWN";

+struct panfrost_exception_info {

const char *name;

int error;

+};

+static const struct panfrost_exception_info panfrost_exception_infos[] = {

PANFROST_EXCEPTION(OK, 0),

PANFROST_EXCEPTION(DONE, 0),

PANFROST_EXCEPTION(STOPPED, 0),

PANFROST_EXCEPTION(TERMINATED, 0),
STOPPED/TERMINATED are not really 'success' from an application perspective. But equally they are ones that need special handling from the kernel.

STOPPED is a temporary state (at least it is right now), so the error code doesn't matter much (the job is expected to be resumed before the job fence is signaled and the final error assigned). TERMINATED should probably have a valid error code reflecting the fact that the job didn't finish properly so that any waiter knows the result of the rendering is invalid.

...

...

PANFROST_EXCEPTION(KABOOM, 0),

PANFROST_EXCEPTION(EUREKA, 0),

PANFROST_EXCEPTION(ACTIVE, 0),

PANFROST_EXCEPTION(JOB_CONFIG_FAULT, -EINVAL),

PANFROST_EXCEPTION(JOB_POWER_FAULT, -ECANCELED),

PANFROST_EXCEPTION(JOB_READ_FAULT, -EINVAL),

PANFROST_EXCEPTION(JOB_WRITE_FAULT, -EINVAL),

PANFROST_EXCEPTION(JOB_AFFINITY_FAULT, -EINVAL),

PANFROST_EXCEPTION(JOB_BUS_FAULT, -EINVAL),

PANFROST_EXCEPTION(INSTR_INVALID_PC, -EINVAL),

PANFROST_EXCEPTION(INSTR_INVALID_ENC, -EINVAL),

PANFROST_EXCEPTION(INSTR_BARRIER_FAULT, -EINVAL),

PANFROST_EXCEPTION(DATA_INVALID_FAULT, -EINVAL),

PANFROST_EXCEPTION(TILE_RANGE_FAULT, -EINVAL),

PANFROST_EXCEPTION(ADDR_RANGE_FAULT, -EINVAL),

PANFROST_EXCEPTION(IMPRECISE_FAULT, -EINVAL),

PANFROST_EXCEPTION(OOM, -ENOMEM),

PANFROST_EXCEPTION(UNKNOWN, -EINVAL),

We should probably make a distinction between this 'special' UNKNOWN that the hardware can report...

...
PANFROST_EXCEPTION(DELAYED_BUS_FAULT, -EINVAL),

PANFROST_EXCEPTION(GPU_SHAREABILITY_FAULT, -ECANCELED),

PANFROST_EXCEPTION(SYS_SHAREABILITY_FAULT, -ECANCELED),

PANFROST_EXCEPTION(GPU_CACHEABILITY_FAULT, -ECANCELED),

PANFROST_EXCEPTION(TRANSLATION_FAULT_0, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_1, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_2, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_3, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_4, -EINVAL),

PANFROST_EXCEPTION(TRANSLATION_FAULT_IDENTITY, -EINVAL),

PANFROST_EXCEPTION(PERM_FAULT_0, -EINVAL),

PANFROST_EXCEPTION(PERM_FAULT_1, -EINVAL),

PANFROST_EXCEPTION(PERM_FAULT_2, -EINVAL),

PANFROST_EXCEPTION(PERM_FAULT_3, -EINVAL),

PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_0, -EINVAL),

PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_1, -EINVAL),

PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_2, -EINVAL),

PANFROST_EXCEPTION(TRANSTAB_BUS_FAULT_3, -EINVAL),

PANFROST_EXCEPTION(ACCESS_FLAG_0, -EINVAL),

PANFROST_EXCEPTION(ACCESS_FLAG_1, -EINVAL),

PANFROST_EXCEPTION(ACCESS_FLAG_2, -EINVAL),

PANFROST_EXCEPTION(ACCESS_FLAG_3, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN0, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN1, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN2, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_IN3, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT0, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT1, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT2, -EINVAL),

PANFROST_EXCEPTION(ADDR_SIZE_FAULT_OUT3, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_FAULT_0, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_FAULT_1, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_FAULT_2, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_FAULT_3, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_0, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_1, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_2, -EINVAL),

PANFROST_EXCEPTION(MEM_ATTR_NONCACHE_3, -EINVAL),

+};

+const char *panfrost_exception_name(u32 exception_code) +{
if (WARN_ON(exception_code >= ARRAY_SIZE(panfrost_exception_infos) ||
    !panfrost_exception_infos[exception_code].name))
return "UNKNOWN";  
...and this UNKNOWN that just means we don't have a clue what the magic number is.

Makes sense. How about "Unknown exception type"?

...

Steve

...
return panfrost_exception_infos[exception_code].name;

+}

+int panfrost_exception_to_error(u32 exception_code) +{
if (WARN_ON(exception_code >= ARRAY_SIZE(panfrost_exception_infos)))
return 0;
return panfrost_exception_infos[exception_code].error;
}

void panfrost_device_reset(struct panfrost_device *pfdev) diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 1c6a3597eba0..498c7b5dccd0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -174,6 +174,7 @@ int panfrost_device_resume(struct device *dev); int panfrost_device_suspend(struct device *dev);

const char *panfrost_exception_name(u32 exception_code); +int panfrost_exception_to_error(u32 exception_code);

static inline void panfrost_device_schedule_reset(struct panfrost_device *pfdev)

Boris Brezillon

1:39 p.m.

New subject: [PATCH v2 09/12] drm/panfrost: Don't reset the GPU on job faults unless we really have to

If we can recover from a fault without a reset there's no reason to issue one.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_device.c | 9 ++++++ drivers/gpu/drm/panfrost/panfrost_device.h | 2 ++ drivers/gpu/drm/panfrost/panfrost_job.c | 35 ++++++++++++++-------- 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index 2de011cee258..ac76e8646e97 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -383,6 +383,15 @@ int panfrost_exception_to_error(u32 exception_code) return panfrost_exception_infos[exception_code].error; }

+bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev, + u32 exception_code) +{ + /* Right now, none of the GPU we support need a reset, but this + * might change (e.g. Valhall GPUs require a when a BUS_FAULT occurs). + */ + return false; +} + void panfrost_device_reset(struct panfrost_device *pfdev) { panfrost_gpu_soft_reset(pfdev); diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 498c7b5dccd0..95e6044008d2 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -175,6 +175,8 @@ int panfrost_device_suspend(struct device *dev);

const char *panfrost_exception_name(u32 exception_code); int panfrost_exception_to_error(u32 exception_code); +bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev, + u32 exception_code);

static inline void panfrost_device_schedule_reset(struct panfrost_device *pfdev) diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index be5d3e4a1d0a..aedc604d331c 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -493,27 +493,38 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)

if (status & JOB_INT_MASK_ERR(j)) { enum panfrost_queue_status old_status; + u32 js_status = job_read(pfdev, JS_STATUS(j));

job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);

dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", j, - panfrost_exception_name(job_read(pfdev, JS_STATUS(j))), + panfrost_exception_name(js_status), job_read(pfdev, JS_HEAD_LO(j)), job_read(pfdev, JS_TAIL_LO(j)));

- /* - * When the queue is being restarted we don't report - * faults directly to avoid races between the timeout - * and reset handlers. panfrost_scheduler_start() will - * call drm_sched_fault() after the queue has been - * started if status == FAULT_PENDING. + /* If we need a reset, signal it to the reset handler, + * otherwise, update the fence error field and signal + * the job fence. */ - old_status = atomic_cmpxchg(&pfdev->js->queue[j].status, - PANFROST_QUEUE_STATUS_STARTING, - PANFROST_QUEUE_STATUS_FAULT_PENDING); - if (old_status == PANFROST_QUEUE_STATUS_ACTIVE) - drm_sched_fault(&pfdev->js->queue[j].sched); + if (panfrost_exception_needs_reset(pfdev, js_status)) { + /* + * When the queue is being restarted we don't report + * faults directly to avoid races between the timeout + * and reset handlers. panfrost_scheduler_start() will + * call drm_sched_fault() after the queue has been + * started if status == FAULT_PENDING. + */ + old_status = atomic_cmpxchg(&pfdev->js->queue[j].status, + PANFROST_QUEUE_STATUS_STARTING, + PANFROST_QUEUE_STATUS_FAULT_PENDING); + if (old_status == PANFROST_QUEUE_STATUS_ACTIVE) + drm_sched_fault(&pfdev->js->queue[j].sched); + } else { + dma_fence_set_error(pfdev->jobs[j]->done_fence, + panfrost_exception_to_error(js_status)); + status |= JOB_INT_MASK_DONE(j); + } }

if (status & JOB_INT_MASK_DONE(j)) {

-- 2.31.1

Steven Price

3:26 p.m.

New subject: [PATCH v2 09/12] drm/panfrost: Don't reset the GPU on job faults unless we really have to

On 21/06/2021 14:39, Boris Brezillon wrote:

...

If we can recover from a fault without a reset there's no reason to issue one.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

drivers/gpu/drm/panfrost/panfrost_device.c | 9 ++++++ drivers/gpu/drm/panfrost/panfrost_device.h | 2 ++ drivers/gpu/drm/panfrost/panfrost_job.c | 35 ++++++++++++++-------- 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index 2de011cee258..ac76e8646e97 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -383,6 +383,15 @@ int panfrost_exception_to_error(u32 exception_code) return panfrost_exception_infos[exception_code].error; }

+bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev,
		    u32 exception_code)
+{
/* Right now, none of the GPU we support need a reset, but this
* might change (e.g. Valhall GPUs require a when a BUS_FAULT occurs).

NITs: ^ some ^ reset

Or just drop the example for now.

...

*/
return false;
+}

void panfrost_device_reset(struct panfrost_device *pfdev) { panfrost_gpu_soft_reset(pfdev); diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 498c7b5dccd0..95e6044008d2 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -175,6 +175,8 @@ int panfrost_device_suspend(struct device *dev);

const char *panfrost_exception_name(u32 exception_code); int panfrost_exception_to_error(u32 exception_code); +bool panfrost_exception_needs_reset(const struct panfrost_device *pfdev,
		    u32 exception_code);
static inline void panfrost_device_schedule_reset(struct panfrost_device *pfdev) diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index be5d3e4a1d0a..aedc604d331c 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -493,27 +493,38 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
if (status & JOB_INT_MASK_ERR(j)) {
	enum panfrost_queue_status old_status;
	u32 js_status = job_read(pfdev, JS_STATUS(j));

job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);

dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x",
	j,
		panfrost_exception_name(job_read(pfdev, JS_STATUS(j))),
		panfrost_exception_name(js_status),
	job_read(pfdev, JS_HEAD_LO(j)),
	job_read(pfdev, JS_TAIL_LO(j)));
	/*
	 * When the queue is being restarted we don't report
	 * faults directly to avoid races between the timeout
	 * and reset handlers. panfrost_scheduler_start() will
	 * call drm_sched_fault() after the queue has been
	 * started if status == FAULT_PENDING.
	/* If we need a reset, signal it to the reset handler,
	 * otherwise, update the fence error field and signal
	 * the job fence.
 */
	old_status = atomic_cmpxchg(&pfdev->js->queue[j].status,
				    PANFROST_QUEUE_STATUS_STARTING,
				    PANFROST_QUEUE_STATUS_FAULT_PENDING);
	if (old_status == PANFROST_QUEUE_STATUS_ACTIVE)
		drm_sched_fault(&pfdev->js->queue[j].sched);
	if (panfrost_exception_needs_reset(pfdev, js_status)) {
		/*
		 * When the queue is being restarted we don't report
		 * faults directly to avoid races between the timeout
		 * and reset handlers. panfrost_scheduler_start() will
		 * call drm_sched_fault() after the queue has been
		 * started if status == FAULT_PENDING.
		 */
		old_status = atomic_cmpxchg(&pfdev->js->queue[j].status,
					    PANFROST_QUEUE_STATUS_STARTING,
					    PANFROST_QUEUE_STATUS_FAULT_PENDING);
		if (old_status == PANFROST_QUEUE_STATUS_ACTIVE)
			drm_sched_fault(&pfdev->js->queue[j].sched);
	} else {
		dma_fence_set_error(pfdev->jobs[j]->done_fence,
				    panfrost_exception_to_error(js_status));

As in the previous patch - at the moment a status of STOPPED or TERMINATED shouldn't actually happen. But the next patch is about to change that! TERMINATED should definitely cause an error on the fence.

Steve

...

		status |= JOB_INT_MASK_DONE(j);
	}
}

if (status & JOB_INT_MASK_DONE(j)) {

Boris Brezillon

1:39 p.m.

New subject: [PATCH v2 10/12] drm/panfrost: Kill in-flight jobs on FD close

If the process who submitted these jobs decided to close the FD before the jobs are done it probably means it doesn't care about the result.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_job.c | 33 +++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index aedc604d331c..a51fa0a81367 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -494,14 +494,22 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data) if (status & JOB_INT_MASK_ERR(j)) { enum panfrost_queue_status old_status; u32 js_status = job_read(pfdev, JS_STATUS(j)); + int error = panfrost_exception_to_error(js_status); + const char *exception_name = panfrost_exception_name(js_status);

job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);

- dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", - j, - panfrost_exception_name(js_status), - job_read(pfdev, JS_HEAD_LO(j)), - job_read(pfdev, JS_TAIL_LO(j))); + if (!error) { + dev_dbg(pfdev->dev, "js interrupt, js=%d, status=%s, head=0x%x, tail=0x%x", + j, exception_name, + job_read(pfdev, JS_HEAD_LO(j)), + job_read(pfdev, JS_TAIL_LO(j))); + } else { + dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x", + j, exception_name, + job_read(pfdev, JS_HEAD_LO(j)), + job_read(pfdev, JS_TAIL_LO(j))); + }

/* If we need a reset, signal it to the reset handler, * otherwise, update the fence error field and signal @@ -688,10 +696,25 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)

void panfrost_job_close(struct panfrost_file_priv *panfrost_priv) { + struct panfrost_device *pfdev = panfrost_priv->pfdev; + unsigned long flags; int i;

for (i = 0; i < NUM_JOB_SLOTS; i++) drm_sched_entity_destroy(&panfrost_priv->sched_entity[i]); + + /* Kill in-flight jobs */ + spin_lock_irqsave(&pfdev->js->job_lock, flags); + for (i = 0; i < NUM_JOB_SLOTS; i++) { + struct drm_sched_entity *entity = &panfrost_priv->sched_entity[i]; + struct panfrost_job *job = pfdev->jobs[i]; + + if (!job || job->base.entity != entity) + continue; + + job_write(pfdev, JS_COMMAND(i), JS_COMMAND_HARD_STOP); + } + spin_unlock_irqrestore(&pfdev->js->job_lock, flags); }

int panfrost_job_is_idle(struct panfrost_device *pfdev)

-- 2.31.1

Steven Price

3:31 p.m.

New subject: [PATCH v2 10/12] drm/panfrost: Kill in-flight jobs on FD close

On 21/06/2021 14:39, Boris Brezillon wrote:

...

If the process who submitted these jobs decided to close the FD before the jobs are done it probably means it doesn't care about the result.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

drivers/gpu/drm/panfrost/panfrost_job.c | 33 +++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index aedc604d331c..a51fa0a81367 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -494,14 +494,22 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data) if (status & JOB_INT_MASK_ERR(j)) { enum panfrost_queue_status old_status; u32 js_status = job_read(pfdev, JS_STATUS(j));
	int error = panfrost_exception_to_error(js_status);
	const char *exception_name = panfrost_exception_name(js_status);

NIT: I'm not sure if it's worth it, but it feels like a function which returns both the name and error-code would make sense. E.g. making struct panfrost_exception_info public.

...

	job_write(pfdev, JS_COMMAND_NEXT(j), JS_COMMAND_NOP);

	dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x",

```
		j,
```
```
		panfrost_exception_name(js_status),
```
```
		job_read(pfdev, JS_HEAD_LO(j)),
```
```
		job_read(pfdev, JS_TAIL_LO(j)));
```

```
	if (!error) {
```

		dev_dbg(pfdev->dev, "js interrupt, js=%d, status=%s, head=0x%x, tail=0x%x",

```
			j, exception_name,
```
```
			job_read(pfdev, JS_HEAD_LO(j)),
```
```
			job_read(pfdev, JS_TAIL_LO(j)));
```
```
	} else {
```

		dev_err(pfdev->dev, "js fault, js=%d, status=%s, head=0x%x, tail=0x%x",

```
			j, exception_name,
```
```
			job_read(pfdev, JS_HEAD_LO(j)),
```
```
			job_read(pfdev, JS_TAIL_LO(j)));
```
```
	}
```

Again here you're going to have issues with TERMINATED - dev_err() is probably too chatty, so just changing panfrost_exception_to_error() to return an error value is going to cause problems here.

Steve

...

	/* If we need a reset, signal it to the reset handler,
	 * otherwise, update the fence error field and signal
@@ -688,10 +696,25 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)

void panfrost_job_close(struct panfrost_file_priv *panfrost_priv) {
struct panfrost_device *pfdev = panfrost_priv->pfdev;

unsigned long flags; int i;

for (i = 0; i < NUM_JOB_SLOTS; i++) drm_sched_entity_destroy(&panfrost_priv->sched_entity[i]);

/* Kill in-flight jobs */

spin_lock_irqsave(&pfdev->js->job_lock, flags);

for (i = 0; i < NUM_JOB_SLOTS; i++) {
struct drm_sched_entity *entity = &panfrost_priv->sched_entity[i];
struct panfrost_job *job = pfdev->jobs[i];
if (!job || job->base.entity != entity)
	continue;
job_write(pfdev, JS_COMMAND(i), JS_COMMAND_HARD_STOP);
}

spin_unlock_irqrestore(&pfdev->js->job_lock, flags);
}

int panfrost_job_is_idle(struct panfrost_device *pfdev)

Boris Brezillon

1:39 p.m.

New subject: [PATCH v2 11/12] drm/panfrost: Make ->run_job() return an ERR_PTR() when appropriate

If the fence creation fail, we can return the error pointer directly. The core will update the fence error accordingly.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index a51fa0a81367..74b63e1ee6d9 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -355,7 +355,7 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)

fence = panfrost_fence_create(pfdev, slot); if (IS_ERR(fence)) - return NULL; + return fence;

if (job->done_fence) dma_fence_put(job->done_fence);

-- 2.31.1

Steven Price

3:33 p.m.

New subject: [PATCH v2 11/12] drm/panfrost: Make ->run_job() return an ERR_PTR() when appropriate

On 21/06/2021 14:39, Boris Brezillon wrote:

...

If the fence creation fail, we can return the error pointer directly. The core will update the fence error accordingly.

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

Reviewed-by: Steven Price steven.price@arm.com

...

drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index a51fa0a81367..74b63e1ee6d9 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -355,7 +355,7 @@ static struct dma_fence *panfrost_job_run(struct drm_sched_job *sched_job)

fence = panfrost_fence_create(pfdev, slot); if (IS_ERR(fence))
return NULL;
return fence;
if (job->done_fence) dma_fence_put(job->done_fence);

Boris Brezillon

1:39 p.m.

New subject: [PATCH v2 12/12] drm/panfrost: Shorten the fence signalling section

panfrost_reset() does not directly signal fences, but panfrost_scheduler_start() does, when calling drm_sched_start().

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_job.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 74b63e1ee6d9..cf6abe0fdf47 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -414,6 +414,7 @@ static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue, static void panfrost_scheduler_start(struct panfrost_queue_state *queue) { enum panfrost_queue_status old_status; + bool cookie;

mutex_lock(&queue->lock); old_status = atomic_xchg(&queue->status, @@ -423,7 +424,9 @@ static void panfrost_scheduler_start(struct panfrost_queue_state *queue) /* Restore the original timeout before starting the scheduler. */ queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS); drm_sched_resubmit_jobs(&queue->sched); + cookie = dma_fence_begin_signalling(); drm_sched_start(&queue->sched, true); + dma_fence_end_signalling(cookie); old_status = atomic_xchg(&queue->status, PANFROST_QUEUE_STATUS_ACTIVE); if (old_status == PANFROST_QUEUE_STATUS_FAULT_PENDING) @@ -566,9 +569,7 @@ static void panfrost_reset(struct work_struct *work) reset.work); unsigned long flags; unsigned int i; - bool cookie;

- cookie = dma_fence_begin_signalling(); for (i = 0; i < NUM_JOB_SLOTS; i++) { /* * We want pending timeouts to be handled before we attempt @@ -608,8 +609,6 @@ static void panfrost_reset(struct work_struct *work)

for (i = 0; i < NUM_JOB_SLOTS; i++) panfrost_scheduler_start(&pfdev->js->queue[i]); - - dma_fence_end_signalling(cookie); }

int panfrost_job_init(struct panfrost_device *pfdev)

-- 2.31.1

Steven Price

3:43 p.m.

New subject: [PATCH v2 12/12] drm/panfrost: Shorten the fence signalling section

On 21/06/2021 14:39, Boris Brezillon wrote:

...

panfrost_reset() does not directly signal fences, but panfrost_scheduler_start() does, when calling drm_sched_start().

I have to admit to not fully understanding dma_fence_begin_signalling() - but I thought the idea was that it should have a relatively wide length to actually catch locking bugs. Just wrapping drm_sched_start() looks wrong: i.e. why isn't this code just contained in drm_sched_start()?

The relevant section from the docs reads:

...

All code necessary to complete a &dma_fence must be annotated, from the

point where a fence is accessible to other threads, to the point where

dma_fence_signal() is called. Un-annotated code can contain deadlock issues,

and due to the very strict rules and many corner cases it is infeasible to

catch these just with review or normal stress testing

So it makes sense that we annotate code from when the reset is started to after the signalling has happened. That way if there are any locks taken in the reset path which could be blocked waiting for any of the fences which might be signalled we get moaned at by lockdep.

Steve

...

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com

drivers/gpu/drm/panfrost/panfrost_job.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 74b63e1ee6d9..cf6abe0fdf47 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -414,6 +414,7 @@ static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue, static void panfrost_scheduler_start(struct panfrost_queue_state *queue) { enum panfrost_queue_status old_status;

bool cookie;

mutex_lock(&queue->lock); old_status = atomic_xchg(&queue->status,

@@ -423,7 +424,9 @@ static void panfrost_scheduler_start(struct panfrost_queue_state *queue) /* Restore the original timeout before starting the scheduler. */ queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS); drm_sched_resubmit_jobs(&queue->sched);

cookie = dma_fence_begin_signalling(); drm_sched_start(&queue->sched, true);

dma_fence_end_signalling(cookie); old_status = atomic_xchg(&queue->status, PANFROST_QUEUE_STATUS_ACTIVE); if (old_status == PANFROST_QUEUE_STATUS_FAULT_PENDING)

@@ -566,9 +569,7 @@ static void panfrost_reset(struct work_struct *work) reset.work); unsigned long flags; unsigned int i;

bool cookie;

cookie = dma_fence_begin_signalling(); for (i = 0; i < NUM_JOB_SLOTS; i++) { /*

We want pending timeouts to be handled before we attempt

@@ -608,8 +609,6 @@ static void panfrost_reset(struct work_struct *work)

for (i = 0; i < NUM_JOB_SLOTS; i++) panfrost_scheduler_start(&pfdev->js->queue[i]);

dma_fence_end_signalling(cookie);

}

int panfrost_job_init(struct panfrost_device *pfdev)

1413

Age (days ago)

1416

Last active (days ago)

dri-devel@lists.freedesktop.org

33 comments

3 participants

tags (0)

participants (3)

Alyssa Rosenzweig
Boris Brezillon
Steven Price