Some hardware variants contain a system cache or the last level cache(llc). This cache is typically a large block which is shared by multiple clients on the SOC. GPU uses the system cache to cache both the GPU data buffers(like textures) as well the SMMU pagetables. This helps with improved render performance as well as lower power consumption by reducing the bus traffic to the system memory.
The system cache architecture allows the cache to be split into slices which then be used by multiple SOC clients. This patch series is an effort to enable and use two of those slices perallocated for the GPU, one for the GPU data buffers and another for the GPU SMMU hardware pagetables.
Patch 1 - Patch 4 adds system cache support in SMMU and GPU driver. Patch 5 and 6 are minor cleanups for arm-smmu impl.
The series is based on top of https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables
Changes in v6: * Move table to arm-smmu-qcom (Robin)
Changes in v5: * Drop cleanup of blank lines since it was intentional (Robin) * Rebase again on top of msm-next-pgtables as it moves pretty fast
Changes in v4: * Drop IOMMU_SYS_CACHE prot flag * Rebase on top of https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables
Changes in v3: * Fix domain attribute setting to before iommu_attach_device() * Fix few code style and checkpatch warnings * Rebase on top of Jordan's latest split pagetables and per-instance pagetables support
Changes in v2: * Addressed review comments and rebased on top of Jordan's split pagetables series
Sai Prakash Ranjan (4): iommu/io-pgtable-arm: Add support to use system cache iommu/arm-smmu: Add domain attribute for system cache iommu: arm-smmu-impl: Use table to list QCOM implementations iommu: arm-smmu-impl: Add a space before open parenthesis
Sharat Masetty (2): drm/msm: rearrange the gpu_rmw() function drm/msm/a6xx: Add support for using system cache(LLC)
drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 ++++++++++++++++++++++ drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +++++ drivers/gpu/drm/msm/msm_drv.c | 8 +++ drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gpu.h | 5 +- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 11 +-- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 ++++-- drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +++++ drivers/iommu/arm/arm-smmu/arm-smmu.h | 2 +- drivers/iommu/io-pgtable-arm.c | 7 +- include/linux/io-pgtable.h | 4 ++ include/linux/iommu.h | 1 + 13 files changed, 161 insertions(+), 20 deletions(-)
base-commit: ea95e543fd6201aceff96a0dd95530b2085874c4
Add a quirk IO_PGTABLE_QUIRK_SYS_CACHE to override the attributes set in TCR for the page table walker when using system cache.
Signed-off-by: Sai Prakash Ranjan saiprakash.ranjan@codeaurora.org --- drivers/iommu/io-pgtable-arm.c | 7 ++++++- include/linux/io-pgtable.h | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index dc7bcf858b6d..828426c16fa9 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -789,7 +789,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NON_STRICT | - IO_PGTABLE_QUIRK_ARM_TTBR1)) + IO_PGTABLE_QUIRK_ARM_TTBR1 | + IO_PGTABLE_QUIRK_SYS_CACHE)) return NULL;
data = arm_lpae_alloc_pgtable(cfg); @@ -801,6 +802,10 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) tcr->sh = ARM_LPAE_TCR_SH_IS; tcr->irgn = ARM_LPAE_TCR_RGN_WBWA; tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; + } else if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) { + tcr->sh = ARM_LPAE_TCR_SH_OS; + tcr->irgn = ARM_LPAE_TCR_RGN_NC; + tcr->orgn = ARM_LPAE_TCR_RGN_WBWA; } else { tcr->sh = ARM_LPAE_TCR_SH_OS; tcr->irgn = ARM_LPAE_TCR_RGN_NC; diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 23285ba645db..ecc9d2248b84 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -86,6 +86,9 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. + * + * IO_PGTABLE_QUIRK_SYS_CACHE: Override the attributes set in TCR for + * the page table walker when using system cache. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -93,6 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3) #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) + #define IO_PGTABLE_QUIRK_SYS_CACHE BIT(6) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias;
Add iommu domain attribute for using system cache aka last level cache by client drivers like GPU to set right attributes for caching the hardware pagetables into the system cache.
Signed-off-by: Sai Prakash Ranjan saiprakash.ranjan@codeaurora.org --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +++++++++++++++++ drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + include/linux/iommu.h | 1 + 3 files changed, 19 insertions(+)
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 1f06ab219819..d449c895ba16 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, if (smmu_domain->non_strict) pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
+ if (smmu_domain->sys_cache) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE; + pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) { ret = -ENOMEM; @@ -1513,6 +1516,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: *(int *)data = smmu_domain->non_strict; return 0; + case DOMAIN_ATTR_SYS_CACHE: + *((int *)data) = smmu_domain->sys_cache; + return 0; default: return -ENODEV; } @@ -1544,6 +1550,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, else smmu_domain->stage = ARM_SMMU_DOMAIN_S1; break; + case DOMAIN_ATTR_SYS_CACHE: + if (smmu_domain->smmu) { + ret = -EPERM; + goto out_unlock; + } + + if (*((int *)data)) + smmu_domain->sys_cache = true; + else + smmu_domain->sys_cache = false; + break; default: ret = -ENODEV; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index ddf2ca4c923d..93593e164e44 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -373,6 +373,7 @@ struct arm_smmu_domain { struct mutex init_mutex; /* Protects smmu pointer */ spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */ struct iommu_domain domain; + bool sys_cache; };
struct arm_smmu_master_cfg { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index fee209efb756..a580dfe9c68d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -118,6 +118,7 @@ enum iommu_attr { DOMAIN_ATTR_FSL_PAMUV1, DOMAIN_ATTR_NESTING, /* two stages of translation */ DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, + DOMAIN_ATTR_SYS_CACHE, DOMAIN_ATTR_MAX, };
From: Sharat Masetty smasetty@codeaurora.org
The register read-modify-write construct is generic enough that it can be used by other subsystems as needed, create a more generic rmw() function and have the gpu_rmw() use this new function.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org Reviewed-by: Jordan Crouse jcrouse@codeaurora.org Signed-off-by: Sai Prakash Ranjan saiprakash.ranjan@codeaurora.org --- drivers/gpu/drm/msm/msm_drv.c | 8 ++++++++ drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gpu.h | 5 +---- 3 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 49685571dc0e..a1e22b974b77 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr) return val; }
+void msm_rmw(void __iomem *addr, u32 mask, u32 or) +{ + u32 val = msm_readl(addr); + + val &= ~mask; + msm_writel(val | or, addr); +} + struct msm_vblank_work { struct work_struct work; int crtc_id; diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index b9dd8f8f4887..655b3b0424a1 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device *pdev, const char *name, const char *dbgname); void msm_writel(u32 data, void __iomem *addr); u32 msm_readl(const void __iomem *addr); +void msm_rmw(void __iomem *addr, u32 mask, u32 or);
struct msm_gpu_submitqueue; int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 6c9e1fdc1a76..b2b419277953 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or) { - uint32_t val = gpu_read(gpu, reg); - - val &= ~mask; - gpu_write(gpu, reg, val | or); + msm_rmw(gpu->mmio + (reg << 2), mask, or); }
static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
On Mon, Oct 26, 2020 at 05:24:02PM +0530, Sai Prakash Ranjan wrote:
From: Sharat Masetty smasetty@codeaurora.org
The register read-modify-write construct is generic enough that it can be used by other subsystems as needed, create a more generic rmw() function and have the gpu_rmw() use this new function.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org Reviewed-by: Jordan Crouse jcrouse@codeaurora.org Signed-off-by: Sai Prakash Ranjan saiprakash.ranjan@codeaurora.org
Rob - this should be safe to pull with msm-next regardless of the merge status of the iommu side of things. Hopefully everything will be pulled for 5.11 but if it isn't it would be good to get this out of the cycle.
Jordan
drivers/gpu/drm/msm/msm_drv.c | 8 ++++++++ drivers/gpu/drm/msm/msm_drv.h | 1 + drivers/gpu/drm/msm/msm_gpu.h | 5 +---- 3 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 49685571dc0e..a1e22b974b77 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr) return val; }
+void msm_rmw(void __iomem *addr, u32 mask, u32 or) +{
- u32 val = msm_readl(addr);
- val &= ~mask;
- msm_writel(val | or, addr);
+}
struct msm_vblank_work { struct work_struct work; int crtc_id; diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index b9dd8f8f4887..655b3b0424a1 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device *pdev, const char *name, const char *dbgname); void msm_writel(u32 data, void __iomem *addr); u32 msm_readl(const void __iomem *addr); +void msm_rmw(void __iomem *addr, u32 mask, u32 or);
struct msm_gpu_submitqueue; int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 6c9e1fdc1a76..b2b419277953 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or) {
- uint32_t val = gpu_read(gpu, reg);
- val &= ~mask;
- gpu_write(gpu, reg, val | or);
- msm_rmw(gpu->mmio + (reg << 2), mask, or);
}
static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation
From: Sharat Masetty smasetty@codeaurora.org
The last level system cache can be partitioned to 32 different slices of which GPU has two slices preallocated. One slice is used for caching GPU buffers and the other slice is used for caching the GPU SMMU pagetables. This talks to the core system cache driver to acquire the slice handles, configure the SCID's to those slices and activates and deactivates the slices upon GPU power collapse and restore.
Some support from the IOMMU driver is also needed to make use of the system cache to set the right TCR attributes. GPU then has the ability to override a few cacheability parameters which it does to override write-allocate to write-no-allocate as the GPU hardware does not benefit much from it.
DOMAIN_ATTR_SYS_CACHE is another domain level attribute used by the IOMMU driver to set the right attributes to cache the hardware pagetables into the system cache.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org [saiprakash.ranjan: fix to set attr before device attach to iommu and rebase] Signed-off-by: Sai Prakash Ranjan saiprakash.ranjan@codeaurora.org --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 +++++++++++++++++++++++++ drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +++++ 3 files changed, 104 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 8915882e4444..151190ff62f7 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -8,7 +8,9 @@ #include "a6xx_gpu.h" #include "a6xx_gmu.xml.h"
+#include <linux/bitfield.h> #include <linux/devfreq.h> +#include <linux/soc/qcom/llcc-qcom.h>
#define GPU_PAS_ID 13
@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) return IRQ_HANDLED; }
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or) +{ + return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or); +} + +static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value) +{ + return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2)); +} + +static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) +{ + llcc_slice_deactivate(a6xx_gpu->llc_slice); + llcc_slice_deactivate(a6xx_gpu->htw_llc_slice); +} + +static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) +{ + u32 cntl1_regval = 0; + + if (IS_ERR(a6xx_gpu->llc_mmio)) + return; + + if (!llcc_slice_activate(a6xx_gpu->llc_slice)) { + u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice); + + gpu_scid &= 0x1f; + cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) | + (gpu_scid << 15) | (gpu_scid << 20); + } + + if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) { + u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice); + + gpuhtw_scid &= 0x1f; + cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid); + } + + if (cntl1_regval) { + /* + * Program the slice IDs for the various GPU blocks and GPU MMU + * pagetables + */ + a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval); + + /* + * Program cacheability overrides to not allocate cache lines on + * a write miss + */ + a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03); + } +} + +static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu) +{ + llcc_slice_putd(a6xx_gpu->llc_slice); + llcc_slice_putd(a6xx_gpu->htw_llc_slice); +} + +static void a6xx_llc_slices_init(struct platform_device *pdev, + struct a6xx_gpu *a6xx_gpu) +{ + a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); + if (IS_ERR(a6xx_gpu->llc_mmio)) + return; + + a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU); + a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); + + if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice)) + a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL); +} + static int a6xx_pm_resume(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
msm_gpu_resume_devfreq(gpu);
+ a6xx_llc_activate(a6xx_gpu); + return 0; }
@@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
trace_msm_gpu_suspend(0);
+ a6xx_llc_deactivate(a6xx_gpu); + devfreq_suspend_device(gpu->devfreq.devfreq);
return a6xx_gmu_stop(a6xx_gpu); @@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu) drm_gem_object_put(a6xx_gpu->shadow_bo); }
+ a6xx_llc_slices_destroy(a6xx_gpu); + a6xx_gmu_remove(a6xx_gpu);
adreno_gpu_cleanup(adreno_gpu); @@ -1209,6 +1290,8 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) if (info && info->revn == 650) adreno_gpu->base.hw_apriv = true;
+ a6xx_llc_slices_init(pdev, a6xx_gpu); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) { a6xx_destroy(&(a6xx_gpu->base.base)); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 3eeebf6a754b..9e6079af679c 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -28,6 +28,10 @@ struct a6xx_gpu { uint32_t *shadow;
bool has_whereami; + + void __iomem *llc_mmio; + void *llc_slice; + void *htw_llc_slice; };
#define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index fd8f491f2e48..86c4fe667225 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -16,6 +16,7 @@ #include <linux/soc/qcom/mdt_loader.h> #include <soc/qcom/ocmem.h> #include "adreno_gpu.h" +#include "a6xx_gpu.h" #include "msm_gem.h" #include "msm_mmu.h"
@@ -189,6 +190,8 @@ struct msm_gem_address_space * adreno_iommu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) { + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); struct iommu_domain *iommu; struct msm_mmu *mmu; struct msm_gem_address_space *aspace; @@ -198,7 +201,21 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, if (!iommu) return NULL;
+ /* + * This allows GPU to set the bus attributes required to use system + * cache on behalf of the iommu page table walker. + */ + if (!IS_ERR(a6xx_gpu->htw_llc_slice)) { + int gpu_htw_llc = 1; + + iommu_domain_set_attr(iommu, DOMAIN_ATTR_SYS_CACHE, &gpu_htw_llc); + } + mmu = msm_iommu_new(&pdev->dev, iommu); + if (IS_ERR(mmu)) { + iommu_domain_free(iommu); + return ERR_CAST(mmu); + }
/* * Use the aperture start or SZ_16M, whichever is greater. This will
On Mon, Oct 26, 2020 at 05:24:03PM +0530, Sai Prakash Ranjan wrote:
From: Sharat Masetty smasetty@codeaurora.org
The last level system cache can be partitioned to 32 different slices of which GPU has two slices preallocated. One slice is used for caching GPU buffers and the other slice is used for caching the GPU SMMU pagetables. This talks to the core system cache driver to acquire the slice handles, configure the SCID's to those slices and activates and deactivates the slices upon GPU power collapse and restore.
Some support from the IOMMU driver is also needed to make use of the system cache to set the right TCR attributes. GPU then has the ability to override a few cacheability parameters which it does to override write-allocate to write-no-allocate as the GPU hardware does not benefit much from it.
DOMAIN_ATTR_SYS_CACHE is another domain level attribute used by the IOMMU driver to set the right attributes to cache the hardware pagetables into the system cache.
Signed-off-by: Sharat Masetty smasetty@codeaurora.org [saiprakash.ranjan: fix to set attr before device attach to iommu and rebase] Signed-off-by: Sai Prakash Ranjan saiprakash.ranjan@codeaurora.org
As with the previous patch this doesn't exactly need the IOMMU side changes outside of the update to the domain attribute enum.
If the attribute didn't exist we would just lose no-write-allocate which is undesirable but not devastating.
Hopefully the arm-smmu changes are ready to go but I'm just trying to figure out a game plan to keep Sai from having to maintain these patches for another cycle.
Jordan
drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 83 +++++++++++++++++++++++++ drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 4 ++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +++++ 3 files changed, 104 insertions(+)
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 8915882e4444..151190ff62f7 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -8,7 +8,9 @@ #include "a6xx_gpu.h" #include "a6xx_gmu.xml.h"
+#include <linux/bitfield.h> #include <linux/devfreq.h> +#include <linux/soc/qcom/llcc-qcom.h>
#define GPU_PAS_ID 13
@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) return IRQ_HANDLED; }
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or) +{
- return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value) +{
- return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu) +{
- llcc_slice_deactivate(a6xx_gpu->llc_slice);
- llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu) +{
- u32 cntl1_regval = 0;
- if (IS_ERR(a6xx_gpu->llc_mmio))
return;
- if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
gpu_scid &= 0x1f;
cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) |
(gpu_scid << 15) | (gpu_scid << 20);
- }
- if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
gpuhtw_scid &= 0x1f;
cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
- }
- if (cntl1_regval) {
/*
* Program the slice IDs for the various GPU blocks and GPU MMU
* pagetables
*/
a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval);
/*
* Program cacheability overrides to not allocate cache lines on
* a write miss
*/
a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03);
- }
+}
+static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu) +{
- llcc_slice_putd(a6xx_gpu->llc_slice);
- llcc_slice_putd(a6xx_gpu->htw_llc_slice);
+}
+static void a6xx_llc_slices_init(struct platform_device *pdev,
struct a6xx_gpu *a6xx_gpu)
+{
- a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
- if (IS_ERR(a6xx_gpu->llc_mmio))
return;
- a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
- a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
- if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
+}
static int a6xx_pm_resume(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
msm_gpu_resume_devfreq(gpu);
- a6xx_llc_activate(a6xx_gpu);
- return 0;
}
@@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
trace_msm_gpu_suspend(0);
a6xx_llc_deactivate(a6xx_gpu);
devfreq_suspend_device(gpu->devfreq.devfreq);
return a6xx_gmu_stop(a6xx_gpu);
@@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu) drm_gem_object_put(a6xx_gpu->shadow_bo); }
a6xx_llc_slices_destroy(a6xx_gpu);
a6xx_gmu_remove(a6xx_gpu);
adreno_gpu_cleanup(adreno_gpu);
@@ -1209,6 +1290,8 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) if (info && info->revn == 650) adreno_gpu->base.hw_apriv = true;
- a6xx_llc_slices_init(pdev, a6xx_gpu);
- ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) { a6xx_destroy(&(a6xx_gpu->base.base));
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 3eeebf6a754b..9e6079af679c 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -28,6 +28,10 @@ struct a6xx_gpu { uint32_t *shadow;
bool has_whereami;
- void __iomem *llc_mmio;
- void *llc_slice;
- void *htw_llc_slice;
};
#define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index fd8f491f2e48..86c4fe667225 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -16,6 +16,7 @@ #include <linux/soc/qcom/mdt_loader.h> #include <soc/qcom/ocmem.h> #include "adreno_gpu.h" +#include "a6xx_gpu.h" #include "msm_gem.h" #include "msm_mmu.h"
@@ -189,6 +190,8 @@ struct msm_gem_address_space * adreno_iommu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) {
- struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
- struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); struct iommu_domain *iommu; struct msm_mmu *mmu; struct msm_gem_address_space *aspace;
@@ -198,7 +201,21 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, if (!iommu) return NULL;
/*
* This allows GPU to set the bus attributes required to use system
* cache on behalf of the iommu page table walker.
*/
if (!IS_ERR(a6xx_gpu->htw_llc_slice)) {
int gpu_htw_llc = 1;
iommu_domain_set_attr(iommu, DOMAIN_ATTR_SYS_CACHE, &gpu_htw_llc);
}
mmu = msm_iommu_new(&pdev->dev, iommu);
if (IS_ERR(mmu)) {
iommu_domain_free(iommu);
return ERR_CAST(mmu);
}
/*
- Use the aperture start or SZ_16M, whichever is greater. This will
-- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation
Use table and of_match_node() to match qcom implementation instead of multiple of_device_compatible() calls for each QCOM SMMU implementation.
Signed-off-by: Sai Prakash Ranjan saiprakash.ranjan@codeaurora.org --- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 9 +-------- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 ++++++++++++++++----- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 - 3 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index d199b4bff15d..ffaf3f91ba52 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -217,14 +217,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu) if (of_device_is_compatible(np, "nvidia,tegra194-smmu")) return nvidia_smmu_impl_init(smmu);
- if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") || - of_device_is_compatible(np, "qcom,sc7180-smmu-500") || - of_device_is_compatible(np, "qcom,sm8150-smmu-500") || - of_device_is_compatible(np, "qcom,sm8250-smmu-500")) - return qcom_smmu_impl_init(smmu); - - if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu")) - return qcom_adreno_smmu_impl_init(smmu); + smmu = qcom_smmu_impl_init(smmu);
if (of_device_is_compatible(np, "marvell,ap806-smmu-500")) smmu->impl = &mrvl_mmu500_impl; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 0663d7d26908..9c40fb7a2241 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -224,12 +224,23 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, return &qsmmu->smmu; }
+static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { + { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sdm845-smmu-500" }, + { .compatible = "qcom,sm8150-smmu-500" }, + { .compatible = "qcom,sm8250-smmu-500" }, + { } +}; + struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { - return qcom_smmu_create(smmu, &qcom_smmu_impl); -} + const struct device_node *np = smmu->dev->of_node;
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu) -{ - return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + if (of_match_node(qcom_smmu_impl_of_match, np)) + return qcom_smmu_create(smmu, &qcom_smmu_impl); + + if (of_device_is_compatible(np, "qcom,adreno-smmu")) + return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl); + + return smmu; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 93593e164e44..94f3e439c082 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -524,7 +524,6 @@ static inline void arm_smmu_writeq(struct arm_smmu_device *smmu, int page, struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu); struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu); struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu); -struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu);
void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx); int arm_mmu500_reset(struct arm_smmu_device *smmu);
Fix the checkpatch warning for space required before the open parenthesis.
Signed-off-by: Sai Prakash Ranjan saiprakash.ranjan@codeaurora.org --- drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c index ffaf3f91ba52..f16da4a21270 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c @@ -12,7 +12,7 @@
static int arm_smmu_gr0_ns(int offset) { - switch(offset) { + switch (offset) { case ARM_SMMU_GR0_sCR0: case ARM_SMMU_GR0_sACR: case ARM_SMMU_GR0_sGFSR:
dri-devel@lists.freedesktop.org