We are likely to write multiple page entries at once and already ensure proper write buffer flushing before GPU submit, so this improves CPU time usage in the submit path without any downsides.
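For readers less familiar with the DMA API: dma_alloc_wc()/dma_free_wc() are the
write-combined counterparts of dma_alloc_coherent()/dma_free_coherent(). Below is a
minimal sketch of the allocation pattern this series switches to; the struct and
helper names are made up for illustration, only the DMA API calls are real.

#include <linux/dma-mapping.h>
#include <linux/sizes.h>

/* Illustrative page table wrapper (not an etnaviv structure). */
struct example_pgtable {
	u32 *cpu;	/* CPU view, write-combined mapping */
	dma_addr_t dma;	/* bus address programmed into the GPU MMU */
};

static int example_pgtable_alloc(struct device *dev, struct example_pgtable *pt)
{
	/* write-combined instead of coherent: CPU stores get buffered and merged */
	pt->cpu = dma_alloc_wc(dev, SZ_4K, &pt->dma, GFP_KERNEL);
	if (!pt->cpu)
		return -ENOMEM;

	return 0;
}

static void example_pgtable_free(struct device *dev, struct example_pgtable *pt)
{
	dma_free_wc(dev, SZ_4K, pt->cpu, pt->dma);
}

The trade-off is that the CPU write buffer must be drained before the GPU walks the
tables, which is exactly the flushing before GPU submit referred to above.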
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
---
v2: use shorter _wc function names.
---
 drivers/gpu/drm/etnaviv/etnaviv_iommu.c    | 34 +++++-----
 drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c | 74 ++++++++++------------
 2 files changed, 49 insertions(+), 59 deletions(-)
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu.c
index 4b9b11ca6f03..4ada19054443 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_iommu.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu.c
@@ -47,11 +47,10 @@ static int __etnaviv_iommu_init(struct etnaviv_iommuv1_domain *etnaviv_domain)
 	u32 *p;
 	int i;
 
-	etnaviv_domain->base.bad_page_cpu = dma_alloc_coherent(
-						etnaviv_domain->base.dev,
-						SZ_4K,
-						&etnaviv_domain->base.bad_page_dma,
-						GFP_KERNEL);
+	etnaviv_domain->base.bad_page_cpu =
+		dma_alloc_wc(etnaviv_domain->base.dev, SZ_4K,
+			     &etnaviv_domain->base.bad_page_dma,
+			     GFP_KERNEL);
 	if (!etnaviv_domain->base.bad_page_cpu)
 		return -ENOMEM;
 
@@ -59,14 +58,14 @@ static int __etnaviv_iommu_init(struct etnaviv_iommuv1_domain *etnaviv_domain)
 	for (i = 0; i < SZ_4K / 4; i++)
 		*p++ = 0xdead55aa;
 
-	etnaviv_domain->pgtable_cpu =
-			dma_alloc_coherent(etnaviv_domain->base.dev, PT_SIZE,
-					   &etnaviv_domain->pgtable_dma,
-					   GFP_KERNEL);
+	etnaviv_domain->pgtable_cpu = dma_alloc_wc(etnaviv_domain->base.dev,
+						   PT_SIZE,
+						   &etnaviv_domain->pgtable_dma,
+						   GFP_KERNEL);
 	if (!etnaviv_domain->pgtable_cpu) {
-		dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-				  etnaviv_domain->base.bad_page_cpu,
-				  etnaviv_domain->base.bad_page_dma);
+		dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+			    etnaviv_domain->base.bad_page_cpu,
+			    etnaviv_domain->base.bad_page_dma);
 		return -ENOMEM;
 	}
 
@@ -81,13 +80,12 @@ static void etnaviv_iommuv1_domain_free(struct etnaviv_iommu_domain *domain)
 	struct etnaviv_iommuv1_domain *etnaviv_domain =
 			to_etnaviv_domain(domain);
 
-	dma_free_coherent(etnaviv_domain->base.dev, PT_SIZE,
-			  etnaviv_domain->pgtable_cpu,
-			  etnaviv_domain->pgtable_dma);
+	dma_free_wc(etnaviv_domain->base.dev, PT_SIZE,
+		    etnaviv_domain->pgtable_cpu, etnaviv_domain->pgtable_dma);
 
-	dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-			  etnaviv_domain->base.bad_page_cpu,
-			  etnaviv_domain->base.bad_page_dma);
+	dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+		    etnaviv_domain->base.bad_page_cpu,
+		    etnaviv_domain->base.bad_page_dma);
 
 	kfree(etnaviv_domain);
 }
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
index 9752dbd5d28b..47785d61cd95 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
@@ -104,11 +104,10 @@ static int etnaviv_iommuv2_init(struct etnaviv_iommuv2_domain *etnaviv_domain)
 	int ret, i, j;
 
 	/* allocate scratch page */
-	etnaviv_domain->base.bad_page_cpu = dma_alloc_coherent(
-						etnaviv_domain->base.dev,
-						SZ_4K,
-						&etnaviv_domain->base.bad_page_dma,
-						GFP_KERNEL);
+	etnaviv_domain->base.bad_page_cpu =
+		dma_alloc_wc(etnaviv_domain->base.dev, SZ_4K,
+			     &etnaviv_domain->base.bad_page_dma,
+			     GFP_KERNEL);
 	if (!etnaviv_domain->base.bad_page_cpu) {
 		ret = -ENOMEM;
 		goto fail_mem;
@@ -117,19 +116,17 @@ static int etnaviv_iommuv2_init(struct etnaviv_iommuv2_domain *etnaviv_domain)
 	for (i = 0; i < SZ_4K / 4; i++)
 		*p++ = 0xdead55aa;
 
-	etnaviv_domain->pta_cpu = dma_alloc_coherent(etnaviv_domain->base.dev,
-						     SZ_4K,
-						     &etnaviv_domain->pta_dma,
-						     GFP_KERNEL);
+	etnaviv_domain->pta_cpu = dma_alloc_wc(etnaviv_domain->base.dev,
+					       SZ_4K, &etnaviv_domain->pta_dma,
+					       GFP_KERNEL);
 	if (!etnaviv_domain->pta_cpu) {
 		ret = -ENOMEM;
 		goto fail_mem;
 	}
 
-	etnaviv_domain->mtlb_cpu = dma_alloc_coherent(etnaviv_domain->base.dev,
-						      SZ_4K,
-						      &etnaviv_domain->mtlb_dma,
-						      GFP_KERNEL);
+	etnaviv_domain->mtlb_cpu = dma_alloc_wc(etnaviv_domain->base.dev,
+						SZ_4K, &etnaviv_domain->mtlb_dma,
+						GFP_KERNEL);
 	if (!etnaviv_domain->mtlb_cpu) {
 		ret = -ENOMEM;
 		goto fail_mem;
@@ -138,10 +135,9 @@ static int etnaviv_iommuv2_init(struct etnaviv_iommuv2_domain *etnaviv_domain)
 	/* pre-populate STLB pages (may want to switch to on-demand later) */
 	for (i = 0; i < MMUv2_MAX_STLB_ENTRIES; i++) {
 		etnaviv_domain->stlb_cpu[i] =
-			dma_alloc_coherent(etnaviv_domain->base.dev,
-					   SZ_4K,
-					   &etnaviv_domain->stlb_dma[i],
-					   GFP_KERNEL);
+			dma_alloc_wc(etnaviv_domain->base.dev, SZ_4K,
+				     &etnaviv_domain->stlb_dma[i],
+				     GFP_KERNEL);
 		if (!etnaviv_domain->stlb_cpu[i]) {
 			ret = -ENOMEM;
 			goto fail_mem;
@@ -158,25 +154,23 @@ static int etnaviv_iommuv2_init(struct etnaviv_iommuv2_domain *etnaviv_domain)
 
 fail_mem:
 	if (etnaviv_domain->base.bad_page_cpu)
-		dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-				  etnaviv_domain->base.bad_page_cpu,
-				  etnaviv_domain->base.bad_page_dma);
+		dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+			    etnaviv_domain->base.bad_page_cpu,
+			    etnaviv_domain->base.bad_page_dma);
 
 	if (etnaviv_domain->pta_cpu)
-		dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-				  etnaviv_domain->pta_cpu,
-				  etnaviv_domain->pta_dma);
+		dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+			    etnaviv_domain->pta_cpu, etnaviv_domain->pta_dma);
 
 	if (etnaviv_domain->mtlb_cpu)
-		dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-				  etnaviv_domain->mtlb_cpu,
-				  etnaviv_domain->mtlb_dma);
+		dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+			    etnaviv_domain->mtlb_cpu, etnaviv_domain->mtlb_dma);
 
 	for (i = 0; i < MMUv2_MAX_STLB_ENTRIES; i++) {
 		if (etnaviv_domain->stlb_cpu[i])
-			dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-					  etnaviv_domain->stlb_cpu[i],
-					  etnaviv_domain->stlb_dma[i]);
+			dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+				    etnaviv_domain->stlb_cpu[i],
+				    etnaviv_domain->stlb_dma[i]);
 	}
 
 	return ret;
@@ -188,23 +182,21 @@ static void etnaviv_iommuv2_domain_free(struct etnaviv_iommu_domain *domain)
 			to_etnaviv_domain(domain);
 	int i;
 
-	dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-			  etnaviv_domain->base.bad_page_cpu,
-			  etnaviv_domain->base.bad_page_dma);
+	dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+		    etnaviv_domain->base.bad_page_cpu,
+		    etnaviv_domain->base.bad_page_dma);
 
-	dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-			  etnaviv_domain->pta_cpu,
-			  etnaviv_domain->pta_dma);
+	dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+		    etnaviv_domain->pta_cpu, etnaviv_domain->pta_dma);
 
-	dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-			  etnaviv_domain->mtlb_cpu,
-			  etnaviv_domain->mtlb_dma);
+	dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+		    etnaviv_domain->mtlb_cpu, etnaviv_domain->mtlb_dma);
 
 	for (i = 0; i < MMUv2_MAX_STLB_ENTRIES; i++) {
 		if (etnaviv_domain->stlb_cpu[i])
-			dma_free_coherent(etnaviv_domain->base.dev, SZ_4K,
-					  etnaviv_domain->stlb_cpu[i],
-					  etnaviv_domain->stlb_dma[i]);
+			dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
+				    etnaviv_domain->stlb_cpu[i],
+				    etnaviv_domain->stlb_dma[i]);
 	}
 
 	vfree(etnaviv_domain);
With etnaviv not being tied into the IOMMU framework anymore, the MMU functions will only be called under sleeping locks. Thus we are able to allocate the memory for the 2nd level page tables on demand without having to deal with memory allocation in atomic context.
This speeds up driver initialization on MMUv2 GPU cores, as we don't need to preallocate all the page table memory, and it also reduces memory consumption for most workloads, as most of them won't use the full GPU virtual address space.
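As a rough sketch of the on-demand scheme described above (hypothetical names; only the
bitmap helpers and DMA API calls are real kernel interfaces), the second-level table
behind a first-level slot is only allocated the first time something is mapped through it:

#include <linux/bitmap.h>
#include <linux/bits.h>
#include <linux/dma-mapping.h>
#include <linux/sizes.h>

#define EXAMPLE_MAX_STLB_ENTRIES 1024

/* Illustrative two-level page table state (not the etnaviv structures). */
struct example_mmu {
	struct device *dev;
	u32 *mtlb_cpu;					/* first level, allocated at init */
	u32 *stlb_cpu[EXAMPLE_MAX_STLB_ENTRIES];	/* second level, filled lazily */
	dma_addr_t stlb_dma[EXAMPLE_MAX_STLB_ENTRIES];
	DECLARE_BITMAP(stlb_allocated, EXAMPLE_MAX_STLB_ENTRIES);
};

/*
 * Allocate the second-level table for one slot if it does not exist yet.
 * May sleep (GFP_KERNEL), which is fine now that the map path only runs
 * under sleeping locks.
 */
static int example_ensure_stlb(struct example_mmu *mmu, int slot)
{
	if (test_bit(slot, mmu->stlb_allocated))
		return 0;

	mmu->stlb_cpu[slot] = dma_alloc_wc(mmu->dev, SZ_4K,
					   &mmu->stlb_dma[slot], GFP_KERNEL);
	if (!mmu->stlb_cpu[slot])
		return -ENOMEM;

	__set_bit(slot, mmu->stlb_allocated);
	/* hook the new table into the first level; bit 0 as "present" is illustrative */
	mmu->mtlb_cpu[slot] = (u32)mmu->stlb_dma[slot] | BIT(0);
	return 0;
}

The bitmap tracks which slots are populated, so the teardown path only frees tables
that were actually allocated, matching the stlb_allocated bitmap the patch adds.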
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
---
v2: fill MTLB for unpopulated STLBs with PTE_EXCEPTION
---
 drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c | 63 ++++++++++++----------
 1 file changed, 34 insertions(+), 29 deletions(-)
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
index 47785d61cd95..2d9d09608bc2 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c
@@ -47,8 +47,9 @@ struct etnaviv_iommuv2_domain {
 	u32 *mtlb_cpu;
 	dma_addr_t mtlb_dma;
 	/* S(lave) TLB aka second level pagetable */
-	u32 *stlb_cpu[1024];
-	dma_addr_t stlb_dma[1024];
+	u32 *stlb_cpu[MMUv2_MAX_STLB_ENTRIES];
+	dma_addr_t stlb_dma[MMUv2_MAX_STLB_ENTRIES];
+	DECLARE_BITMAP(stlb_allocated, MMUv2_MAX_STLB_ENTRIES);
 };
 
 static struct etnaviv_iommuv2_domain *
@@ -57,13 +58,35 @@ to_etnaviv_domain(struct etnaviv_iommu_domain *domain)
 	return container_of(domain, struct etnaviv_iommuv2_domain, base);
 }
 
+static int
+etnaviv_iommuv2_ensure_stlb(struct etnaviv_iommuv2_domain *etnaviv_domain,
+			    int stlb)
+{
+	if (test_bit(stlb, etnaviv_domain->stlb_allocated))
+		return 0;
+
+	etnaviv_domain->stlb_cpu[stlb] =
+			dma_alloc_writecombine(etnaviv_domain->base.dev,
+					       SZ_4K,
+					       &etnaviv_domain->stlb_dma[stlb],
+					       GFP_KERNEL);
+
+	if (!etnaviv_domain->stlb_cpu[stlb])
+		return -ENOMEM;
+
+	__set_bit(stlb, etnaviv_domain->stlb_allocated);
+	etnaviv_domain->mtlb_cpu[stlb] = etnaviv_domain->stlb_dma[stlb] |
+					 MMUv2_PTE_PRESENT;
+	return 0;
+}
+
 static int etnaviv_iommuv2_map(struct etnaviv_iommu_domain *domain,
 			       unsigned long iova, phys_addr_t paddr,
 			       size_t size, int prot)
 {
 	struct etnaviv_iommuv2_domain *etnaviv_domain =
 			to_etnaviv_domain(domain);
-	int mtlb_entry, stlb_entry;
+	int mtlb_entry, stlb_entry, ret;
 	u32 entry = (u32)paddr | MMUv2_PTE_PRESENT;
 
 	if (size != SZ_4K)
@@ -75,6 +98,10 @@ static int etnaviv_iommuv2_map(struct etnaviv_iommu_domain *domain,
 	mtlb_entry = (iova & MMUv2_MTLB_MASK) >> MMUv2_MTLB_SHIFT;
 	stlb_entry = (iova & MMUv2_STLB_MASK) >> MMUv2_STLB_SHIFT;
 
+	ret = etnaviv_iommuv2_ensure_stlb(etnaviv_domain, mtlb_entry);
+	if (ret)
+		return ret;
+
 	etnaviv_domain->stlb_cpu[mtlb_entry][stlb_entry] = entry;
 
 	return 0;
@@ -101,7 +128,7 @@ static size_t etnaviv_iommuv2_unmap(struct etnaviv_iommu_domain *domain,
 static int etnaviv_iommuv2_init(struct etnaviv_iommuv2_domain *etnaviv_domain)
 {
 	u32 *p;
-	int ret, i, j;
+	int ret, i;
 
 	/* allocate scratch page */
 	etnaviv_domain->base.bad_page_cpu =
@@ -132,23 +159,8 @@ static int etnaviv_iommuv2_init(struct etnaviv_iommuv2_domain *etnaviv_domain)
 		goto fail_mem;
 	}
 
-	/* pre-populate STLB pages (may want to switch to on-demand later) */
-	for (i = 0; i < MMUv2_MAX_STLB_ENTRIES; i++) {
-		etnaviv_domain->stlb_cpu[i] =
-			dma_alloc_wc(etnaviv_domain->base.dev, SZ_4K,
-				     &etnaviv_domain->stlb_dma[i],
-				     GFP_KERNEL);
-		if (!etnaviv_domain->stlb_cpu[i]) {
-			ret = -ENOMEM;
-			goto fail_mem;
-		}
-		p = etnaviv_domain->stlb_cpu[i];
-		for (j = 0; j < SZ_4K / 4; j++)
-			*p++ = MMUv2_PTE_EXCEPTION;
-
-		etnaviv_domain->mtlb_cpu[i] = etnaviv_domain->stlb_dma[i] |
-					      MMUv2_PTE_PRESENT;
-	}
+	memset32(etnaviv_domain->mtlb_cpu, MMUv2_PTE_EXCEPTION,
+		 MMUv2_MAX_STLB_ENTRIES);
 
 	return 0;
 
@@ -166,13 +178,6 @@ static int etnaviv_iommuv2_init(struct etnaviv_iommuv2_domain *etnaviv_domain)
 	dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
 		    etnaviv_domain->mtlb_cpu, etnaviv_domain->mtlb_dma);
 
-	for (i = 0; i < MMUv2_MAX_STLB_ENTRIES; i++) {
-		if (etnaviv_domain->stlb_cpu[i])
-			dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
-				    etnaviv_domain->stlb_cpu[i],
-				    etnaviv_domain->stlb_dma[i]);
-	}
-
 	return ret;
 }
 
@@ -193,7 +198,7 @@ static void etnaviv_iommuv2_domain_free(struct etnaviv_iommu_domain *domain)
 		    etnaviv_domain->mtlb_cpu, etnaviv_domain->mtlb_dma);
 
 	for (i = 0; i < MMUv2_MAX_STLB_ENTRIES; i++) {
-		if (etnaviv_domain->stlb_cpu[i])
+		if (test_bit(i, etnaviv_domain->stlb_allocated))
 			dma_free_wc(etnaviv_domain->base.dev, SZ_4K,
 				    etnaviv_domain->stlb_cpu[i],
 				    etnaviv_domain->stlb_dma[i]);
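A quick note on memset32(), since it replaces the open-coded initialization loop above:
it stores a 32-bit pattern, and its third argument counts u32 elements, not bytes. A tiny
standalone illustration (array name and fill value are made up):

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/types.h>

static void example_fill(void)
{
	static u32 table[1024];

	/* writes the pattern into all 1024 entries (count is in u32 units) */
	memset32(table, 0xa5a5a5a5, ARRAY_SIZE(table));
}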
On Tue, 2018-05-08 at 15:23 +0200, Lucas Stach wrote:
> We are likely to write multiple page entries at once and already ensure
> proper write buffer flushing before GPU submit, so this improves CPU
> time usage in the submit path without any downsides.
> 
> Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>

regards
Philipp