From: Rob Clark <robdclark@chromium.org>
This replaces an earlier patch[1] that added arm64 support for drm_clflush(). I've also added a patch to solve a similar cache issue in vgem.
The first few patches just export arch_sync_dma_for_*(). Possibly the EXPORT_SYMBOL_GPL()s should instead live somewhere central rather than per-arch (but where would make sense?).
The fourth patch adds (and exports) these ops for arch/arm. (Arnd Bergmann mentioned on IRC that Christoph Hellwig was already working on this for arch/arm, which could replace the fourth patch.)
The last two patches actually fix things.
[1] https://patchwork.freedesktop.org/series/64732/
Rob Clark (6):
  arm64: export arch_sync_dma_for_*()
  mips: export arch_sync_dma_for_*()
  powerpc: export arch_sync_dma_for_*()
  arm: add arch_sync_dma_for_*()
  drm/msm: stop abusing DMA API
  drm/vgem: fix cache synchronization on arm/arm64 (take two)
 arch/arm/Kconfig                  |   2 +
 arch/arm/mm/dma-mapping-nommu.c   |  14 +++
 arch/arm/mm/dma-mapping.c         |  28 ++++++
 arch/arm64/mm/dma-mapping.c       |   2 +
 arch/arm64/mm/flush.c             |   2 +
 arch/mips/mm/dma-noncoherent.c    |   2 +
 arch/powerpc/mm/dma-noncoherent.c |   2 +
 drivers/gpu/drm/drm_cache.c       |  20 ++++-
 drivers/gpu/drm/msm/msm_gem.c     |  37 +++-----
 drivers/gpu/drm/vgem/vgem_drv.c   | 145 ++++++++++++++++++++----------
 include/drm/drm_cache.h           |   4 +
 11 files changed, 182 insertions(+), 76 deletions(-)
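For reference, the pattern that the last two patches switch the drivers over to looks roughly like this (a minimal sketch, not code from the series; sync_sgt_for_device() is a made-up helper name):

#include <linux/dma-noncoherent.h>
#include <linux/scatterlist.h>

/* Clean the CPU caches for every segment of an sg_table so that the
 * device (or another importer) sees coherent memory; the inverse loop
 * calls arch_sync_dma_for_cpu() before the CPU touches the pages again. */
static void sync_sgt_for_device(struct device *dev, struct sg_table *sgt)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgt->sgl, sg, sgt->nents, i)
		arch_sync_dma_for_device(dev, sg_phys(sg), sg->length,
				DMA_BIDIRECTIONAL);
}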
From: Rob Clark <robdclark@chromium.org>

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 arch/arm64/mm/dma-mapping.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 1d3f0b5a9940..ea5ae11d07f7 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -24,12 +24,14 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
 {
 	__dma_map_area(phys_to_virt(paddr), size, dir);
 }
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_device);
 
 void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
 		size_t size, enum dma_data_direction dir)
 {
 	__dma_unmap_area(phys_to_virt(paddr), size, dir);
 }
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_cpu);
 
 void arch_dma_prep_coherent(struct page *page, size_t size)
 {
From: Rob Clark <robdclark@chromium.org>

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 arch/arm64/mm/flush.c          |  2 ++
 arch/mips/mm/dma-noncoherent.c |  2 ++
 drivers/gpu/drm/drm_cache.c    | 20 +++++++++++++++++---
 include/drm/drm_cache.h        |  4 ++++
 4 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index dc19300309d2..f0eb6320c979 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -93,3 +93,5 @@ void arch_invalidate_pmem(void *addr, size_t size)
 }
 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
 #endif
+
+EXPORT_SYMBOL_GPL(__flush_dcache_area);
diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c
index ed56c6fa7be2..bd5debe1b423 100644
--- a/arch/mips/mm/dma-noncoherent.c
+++ b/arch/mips/mm/dma-noncoherent.c
@@ -131,6 +131,7 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
 {
 	dma_sync_phys(paddr, size, dir);
 }
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_device);
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
 void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
@@ -139,6 +140,7 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
 	if (cpu_needs_post_dma_flush(dev))
 		dma_sync_phys(paddr, size, dir);
 }
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_cpu);
 #endif
 
 void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size,
diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 3bd76e918b5d..90105c637797 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -69,6 +69,14 @@ static void drm_cache_flush_clflush(struct page *pages[],
 }
 #endif
 
+#if defined(__powerpc__)
+static void __flush_dcache_area(void *addr, size_t len)
+{
+	flush_dcache_range((unsigned long)addr,
+			   (unsigned long)addr + len);
+}
+#endif
+
 /**
  * drm_clflush_pages - Flush dcache lines of a set of pages.
  * @pages: List of pages to be flushed.
@@ -90,7 +98,7 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages)
 	if (wbinvd_on_all_cpus())
 		pr_err("Timed out waiting for cache flush\n");
 
-#elif defined(__powerpc__)
+#elif defined(__powerpc__) || defined(CONFIG_ARM64)
 	unsigned long i;
 	for (i = 0; i < num_pages; i++) {
 		struct page *page = pages[i];
@@ -100,8 +108,7 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages)
 			continue;
 
 		page_virtual = kmap_atomic(page);
-		flush_dcache_range((unsigned long)page_virtual,
-				   (unsigned long)page_virtual + PAGE_SIZE);
+		__flush_dcache_area(page_virtual, PAGE_SIZE);
 		kunmap_atomic(page_virtual);
 	}
 #else
@@ -135,6 +142,13 @@ drm_clflush_sg(struct sg_table *st)
 
 	if (wbinvd_on_all_cpus())
 		pr_err("Timed out waiting for cache flush\n");
+#elif defined(CONFIG_ARM64)
+	struct sg_page_iter sg_iter;
+
+	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) {
+		struct page *p = sg_page_iter_page(&sg_iter);
+		drm_clflush_pages(&p, 1);
+	}
 #else
 	pr_err("Architecture has no drm_cache.c support\n");
 	WARN_ON_ONCE(1);
diff --git a/include/drm/drm_cache.h b/include/drm/drm_cache.h
index 987ff16b9420..f94e7bd3eca4 100644
--- a/include/drm/drm_cache.h
+++ b/include/drm/drm_cache.h
@@ -40,6 +40,10 @@ void drm_clflush_sg(struct sg_table *st);
 void drm_clflush_virt_range(void *addr, unsigned long length);
 bool drm_need_swiotlb(int dma_bits);
 
+#if defined(CONFIG_X86) || defined(__powerpc__) || defined(CONFIG_ARM64)
+#define HAS_DRM_CACHE 1
+#endif
+
 static inline bool drm_arch_can_wc_memory(void)
 {
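As a usage sketch, a caller could key off the new HAS_DRM_CACHE define and fall back to the arch_sync_dma API where drm_clflush_pages() would be a no-op (hypothetical example, not part of this patch; flush_new_pages() is an invented name):

#include <drm/drm_cache.h>
#include <linux/dma-noncoherent.h>

/* Flush freshly allocated pages before mapping them to the CPU as
 * uncached/writecombine.  HAS_DRM_CACHE advertises that
 * drm_clflush_pages() actually does something on this architecture. */
static void flush_new_pages(struct device *dev, struct page **pages,
		unsigned long npages)
{
#ifdef HAS_DRM_CACHE
	drm_clflush_pages(pages, npages);
#else
	unsigned long i;

	for (i = 0; i < npages; i++)
		arch_sync_dma_for_device(dev, page_to_phys(pages[i]),
				PAGE_SIZE, DMA_BIDIRECTIONAL);
#endif
}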
From: Rob Clark <robdclark@chromium.org>

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 arch/powerpc/mm/dma-noncoherent.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index c617282d5b2a..80d53b950821 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -401,12 +401,14 @@ void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
 {
 	__dma_sync_page(paddr, size, dir);
 }
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_device);
 
 void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
 		size_t size, enum dma_data_direction dir)
 {
 	__dma_sync_page(paddr, size, dir);
 }
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_cpu);
 
 /*
  * Return the PFN for a given cpu virtual address returned by arch_dma_alloc.
From: Rob Clark <robdclark@chromium.org>

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 arch/arm/Kconfig                |  2 ++
 arch/arm/mm/dma-mapping-nommu.c | 14 ++++++++++++++
 arch/arm/mm/dma-mapping.c       | 28 ++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 33b00579beff..a48a7263a2c1 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -18,6 +18,8 @@ config ARM
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
 	select ARCH_HAS_STRICT_MODULE_RWX if MMU
+	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
+	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_TEARDOWN_DMA_OPS if MMU
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAVE_CUSTOM_GPIO_H
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 52b82559d99b..4a3df952151f 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -84,6 +84,13 @@ static void __dma_page_cpu_to_dev(phys_addr_t paddr, size_t size,
 		outer_clean_range(paddr, paddr + size);
 }
 
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
+{
+	__dma_page_cpu_to_dev(paddr, size, dir);
+}
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_device);
+
 static void __dma_page_dev_to_cpu(phys_addr_t paddr, size_t size,
 				  enum dma_data_direction dir)
 {
@@ -93,6 +100,13 @@ static void __dma_page_dev_to_cpu(phys_addr_t paddr, size_t size,
 	}
 }
 
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
+{
+	__dma_page_dev_to_cpu(paddr, size, dir);
+}
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_cpu);
+
 static dma_addr_t arm_nommu_dma_map_page(struct device *dev,
 		struct page *page, unsigned long offset, size_t size,
 		enum dma_data_direction dir,
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 6774b03aa405..8ead93196194 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -979,6 +979,13 @@ static void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
 	/* FIXME: non-speculating: flush on bidirectional mappings? */
 }
 
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
+{
+	__dma_page_cpu_to_dev(phys_to_page(paddr), paddr % PAGE_SIZE, size, dir);
+}
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_device);
+
 static void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 		size_t size, enum dma_data_direction dir)
 {
@@ -1013,6 +1020,27 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 	}
 }
 
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
+{
+	__dma_page_dev_to_cpu(phys_to_page(paddr), paddr % PAGE_SIZE, size, dir);
+}
+EXPORT_SYMBOL_GPL(arch_sync_dma_for_cpu);
+
+/*
+ * arch_dma_{alloc,free} fail-stubs needed to avoid link-errors in dma/direct.c
+ * (which is not actually used on arch/arm)
+ */
+void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		gfp_t flags, unsigned long attrs)
+{
+	return NULL;
+}
+void arch_dma_free(struct device *dev, size_t size, void *vaddr,
+		dma_addr_t dma_handle, unsigned long attrs)
+{
+}
+
 /**
  * arm_dma_map_sg - map a set of SG buffers for streaming mode DMA
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
From: Rob Clark <robdclark@chromium.org>
Use arch_sync_dma_for_{device,cpu}() rather than abusing the DMA API to indirectly get at the arch_sync_dma code.
Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/msm_gem.c | 37 +++++++++++------------------------
 1 file changed, 11 insertions(+), 26 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 8cf6362e64bf..a2611e62df19 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -7,6 +7,7 @@
 #include <linux/spinlock.h>
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-noncoherent.h>
 #include <linux/pfn_t.h>
 
 #include "msm_drv.h"
@@ -32,43 +33,27 @@ static bool use_pages(struct drm_gem_object *obj)
 	return !msm_obj->vram_node;
 }
 
-/*
- * Cache sync.. this is a bit over-complicated, to fit dma-mapping
- * API.  Really GPU cache is out of scope here (handled on cmdstream)
- * and all we need to do is invalidate newly allocated pages before
- * mapping to CPU as uncached/writecombine.
- *
- * On top of this, we have the added headache, that depending on
- * display generation, the display's iommu may be wired up to either
- * the toplevel drm device (mdss), or to the mdp sub-node, meaning
- * that here we either have dma-direct or iommu ops.
- *
- * Let this be a cautionary tail of abstraction gone wrong.
- */
-
 static void sync_for_device(struct msm_gem_object *msm_obj)
 {
 	struct device *dev = msm_obj->base.dev->dev;
+	struct scatterlist *sg;
+	int i;
 
-	if (get_dma_ops(dev)) {
-		dma_sync_sg_for_device(dev, msm_obj->sgt->sgl,
-			msm_obj->sgt->nents, DMA_BIDIRECTIONAL);
-	} else {
-		dma_map_sg(dev, msm_obj->sgt->sgl,
-			msm_obj->sgt->nents, DMA_BIDIRECTIONAL);
+	for_each_sg(msm_obj->sgt->sgl, sg, msm_obj->sgt->nents, i) {
+		arch_sync_dma_for_device(dev, sg_phys(sg), sg->length,
+				DMA_BIDIRECTIONAL);
 	}
 }
 
 static void sync_for_cpu(struct msm_gem_object *msm_obj)
 {
 	struct device *dev = msm_obj->base.dev->dev;
+	struct scatterlist *sg;
+	int i;
 
-	if (get_dma_ops(dev)) {
-		dma_sync_sg_for_cpu(dev, msm_obj->sgt->sgl,
-			msm_obj->sgt->nents, DMA_BIDIRECTIONAL);
-	} else {
-		dma_unmap_sg(dev, msm_obj->sgt->sgl,
-			msm_obj->sgt->nents, DMA_BIDIRECTIONAL);
+	for_each_sg(msm_obj->sgt->sgl, sg, msm_obj->sgt->nents, i) {
+		arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length,
+				DMA_BIDIRECTIONAL);
 	}
 }
From: Rob Clark <robdclark@chromium.org>
drm_clflush_pages() is a no-op on arm/arm64, but we can use the arch_sync_dma API instead.
Fixes failures with vgem_test.
Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/vgem/vgem_drv.c | 145 +++++++++++++++++++++-----------
 1 file changed, 98 insertions(+), 47 deletions(-)
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 11a8f99ba18c..4493abdba134 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -34,6 +34,7 @@
 #include <linux/ramfs.h>
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
+#include <linux/dma-noncoherent.h>
 #include "vgem_drv.h"
 
 #define DRIVER_NAME	"vgem"
@@ -47,10 +48,16 @@ static struct vgem_device {
 	struct platform_device *platform;
 } *vgem_device;
 
+static void sync_and_unpin(struct drm_vgem_gem_object *bo);
+static struct page **pin_and_sync(struct drm_vgem_gem_object *bo);
+
 static void vgem_gem_free_object(struct drm_gem_object *obj)
 {
 	struct drm_vgem_gem_object *vgem_obj = to_vgem_bo(obj);
 
+	if (!obj->import_attach)
+		sync_and_unpin(vgem_obj);
+
 	kvfree(vgem_obj->pages);
 	mutex_destroy(&vgem_obj->pages_lock);
 
@@ -78,40 +85,15 @@ static vm_fault_t vgem_gem_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	mutex_lock(&obj->pages_lock);
+	if (!obj->pages)
+		pin_and_sync(obj);
 	if (obj->pages) {
 		get_page(obj->pages[page_offset]);
 		vmf->page = obj->pages[page_offset];
 		ret = 0;
 	}
 	mutex_unlock(&obj->pages_lock);
-	if (ret) {
-		struct page *page;
-
-		page = shmem_read_mapping_page(
-					file_inode(obj->base.filp)->i_mapping,
-					page_offset);
-		if (!IS_ERR(page)) {
-			vmf->page = page;
-			ret = 0;
-		} else switch (PTR_ERR(page)) {
-			case -ENOSPC:
-			case -ENOMEM:
-				ret = VM_FAULT_OOM;
-				break;
-			case -EBUSY:
-				ret = VM_FAULT_RETRY;
-				break;
-			case -EFAULT:
-			case -EINVAL:
-				ret = VM_FAULT_SIGBUS;
-				break;
-			default:
-				WARN_ON(PTR_ERR(page));
-				ret = VM_FAULT_SIGBUS;
-				break;
-		}
-	}
 	return ret;
 }
 
@@ -277,32 +259,107 @@ static const struct file_operations vgem_driver_fops = {
 	.release	= drm_release,
 };
 
-static struct page **vgem_pin_pages(struct drm_vgem_gem_object *bo)
+/* Called under pages_lock, except in free path (where it can't race): */
+static void sync_and_unpin(struct drm_vgem_gem_object *bo)
 {
-	mutex_lock(&bo->pages_lock);
-	if (bo->pages_pin_count++ == 0) {
-		struct page **pages;
-
-		pages = drm_gem_get_pages(&bo->base);
-		if (IS_ERR(pages)) {
-			bo->pages_pin_count--;
-			mutex_unlock(&bo->pages_lock);
-			return pages;
+	struct device *dev = bo->base.dev->dev;
+
+	if (bo->table) {
+		struct scatterlist *sg;
+		int i;
+
+		for_each_sg(bo->table->sgl, sg, bo->table->nents, i) {
+			arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length,
+					DMA_BIDIRECTIONAL);
 		}
 
-		bo->pages = pages;
+		sg_free_table(bo->table);
+		kfree(bo->table);
+		bo->table = NULL;
+	}
+
+	if (bo->pages) {
+		drm_gem_put_pages(&bo->base, bo->pages, true, true);
+		bo->pages = NULL;
+	}
+}
+
+static struct page **pin_and_sync(struct drm_vgem_gem_object *bo)
+{
+	struct device *dev = bo->base.dev->dev;
+	int npages = bo->base.size >> PAGE_SHIFT;
+	struct page **pages;
+	struct sg_table *sgt;
+	struct scatterlist *sg;
+	int i;
+
+	WARN_ON(!mutex_is_locked(&bo->pages_lock));
+
+	pages = drm_gem_get_pages(&bo->base);
+	if (IS_ERR(pages)) {
+		bo->pages_pin_count--;
+		mutex_unlock(&bo->pages_lock);
+		return pages;
+	}
+
+	sgt = drm_prime_pages_to_sg(pages, npages);
+	if (IS_ERR(sgt)) {
+		dev_err(dev, "failed to allocate sgt: %ld\n",
+				PTR_ERR(sgt));
+		drm_gem_put_pages(&bo->base, pages, false, false);
+		mutex_unlock(&bo->pages_lock);
+		return ERR_CAST(sgt);
+	}
+
+	/*
+	 * Flush the object from the CPU cache so that importers
+	 * can rely on coherent indirect access via the exported
+	 * dma-address.
+	 */
+	for_each_sg(sgt->sgl, sg, sgt->nents, i) {
+		arch_sync_dma_for_device(dev, sg_phys(sg), sg->length,
+				DMA_BIDIRECTIONAL);
+	}
+
+#if defined(CONFIG_X86)
+	/* x86 doesn't have arch_sync_dma_*() */
+	drm_clflush_pages(pages, npages);
+#endif
+
+	bo->pages = pages;
+	bo->table = sgt;
+
+	return pages;
+}
+
+static struct page **vgem_pin_pages(struct drm_vgem_gem_object *bo)
+{
+	struct page **pages;
+
+	mutex_lock(&bo->pages_lock);
+	if (bo->pages_pin_count++ == 0 && !bo->pages) {
+		pages = pin_and_sync(bo);
+	} else {
+		WARN_ON(!bo->pages);
+		pages = bo->pages;
 	}
 	mutex_unlock(&bo->pages_lock);
 
-	return bo->pages;
+	return pages;
 }
 
 static void vgem_unpin_pages(struct drm_vgem_gem_object *bo)
 {
+	/*
+	 * We shouldn't hit this for imported bo's.. in the import
+	 * case we don't own the scatter-table
+	 */
+	WARN_ON(bo->base.import_attach);
+
 	mutex_lock(&bo->pages_lock);
 	if (--bo->pages_pin_count == 0) {
-		drm_gem_put_pages(&bo->base, bo->pages, true, true);
-		bo->pages = NULL;
+		WARN_ON(!bo->table);
+		sync_and_unpin(bo);
 	}
 	mutex_unlock(&bo->pages_lock);
 }
@@ -310,18 +367,12 @@ static void vgem_unpin_pages(struct drm_vgem_gem_object *bo)
 static int vgem_prime_pin(struct drm_gem_object *obj)
 {
 	struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
-	long n_pages = obj->size >> PAGE_SHIFT;
 	struct page **pages;
 
 	pages = vgem_pin_pages(bo);
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
-	/* Flush the object from the CPU cache so that importers can rely
-	 * on coherent indirect access via the exported dma-address.
-	 */
-	drm_clflush_pages(pages, n_pages);
-
 	return 0;
 }
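For reference, the importer side of the contract that pin_and_sync() now satisfies looks roughly like this (hypothetical importer code, not from this patch):

#include <linux/dma-buf.h>
#include <linux/err.h>

static struct sg_table *import_vgem_buffer(struct dma_buf *buf,
		struct device *dev, struct dma_buf_attachment **attach)
{
	*attach = dma_buf_attach(buf, dev);
	if (IS_ERR(*attach))
		return ERR_CAST(*attach);

	/* The exporter has already flushed the pages to memory, so the
	 * importing device can DMA via the returned sg_table without
	 * further CPU cache maintenance on this side. */
	return dma_buf_map_attachment(*attach, DMA_BIDIRECTIONAL);
}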