tl;dr: DMA buffers aren't normal memory; expecting that you can use them like normal memory (e.g. that get_user_pages works, or that they're accounted like any other normal memory) cannot be guaranteed.
Since some userspace only runs on integrated devices, where all buffers are actually resident system memory, there's a huge temptation to assume that a struct page is always present and usable like for any other pagecache-backed mmap. This has the potential to result in a uapi nightmare.
To close this gap, require that DMA buffer mmaps are VM_PFNMAP, which blocks get_user_pages and all the other struct page based infrastructure for everyone. In spirit this is the uapi counterpart to the kernel-internal CONFIG_DMABUF_DEBUG.
Motivated by a recent patch which wanted to switch the system dma-buf heap to vm_insert_page instead of vm_insert_pfn.
v2:
Jason brought up that we also want to guarantee that all ptes have the pte_special flag set, to catch fast get_user_pages (on architectures that support this). Allowing VM_MIXEDMAP (like VM_SPECIAL does) would still allow vm_insert_page, but limiting to VM_PFNMAP will catch that.
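For illustration only (not part of the patch), here is a minimal sketch of a hypothetical exporter fault handler; struct example_buffer and its pfns[] array are made up, while vmf_insert_pfn() and vm_insert_page() are the kernel helpers discussed above:

/* Hypothetical exporter fault handler, for illustration only. */
static vm_fault_t example_dmabuf_fault(struct vm_fault *vmf)
{
	struct example_buffer *buf = vmf->vma->vm_private_data;
	pgoff_t off = (vmf->address - vmf->vma->vm_start) >> PAGE_SHIFT;

	/*
	 * vmf_insert_pfn() requires VM_PFNMAP and installs a pte_special
	 * entry (on architectures that support it), which both the slow
	 * and the fast get_user_pages paths refuse to touch.  By contrast
	 * vm_insert_page(), which VM_MIXEDMAP would permit, installs a
	 * normal refcounted pte that fast gup can still grab.
	 */
	return vmf_insert_pfn(vmf->vma, vmf->address, buf->pfns[off]);
}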
From auditing the various functions that insert pfn pte entries (vm_insert_pfn_prot, remap_pfn_range and all its callers like dma_mmap_wc), it looks like VM_PFNMAP is already required anyway, so this should be the correct flag to check for.
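Purely as illustration of that audit (the example_buffer structure is hypothetical), an exporter mmap callback built on remap_pfn_range() looks roughly like the sketch below; remap_pfn_range() itself forces VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP onto the vma, which is why such exporters already satisfy the new check:

/* Hypothetical dma-buf exporter mmap callback, for illustration only. */
static int example_dmabuf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
	struct example_buffer *buf = dmabuf->priv;

	/* remap_pfn_range() sets VM_PFNMAP on the vma as a side effect. */
	return remap_pfn_range(vma, vma->vm_start, buf->start_pfn,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}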
References: https://lore.kernel.org/lkml/CAKMK7uHi+mG0z0HUmNt13QCCvutuRVjpcR0NjRL12k-WbW...
Acked-by: Christian König christian.koenig@amd.com
Cc: Jason Gunthorpe jgg@ziepe.ca
Cc: Suren Baghdasaryan surenb@google.com
Cc: Matthew Wilcox willy@infradead.org
Cc: John Stultz john.stultz@linaro.org
Signed-off-by: Daniel Vetter daniel.vetter@intel.com
Cc: Sumit Semwal sumit.semwal@linaro.org
Cc: "Christian König" christian.koenig@amd.com
Cc: linux-media@vger.kernel.org
Cc: linaro-mm-sig@lists.linaro.org
---
 drivers/dma-buf/dma-buf.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index f264b70c383e..06cb1d2e9fdc 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -127,6 +127,7 @@ static struct file_system_type dma_buf_fs_type = {
 static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
 {
 	struct dma_buf *dmabuf;
+	int ret;

 	if (!is_dma_buf_file(file))
 		return -EINVAL;

@@ -142,7 +143,11 @@ static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
 	    dmabuf->size >> PAGE_SHIFT)
 		return -EINVAL;

-	return dmabuf->ops->mmap(dmabuf, vma);
+	ret = dmabuf->ops->mmap(dmabuf, vma);
+
+	WARN_ON(!(vma->vm_flags & VM_PFNMAP));
+
+	return ret;
 }

 static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence)

@@ -1244,6 +1249,8 @@ EXPORT_SYMBOL_GPL(dma_buf_end_cpu_access);
 int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
 		 unsigned long pgoff)
 {
+	int ret;
+
 	if (WARN_ON(!dmabuf || !vma))
 		return -EINVAL;

@@ -1264,7 +1271,11 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
 	vma_set_file(vma, dmabuf->file);
 	vma->vm_pgoff = pgoff;

-	return dmabuf->ops->mmap(dmabuf, vma);
+	ret = dmabuf->ops->mmap(dmabuf, vma);
+
+	WARN_ON(!(vma->vm_flags & VM_PFNMAP));
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(dma_buf_mmap);
Aside from deleting lots of code the real motivation here is to switch the mmap over to VM_PFNMAP, to be more consistent with what real gpu drivers do. They're all VM_PFNMAP, which means get_user_pages doesn't work, and even if you try and there's a struct page behind that, touching it and mucking around with its refcount can upset drivers real bad.
Cc: John Stultz john.stultz@linaro.org
Cc: Sumit Semwal sumit.semwal@linaro.org
Cc: "Christian König" christian.koenig@amd.com
Signed-off-by: Daniel Vetter daniel.vetter@intel.com
Cc: Melissa Wen melissa.srw@gmail.com
Cc: Chris Wilson chris@chris-wilson.co.uk
---
 drivers/gpu/drm/vgem/vgem_drv.c | 280 +-------------------------------
 1 file changed, 3 insertions(+), 277 deletions(-)
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index a0e75f1d5d01..88b3d125a610 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -40,6 +40,7 @@ #include <drm/drm_file.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> +#include <drm/drm_gem_shmem_helper.h> #include <drm/drm_prime.h>
#include "vgem_drv.h" @@ -50,27 +51,11 @@ #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0
-static const struct drm_gem_object_funcs vgem_gem_object_funcs; - static struct vgem_device { struct drm_device drm; struct platform_device *platform; } *vgem_device;
-static void vgem_gem_free_object(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *vgem_obj = to_vgem_bo(obj); - - kvfree(vgem_obj->pages); - mutex_destroy(&vgem_obj->pages_lock); - - if (obj->import_attach) - drm_prime_gem_destroy(obj, vgem_obj->table); - - drm_gem_object_release(obj); - kfree(vgem_obj); -} - static vm_fault_t vgem_gem_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -159,265 +144,12 @@ static void vgem_postclose(struct drm_device *dev, struct drm_file *file) kfree(vfile); }
-static struct drm_vgem_gem_object *__vgem_gem_create(struct drm_device *dev, - unsigned long size) -{ - struct drm_vgem_gem_object *obj; - int ret; - - obj = kzalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return ERR_PTR(-ENOMEM); - - obj->base.funcs = &vgem_gem_object_funcs; - - ret = drm_gem_object_init(dev, &obj->base, roundup(size, PAGE_SIZE)); - if (ret) { - kfree(obj); - return ERR_PTR(ret); - } - - mutex_init(&obj->pages_lock); - - return obj; -} - -static void __vgem_gem_destroy(struct drm_vgem_gem_object *obj) -{ - drm_gem_object_release(&obj->base); - kfree(obj); -} - -static struct drm_gem_object *vgem_gem_create(struct drm_device *dev, - struct drm_file *file, - unsigned int *handle, - unsigned long size) -{ - struct drm_vgem_gem_object *obj; - int ret; - - obj = __vgem_gem_create(dev, size); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - ret = drm_gem_handle_create(file, &obj->base, handle); - if (ret) { - drm_gem_object_put(&obj->base); - return ERR_PTR(ret); - } - - return &obj->base; -} - -static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev, - struct drm_mode_create_dumb *args) -{ - struct drm_gem_object *gem_object; - u64 pitch, size; - - pitch = args->width * DIV_ROUND_UP(args->bpp, 8); - size = args->height * pitch; - if (size == 0) - return -EINVAL; - - gem_object = vgem_gem_create(dev, file, &args->handle, size); - if (IS_ERR(gem_object)) - return PTR_ERR(gem_object); - - args->size = gem_object->size; - args->pitch = pitch; - - drm_gem_object_put(gem_object); - - DRM_DEBUG("Created object of size %llu\n", args->size); - - return 0; -} - static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW), };
-static int vgem_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long flags = vma->vm_flags; - int ret; - - ret = drm_gem_mmap(filp, vma); - if (ret) - return ret; - - /* Keep the WC mmaping set by drm_gem_mmap() but our pages - * are ordinary and not special. - */ - vma->vm_flags = flags | VM_DONTEXPAND | VM_DONTDUMP; - return 0; -} - -static const struct file_operations vgem_driver_fops = { - .owner = THIS_MODULE, - .open = drm_open, - .mmap = vgem_mmap, - .poll = drm_poll, - .read = drm_read, - .unlocked_ioctl = drm_ioctl, - .compat_ioctl = drm_compat_ioctl, - .release = drm_release, -}; - -static struct page **vgem_pin_pages(struct drm_vgem_gem_object *bo) -{ - mutex_lock(&bo->pages_lock); - if (bo->pages_pin_count++ == 0) { - struct page **pages; - - pages = drm_gem_get_pages(&bo->base); - if (IS_ERR(pages)) { - bo->pages_pin_count--; - mutex_unlock(&bo->pages_lock); - return pages; - } - - bo->pages = pages; - } - mutex_unlock(&bo->pages_lock); - - return bo->pages; -} - -static void vgem_unpin_pages(struct drm_vgem_gem_object *bo) -{ - mutex_lock(&bo->pages_lock); - if (--bo->pages_pin_count == 0) { - drm_gem_put_pages(&bo->base, bo->pages, true, true); - bo->pages = NULL; - } - mutex_unlock(&bo->pages_lock); -} - -static int vgem_prime_pin(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - long n_pages = obj->size >> PAGE_SHIFT; - struct page **pages; - - pages = vgem_pin_pages(bo); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - /* Flush the object from the CPU cache so that importers can rely - * on coherent indirect access via the exported dma-address. - */ - drm_clflush_pages(pages, n_pages); - - return 0; -} - -static void vgem_prime_unpin(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - - vgem_unpin_pages(bo); -} - -static struct sg_table *vgem_prime_get_sg_table(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - - return drm_prime_pages_to_sg(obj->dev, bo->pages, bo->base.size >> PAGE_SHIFT); -} - -static struct drm_gem_object* vgem_prime_import(struct drm_device *dev, - struct dma_buf *dma_buf) -{ - struct vgem_device *vgem = container_of(dev, typeof(*vgem), drm); - - return drm_gem_prime_import_dev(dev, dma_buf, &vgem->platform->dev); -} - -static struct drm_gem_object *vgem_prime_import_sg_table(struct drm_device *dev, - struct dma_buf_attachment *attach, struct sg_table *sg) -{ - struct drm_vgem_gem_object *obj; - int npages; - - obj = __vgem_gem_create(dev, attach->dmabuf->size); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - npages = PAGE_ALIGN(attach->dmabuf->size) / PAGE_SIZE; - - obj->table = sg; - obj->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (!obj->pages) { - __vgem_gem_destroy(obj); - return ERR_PTR(-ENOMEM); - } - - obj->pages_pin_count++; /* perma-pinned */ - drm_prime_sg_to_page_array(obj->table, obj->pages, npages); - return &obj->base; -} - -static int vgem_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - long n_pages = obj->size >> PAGE_SHIFT; - struct page **pages; - void *vaddr; - - pages = vgem_pin_pages(bo); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - vaddr = vmap(pages, n_pages, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!vaddr) - return -ENOMEM; - dma_buf_map_set_vaddr(map, vaddr); - - return 0; -} - -static void vgem_prime_vunmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{ - struct drm_vgem_gem_object *bo = 
to_vgem_bo(obj); - - vunmap(map->vaddr); - vgem_unpin_pages(bo); -} - -static int vgem_prime_mmap(struct drm_gem_object *obj, - struct vm_area_struct *vma) -{ - int ret; - - if (obj->size < vma->vm_end - vma->vm_start) - return -EINVAL; - - if (!obj->filp) - return -ENODEV; - - ret = call_mmap(obj->filp, vma); - if (ret) - return ret; - - vma_set_file(vma, obj->filp); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); - - return 0; -} - -static const struct drm_gem_object_funcs vgem_gem_object_funcs = { - .free = vgem_gem_free_object, - .pin = vgem_prime_pin, - .unpin = vgem_prime_unpin, - .get_sg_table = vgem_prime_get_sg_table, - .vmap = vgem_prime_vmap, - .vunmap = vgem_prime_vunmap, - .vm_ops = &vgem_gem_vm_ops, -}; +DEFINE_DRM_GEM_FOPS(vgem_driver_fops);
static const struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER, @@ -427,13 +159,7 @@ static const struct drm_driver vgem_driver = { .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops,
- .dumb_create = vgem_gem_dumb_create, - - .prime_handle_to_fd = drm_gem_prime_handle_to_fd, - .prime_fd_to_handle = drm_gem_prime_fd_to_handle, - .gem_prime_import = vgem_prime_import, - .gem_prime_import_sg_table = vgem_prime_import_sg_table, - .gem_prime_mmap = vgem_prime_mmap, + DRM_GEM_SHMEM_DRIVER_OPS,
.name = DRIVER_NAME, .desc = DRIVER_DESC,
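(Side note, not part of the patch text: DRM_GEM_SHMEM_DRIVER_OPS used above is the helper macro from <drm/drm_gem_shmem_helper.h>. At the time of this series it expands to roughly the following, which is why the hand-rolled dumb_create and prime callbacks can simply be dropped; see the header for the authoritative definition.)

#define DRM_GEM_SHMEM_DRIVER_OPS \
	.prime_handle_to_fd        = drm_gem_prime_handle_to_fd, \
	.prime_fd_to_handle        = drm_gem_prime_fd_to_handle, \
	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table, \
	.gem_prime_mmap            = drm_gem_prime_mmap, \
	.dumb_create               = drm_gem_shmem_dumb_create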
Hi
On 23.02.21 at 11:59, Daniel Vetter wrote:
Aside from deleting lots of code the real motivation here is to switch the mmap over to VM_PFNMAP, to be more consistent with what real gpu drivers do. They're all VM_PFNMAP, which means get_user_pages doesn't work, and even if you try and there's a struct page behind that, touching it and mucking around with its refcount can upset drivers real bad.
Cc: John Stultz john.stultz@linaro.org
Cc: Sumit Semwal sumit.semwal@linaro.org
Cc: "Christian König" christian.koenig@amd.com
Signed-off-by: Daniel Vetter daniel.vetter@intel.com
Cc: Melissa Wen melissa.srw@gmail.com
Cc: Chris Wilson chris@chris-wilson.co.uk

 drivers/gpu/drm/vgem/vgem_drv.c | 280 +-------------------------------
 1 file changed, 3 insertions(+), 277 deletions(-)
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index a0e75f1d5d01..88b3d125a610 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -40,6 +40,7 @@ #include <drm/drm_file.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> +#include <drm/drm_gem_shmem_helper.h>
This include should go between file.h and ioctl.h (keep the list sorted).
#include <drm/drm_prime.h>
#include "vgem_drv.h" @@ -50,27 +51,11 @@ #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0
-static const struct drm_gem_object_funcs vgem_gem_object_funcs;
- static struct vgem_device { struct drm_device drm; struct platform_device *platform; } *vgem_device;
-static void vgem_gem_free_object(struct drm_gem_object *obj) -{
- struct drm_vgem_gem_object *vgem_obj = to_vgem_bo(obj);
- kvfree(vgem_obj->pages);
- mutex_destroy(&vgem_obj->pages_lock);
- if (obj->import_attach)
drm_prime_gem_destroy(obj, vgem_obj->table);
- drm_gem_object_release(obj);
- kfree(vgem_obj);
-}
- static vm_fault_t vgem_gem_fault(struct vm_fault *vmf)
From a quick grep it looks like you should be able to remove this function and vgem_gem_vm_ops as well.
The rest of the patch looks good to me.
Acked-by: Thomas Zimmermann tzimmermann@suse.de
{ struct vm_area_struct *vma = vmf->vma; @@ -159,265 +144,12 @@ static void vgem_postclose(struct drm_device *dev, struct drm_file *file) kfree(vfile); }
-static struct drm_vgem_gem_object *__vgem_gem_create(struct drm_device *dev,
unsigned long size)
-{
- struct drm_vgem_gem_object *obj;
- int ret;
- obj = kzalloc(sizeof(*obj), GFP_KERNEL);
- if (!obj)
return ERR_PTR(-ENOMEM);
- obj->base.funcs = &vgem_gem_object_funcs;
- ret = drm_gem_object_init(dev, &obj->base, roundup(size, PAGE_SIZE));
- if (ret) {
kfree(obj);
return ERR_PTR(ret);
- }
- mutex_init(&obj->pages_lock);
- return obj;
-}
-static void __vgem_gem_destroy(struct drm_vgem_gem_object *obj) -{
- drm_gem_object_release(&obj->base);
- kfree(obj);
-}
-static struct drm_gem_object *vgem_gem_create(struct drm_device *dev,
struct drm_file *file,
unsigned int *handle,
unsigned long size)
-{
- struct drm_vgem_gem_object *obj;
- int ret;
- obj = __vgem_gem_create(dev, size);
- if (IS_ERR(obj))
return ERR_CAST(obj);
- ret = drm_gem_handle_create(file, &obj->base, handle);
- if (ret) {
drm_gem_object_put(&obj->base);
return ERR_PTR(ret);
- }
- return &obj->base;
-}
-static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
struct drm_mode_create_dumb *args)
-{
- struct drm_gem_object *gem_object;
- u64 pitch, size;
- pitch = args->width * DIV_ROUND_UP(args->bpp, 8);
- size = args->height * pitch;
- if (size == 0)
return -EINVAL;
- gem_object = vgem_gem_create(dev, file, &args->handle, size);
- if (IS_ERR(gem_object))
return PTR_ERR(gem_object);
- args->size = gem_object->size;
- args->pitch = pitch;
- drm_gem_object_put(gem_object);
- DRM_DEBUG("Created object of size %llu\n", args->size);
- return 0;
-}
- static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW), };
-static int vgem_mmap(struct file *filp, struct vm_area_struct *vma) -{
- unsigned long flags = vma->vm_flags;
- int ret;
- ret = drm_gem_mmap(filp, vma);
- if (ret)
return ret;
- /* Keep the WC mmaping set by drm_gem_mmap() but our pages
* are ordinary and not special.
*/
- vma->vm_flags = flags | VM_DONTEXPAND | VM_DONTDUMP;
- return 0;
-}
-static const struct file_operations vgem_driver_fops = {
- .owner = THIS_MODULE,
- .open = drm_open,
- .mmap = vgem_mmap,
- .poll = drm_poll,
- .read = drm_read,
- .unlocked_ioctl = drm_ioctl,
- .compat_ioctl = drm_compat_ioctl,
- .release = drm_release,
-};
-static struct page **vgem_pin_pages(struct drm_vgem_gem_object *bo) -{
- mutex_lock(&bo->pages_lock);
- if (bo->pages_pin_count++ == 0) {
struct page **pages;
pages = drm_gem_get_pages(&bo->base);
if (IS_ERR(pages)) {
bo->pages_pin_count--;
mutex_unlock(&bo->pages_lock);
return pages;
}
bo->pages = pages;
- }
- mutex_unlock(&bo->pages_lock);
- return bo->pages;
-}
-static void vgem_unpin_pages(struct drm_vgem_gem_object *bo) -{
- mutex_lock(&bo->pages_lock);
- if (--bo->pages_pin_count == 0) {
drm_gem_put_pages(&bo->base, bo->pages, true, true);
bo->pages = NULL;
- }
- mutex_unlock(&bo->pages_lock);
-}
-static int vgem_prime_pin(struct drm_gem_object *obj) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- long n_pages = obj->size >> PAGE_SHIFT;
- struct page **pages;
- pages = vgem_pin_pages(bo);
- if (IS_ERR(pages))
return PTR_ERR(pages);
- /* Flush the object from the CPU cache so that importers can rely
* on coherent indirect access via the exported dma-address.
*/
- drm_clflush_pages(pages, n_pages);
- return 0;
-}
-static void vgem_prime_unpin(struct drm_gem_object *obj) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- vgem_unpin_pages(bo);
-}
-static struct sg_table *vgem_prime_get_sg_table(struct drm_gem_object *obj) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- return drm_prime_pages_to_sg(obj->dev, bo->pages, bo->base.size >> PAGE_SHIFT);
-}
-static struct drm_gem_object* vgem_prime_import(struct drm_device *dev,
struct dma_buf *dma_buf)
-{
- struct vgem_device *vgem = container_of(dev, typeof(*vgem), drm);
- return drm_gem_prime_import_dev(dev, dma_buf, &vgem->platform->dev);
-}
-static struct drm_gem_object *vgem_prime_import_sg_table(struct drm_device *dev,
struct dma_buf_attachment *attach, struct sg_table *sg)
-{
- struct drm_vgem_gem_object *obj;
- int npages;
- obj = __vgem_gem_create(dev, attach->dmabuf->size);
- if (IS_ERR(obj))
return ERR_CAST(obj);
- npages = PAGE_ALIGN(attach->dmabuf->size) / PAGE_SIZE;
- obj->table = sg;
- obj->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
- if (!obj->pages) {
__vgem_gem_destroy(obj);
return ERR_PTR(-ENOMEM);
- }
- obj->pages_pin_count++; /* perma-pinned */
- drm_prime_sg_to_page_array(obj->table, obj->pages, npages);
- return &obj->base;
-}
-static int vgem_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- long n_pages = obj->size >> PAGE_SHIFT;
- struct page **pages;
- void *vaddr;
- pages = vgem_pin_pages(bo);
- if (IS_ERR(pages))
return PTR_ERR(pages);
- vaddr = vmap(pages, n_pages, 0, pgprot_writecombine(PAGE_KERNEL));
- if (!vaddr)
return -ENOMEM;
- dma_buf_map_set_vaddr(map, vaddr);
- return 0;
-}
-static void vgem_prime_vunmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- vunmap(map->vaddr);
- vgem_unpin_pages(bo);
-}
-static int vgem_prime_mmap(struct drm_gem_object *obj,
struct vm_area_struct *vma)
-{
- int ret;
- if (obj->size < vma->vm_end - vma->vm_start)
return -EINVAL;
- if (!obj->filp)
return -ENODEV;
- ret = call_mmap(obj->filp, vma);
- if (ret)
return ret;
- vma_set_file(vma, obj->filp);
- vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
- return 0;
-}
-static const struct drm_gem_object_funcs vgem_gem_object_funcs = {
- .free = vgem_gem_free_object,
- .pin = vgem_prime_pin,
- .unpin = vgem_prime_unpin,
- .get_sg_table = vgem_prime_get_sg_table,
- .vmap = vgem_prime_vmap,
- .vunmap = vgem_prime_vunmap,
- .vm_ops = &vgem_gem_vm_ops,
-}; +DEFINE_DRM_GEM_FOPS(vgem_driver_fops);
static const struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER, @@ -427,13 +159,7 @@ static const struct drm_driver vgem_driver = { .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops,
- .dumb_create = vgem_gem_dumb_create,
- .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
- .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
- .gem_prime_import = vgem_prime_import,
- .gem_prime_import_sg_table = vgem_prime_import_sg_table,
- .gem_prime_mmap = vgem_prime_mmap,
DRM_GEM_SHMEM_DRIVER_OPS,
.name = DRIVER_NAME, .desc = DRIVER_DESC,
Aside from deleting lots of code the real motivation here is to switch the mmap over to VM_PFNMAP, to be more consistent with what real gpu drivers do. They're all VM_PFNMAP, which means get_user_pages doesn't work, and even if you try and there's a struct page behind that, touching it and mucking around with its refcount can upset drivers real bad.
v2: Review from Thomas:
- sort #include
- drop more dead code that I didn't spot somehow
Cc: Thomas Zimmermann tzimmermann@suse.de
Acked-by: Thomas Zimmermann tzimmermann@suse.de
Cc: John Stultz john.stultz@linaro.org
Cc: Sumit Semwal sumit.semwal@linaro.org
Cc: "Christian König" christian.koenig@amd.com
Signed-off-by: Daniel Vetter daniel.vetter@intel.com
Cc: Melissa Wen melissa.srw@gmail.com
Cc: Chris Wilson chris@chris-wilson.co.uk
---
 drivers/gpu/drm/vgem/vgem_drv.c | 340 +-------------------------------
 1 file changed, 3 insertions(+), 337 deletions(-)
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index a0e75f1d5d01..b1b3a5ffc542 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -38,6 +38,7 @@
#include <drm/drm_drv.h> #include <drm/drm_file.h> +#include <drm/drm_gem_shmem_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_prime.h> @@ -50,87 +51,11 @@ #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0
-static const struct drm_gem_object_funcs vgem_gem_object_funcs; - static struct vgem_device { struct drm_device drm; struct platform_device *platform; } *vgem_device;
-static void vgem_gem_free_object(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *vgem_obj = to_vgem_bo(obj); - - kvfree(vgem_obj->pages); - mutex_destroy(&vgem_obj->pages_lock); - - if (obj->import_attach) - drm_prime_gem_destroy(obj, vgem_obj->table); - - drm_gem_object_release(obj); - kfree(vgem_obj); -} - -static vm_fault_t vgem_gem_fault(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - struct drm_vgem_gem_object *obj = vma->vm_private_data; - /* We don't use vmf->pgoff since that has the fake offset */ - unsigned long vaddr = vmf->address; - vm_fault_t ret = VM_FAULT_SIGBUS; - loff_t num_pages; - pgoff_t page_offset; - page_offset = (vaddr - vma->vm_start) >> PAGE_SHIFT; - - num_pages = DIV_ROUND_UP(obj->base.size, PAGE_SIZE); - - if (page_offset >= num_pages) - return VM_FAULT_SIGBUS; - - mutex_lock(&obj->pages_lock); - if (obj->pages) { - get_page(obj->pages[page_offset]); - vmf->page = obj->pages[page_offset]; - ret = 0; - } - mutex_unlock(&obj->pages_lock); - if (ret) { - struct page *page; - - page = shmem_read_mapping_page( - file_inode(obj->base.filp)->i_mapping, - page_offset); - if (!IS_ERR(page)) { - vmf->page = page; - ret = 0; - } else switch (PTR_ERR(page)) { - case -ENOSPC: - case -ENOMEM: - ret = VM_FAULT_OOM; - break; - case -EBUSY: - ret = VM_FAULT_RETRY; - break; - case -EFAULT: - case -EINVAL: - ret = VM_FAULT_SIGBUS; - break; - default: - WARN_ON(PTR_ERR(page)); - ret = VM_FAULT_SIGBUS; - break; - } - - } - return ret; -} - -static const struct vm_operations_struct vgem_gem_vm_ops = { - .fault = vgem_gem_fault, - .open = drm_gem_vm_open, - .close = drm_gem_vm_close, -}; - static int vgem_open(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile; @@ -159,265 +84,12 @@ static void vgem_postclose(struct drm_device *dev, struct drm_file *file) kfree(vfile); }
-static struct drm_vgem_gem_object *__vgem_gem_create(struct drm_device *dev, - unsigned long size) -{ - struct drm_vgem_gem_object *obj; - int ret; - - obj = kzalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return ERR_PTR(-ENOMEM); - - obj->base.funcs = &vgem_gem_object_funcs; - - ret = drm_gem_object_init(dev, &obj->base, roundup(size, PAGE_SIZE)); - if (ret) { - kfree(obj); - return ERR_PTR(ret); - } - - mutex_init(&obj->pages_lock); - - return obj; -} - -static void __vgem_gem_destroy(struct drm_vgem_gem_object *obj) -{ - drm_gem_object_release(&obj->base); - kfree(obj); -} - -static struct drm_gem_object *vgem_gem_create(struct drm_device *dev, - struct drm_file *file, - unsigned int *handle, - unsigned long size) -{ - struct drm_vgem_gem_object *obj; - int ret; - - obj = __vgem_gem_create(dev, size); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - ret = drm_gem_handle_create(file, &obj->base, handle); - if (ret) { - drm_gem_object_put(&obj->base); - return ERR_PTR(ret); - } - - return &obj->base; -} - -static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev, - struct drm_mode_create_dumb *args) -{ - struct drm_gem_object *gem_object; - u64 pitch, size; - - pitch = args->width * DIV_ROUND_UP(args->bpp, 8); - size = args->height * pitch; - if (size == 0) - return -EINVAL; - - gem_object = vgem_gem_create(dev, file, &args->handle, size); - if (IS_ERR(gem_object)) - return PTR_ERR(gem_object); - - args->size = gem_object->size; - args->pitch = pitch; - - drm_gem_object_put(gem_object); - - DRM_DEBUG("Created object of size %llu\n", args->size); - - return 0; -} - static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW), };
-static int vgem_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long flags = vma->vm_flags; - int ret; - - ret = drm_gem_mmap(filp, vma); - if (ret) - return ret; - - /* Keep the WC mmaping set by drm_gem_mmap() but our pages - * are ordinary and not special. - */ - vma->vm_flags = flags | VM_DONTEXPAND | VM_DONTDUMP; - return 0; -} - -static const struct file_operations vgem_driver_fops = { - .owner = THIS_MODULE, - .open = drm_open, - .mmap = vgem_mmap, - .poll = drm_poll, - .read = drm_read, - .unlocked_ioctl = drm_ioctl, - .compat_ioctl = drm_compat_ioctl, - .release = drm_release, -}; - -static struct page **vgem_pin_pages(struct drm_vgem_gem_object *bo) -{ - mutex_lock(&bo->pages_lock); - if (bo->pages_pin_count++ == 0) { - struct page **pages; - - pages = drm_gem_get_pages(&bo->base); - if (IS_ERR(pages)) { - bo->pages_pin_count--; - mutex_unlock(&bo->pages_lock); - return pages; - } - - bo->pages = pages; - } - mutex_unlock(&bo->pages_lock); - - return bo->pages; -} - -static void vgem_unpin_pages(struct drm_vgem_gem_object *bo) -{ - mutex_lock(&bo->pages_lock); - if (--bo->pages_pin_count == 0) { - drm_gem_put_pages(&bo->base, bo->pages, true, true); - bo->pages = NULL; - } - mutex_unlock(&bo->pages_lock); -} - -static int vgem_prime_pin(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - long n_pages = obj->size >> PAGE_SHIFT; - struct page **pages; - - pages = vgem_pin_pages(bo); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - /* Flush the object from the CPU cache so that importers can rely - * on coherent indirect access via the exported dma-address. - */ - drm_clflush_pages(pages, n_pages); - - return 0; -} - -static void vgem_prime_unpin(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - - vgem_unpin_pages(bo); -} - -static struct sg_table *vgem_prime_get_sg_table(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - - return drm_prime_pages_to_sg(obj->dev, bo->pages, bo->base.size >> PAGE_SHIFT); -} - -static struct drm_gem_object* vgem_prime_import(struct drm_device *dev, - struct dma_buf *dma_buf) -{ - struct vgem_device *vgem = container_of(dev, typeof(*vgem), drm); - - return drm_gem_prime_import_dev(dev, dma_buf, &vgem->platform->dev); -} - -static struct drm_gem_object *vgem_prime_import_sg_table(struct drm_device *dev, - struct dma_buf_attachment *attach, struct sg_table *sg) -{ - struct drm_vgem_gem_object *obj; - int npages; - - obj = __vgem_gem_create(dev, attach->dmabuf->size); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - npages = PAGE_ALIGN(attach->dmabuf->size) / PAGE_SIZE; - - obj->table = sg; - obj->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (!obj->pages) { - __vgem_gem_destroy(obj); - return ERR_PTR(-ENOMEM); - } - - obj->pages_pin_count++; /* perma-pinned */ - drm_prime_sg_to_page_array(obj->table, obj->pages, npages); - return &obj->base; -} - -static int vgem_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - long n_pages = obj->size >> PAGE_SHIFT; - struct page **pages; - void *vaddr; - - pages = vgem_pin_pages(bo); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - vaddr = vmap(pages, n_pages, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!vaddr) - return -ENOMEM; - dma_buf_map_set_vaddr(map, vaddr); - - return 0; -} - -static void vgem_prime_vunmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{ - struct drm_vgem_gem_object *bo = 
to_vgem_bo(obj); - - vunmap(map->vaddr); - vgem_unpin_pages(bo); -} - -static int vgem_prime_mmap(struct drm_gem_object *obj, - struct vm_area_struct *vma) -{ - int ret; - - if (obj->size < vma->vm_end - vma->vm_start) - return -EINVAL; - - if (!obj->filp) - return -ENODEV; - - ret = call_mmap(obj->filp, vma); - if (ret) - return ret; - - vma_set_file(vma, obj->filp); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); - - return 0; -} - -static const struct drm_gem_object_funcs vgem_gem_object_funcs = { - .free = vgem_gem_free_object, - .pin = vgem_prime_pin, - .unpin = vgem_prime_unpin, - .get_sg_table = vgem_prime_get_sg_table, - .vmap = vgem_prime_vmap, - .vunmap = vgem_prime_vunmap, - .vm_ops = &vgem_gem_vm_ops, -}; +DEFINE_DRM_GEM_FOPS(vgem_driver_fops);
static const struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER, @@ -427,13 +99,7 @@ static const struct drm_driver vgem_driver = { .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops,
- .dumb_create = vgem_gem_dumb_create, - - .prime_handle_to_fd = drm_gem_prime_handle_to_fd, - .prime_fd_to_handle = drm_gem_prime_fd_to_handle, - .gem_prime_import = vgem_prime_import, - .gem_prime_import_sg_table = vgem_prime_import_sg_table, - .gem_prime_mmap = vgem_prime_mmap, + DRM_GEM_SHMEM_DRIVER_OPS,
.name = DRIVER_NAME, .desc = DRIVER_DESC,
Hi Daniel,
I love your patch! Yet something to improve:
[auto build test ERROR on drm-intel/for-linux-next]
[also build test ERROR on drm-tip/drm-tip linus/master next-20210223]
[cannot apply to drm/drm-next v5.11]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url:    https://github.com/0day-ci/linux/commits/Daniel-Vetter/dma-buf-Require-VM_PF...
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: openrisc-randconfig-r026-20210223 (attached as .config)
compiler: or1k-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/5c544c63e333016d58d3e6f4802093906ef5...
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Daniel-Vetter/dma-buf-Require-VM_PFNMAP-vma-for-mmap/20210223-190209
        git checkout 5c544c63e333016d58d3e6f4802093906ef5456e
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=openrisc
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot lkp@intel.com
All errors (new ones prefixed by >>):
or1k-linux-ld: arch/openrisc/kernel/entry.o: in function `_external_irq_handler':
(.text+0x83c): undefined reference to `printk'
(.text+0x83c): relocation truncated to fit: R_OR1K_INSN_REL_26 against undefined symbol `printk'
or1k-linux-ld: drivers/gpu/drm/vgem/vgem_drv.o:(.rodata+0x44): undefined reference to `drm_gem_shmem_prime_import_sg_table'
or1k-linux-ld: drivers/gpu/drm/vgem/vgem_drv.o:(.rodata+0x4c): undefined reference to `drm_gem_shmem_dumb_create'
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Hi Daniel,
I love your patch! Yet something to improve:
[auto build test ERROR on drm-intel/for-linux-next]
[also build test ERROR on drm-tip/drm-tip linus/master next-20210223]
[cannot apply to tegra-drm/drm/tegra/for-next drm-exynos/exynos-drm-next drm/drm-next v5.11]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url:    https://github.com/0day-ci/linux/commits/Daniel-Vetter/dma-buf-Require-VM_PF...
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: microblaze-randconfig-r013-20210223 (attached as .config)
compiler: microblaze-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/5c544c63e333016d58d3e6f4802093906ef5...
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Daniel-Vetter/dma-buf-Require-VM_PFNMAP-vma-for-mmap/20210223-190209
        git checkout 5c544c63e333016d58d3e6f4802093906ef5456e
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=microblaze
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot lkp@intel.com
All errors (new ones prefixed by >>, old ones prefixed by <<):
ERROR: modpost: "drm_gem_shmem_prime_import_sg_table" [drivers/gpu/drm/vgem/vgem.ko] undefined!
ERROR: modpost: "drm_gem_shmem_dumb_create" [drivers/gpu/drm/vgem/vgem.ko] undefined!
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Aside from deleting lots of code the real motivation here is to switch the mmap over to VM_PFNMAP, to be more consistent with what real gpu drivers do. They're all VM_PFNMAP, which means get_user_pages doesn't work, and even if you try and there's a struct page behind that, touching it and mucking around with its refcount can upset drivers real bad.
v2: Review from Thomas:
- sort #include
- drop more dead code that I didn't spot somehow
v3: select DRM_GEM_SHMEM_HELPER to make it build (intel-gfx-ci)
Cc: Thomas Zimmermann tzimmermann@suse.de
Acked-by: Thomas Zimmermann tzimmermann@suse.de
Cc: John Stultz john.stultz@linaro.org
Cc: Sumit Semwal sumit.semwal@linaro.org
Cc: "Christian König" christian.koenig@amd.com
Signed-off-by: Daniel Vetter daniel.vetter@intel.com
Cc: Melissa Wen melissa.srw@gmail.com
Cc: Chris Wilson chris@chris-wilson.co.uk
---
 drivers/gpu/drm/Kconfig         |   1 +
 drivers/gpu/drm/vgem/vgem_drv.c | 340 +-------------------------------
 2 files changed, 4 insertions(+), 337 deletions(-)
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 8e73311de583..94e4ac830283 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -274,6 +274,7 @@ source "drivers/gpu/drm/kmb/Kconfig"
 config DRM_VGEM
 	tristate "Virtual GEM provider"
 	depends on DRM
+	select DRM_GEM_SHMEM_HELPER
 	help
 	  Choose this option to get a virtual graphics memory manager,
 	  as used by Mesa's software renderer for enhanced performance.
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index a0e75f1d5d01..b1b3a5ffc542 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -38,6 +38,7 @@
 #include <drm/drm_drv.h>
 #include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
 #include <drm/drm_ioctl.h>
 #include <drm/drm_managed.h>
 #include <drm/drm_prime.h>
@@ -50,87 +51,11 @@
 #define DRIVER_MAJOR 1
 #define DRIVER_MINOR 0
-static const struct drm_gem_object_funcs vgem_gem_object_funcs; - static struct vgem_device { struct drm_device drm; struct platform_device *platform; } *vgem_device;
-static void vgem_gem_free_object(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *vgem_obj = to_vgem_bo(obj); - - kvfree(vgem_obj->pages); - mutex_destroy(&vgem_obj->pages_lock); - - if (obj->import_attach) - drm_prime_gem_destroy(obj, vgem_obj->table); - - drm_gem_object_release(obj); - kfree(vgem_obj); -} - -static vm_fault_t vgem_gem_fault(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - struct drm_vgem_gem_object *obj = vma->vm_private_data; - /* We don't use vmf->pgoff since that has the fake offset */ - unsigned long vaddr = vmf->address; - vm_fault_t ret = VM_FAULT_SIGBUS; - loff_t num_pages; - pgoff_t page_offset; - page_offset = (vaddr - vma->vm_start) >> PAGE_SHIFT; - - num_pages = DIV_ROUND_UP(obj->base.size, PAGE_SIZE); - - if (page_offset >= num_pages) - return VM_FAULT_SIGBUS; - - mutex_lock(&obj->pages_lock); - if (obj->pages) { - get_page(obj->pages[page_offset]); - vmf->page = obj->pages[page_offset]; - ret = 0; - } - mutex_unlock(&obj->pages_lock); - if (ret) { - struct page *page; - - page = shmem_read_mapping_page( - file_inode(obj->base.filp)->i_mapping, - page_offset); - if (!IS_ERR(page)) { - vmf->page = page; - ret = 0; - } else switch (PTR_ERR(page)) { - case -ENOSPC: - case -ENOMEM: - ret = VM_FAULT_OOM; - break; - case -EBUSY: - ret = VM_FAULT_RETRY; - break; - case -EFAULT: - case -EINVAL: - ret = VM_FAULT_SIGBUS; - break; - default: - WARN_ON(PTR_ERR(page)); - ret = VM_FAULT_SIGBUS; - break; - } - - } - return ret; -} - -static const struct vm_operations_struct vgem_gem_vm_ops = { - .fault = vgem_gem_fault, - .open = drm_gem_vm_open, - .close = drm_gem_vm_close, -}; - static int vgem_open(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile; @@ -159,265 +84,12 @@ static void vgem_postclose(struct drm_device *dev, struct drm_file *file) kfree(vfile); }
-static struct drm_vgem_gem_object *__vgem_gem_create(struct drm_device *dev, - unsigned long size) -{ - struct drm_vgem_gem_object *obj; - int ret; - - obj = kzalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return ERR_PTR(-ENOMEM); - - obj->base.funcs = &vgem_gem_object_funcs; - - ret = drm_gem_object_init(dev, &obj->base, roundup(size, PAGE_SIZE)); - if (ret) { - kfree(obj); - return ERR_PTR(ret); - } - - mutex_init(&obj->pages_lock); - - return obj; -} - -static void __vgem_gem_destroy(struct drm_vgem_gem_object *obj) -{ - drm_gem_object_release(&obj->base); - kfree(obj); -} - -static struct drm_gem_object *vgem_gem_create(struct drm_device *dev, - struct drm_file *file, - unsigned int *handle, - unsigned long size) -{ - struct drm_vgem_gem_object *obj; - int ret; - - obj = __vgem_gem_create(dev, size); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - ret = drm_gem_handle_create(file, &obj->base, handle); - if (ret) { - drm_gem_object_put(&obj->base); - return ERR_PTR(ret); - } - - return &obj->base; -} - -static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev, - struct drm_mode_create_dumb *args) -{ - struct drm_gem_object *gem_object; - u64 pitch, size; - - pitch = args->width * DIV_ROUND_UP(args->bpp, 8); - size = args->height * pitch; - if (size == 0) - return -EINVAL; - - gem_object = vgem_gem_create(dev, file, &args->handle, size); - if (IS_ERR(gem_object)) - return PTR_ERR(gem_object); - - args->size = gem_object->size; - args->pitch = pitch; - - drm_gem_object_put(gem_object); - - DRM_DEBUG("Created object of size %llu\n", args->size); - - return 0; -} - static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW), };
-static int vgem_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long flags = vma->vm_flags; - int ret; - - ret = drm_gem_mmap(filp, vma); - if (ret) - return ret; - - /* Keep the WC mmaping set by drm_gem_mmap() but our pages - * are ordinary and not special. - */ - vma->vm_flags = flags | VM_DONTEXPAND | VM_DONTDUMP; - return 0; -} - -static const struct file_operations vgem_driver_fops = { - .owner = THIS_MODULE, - .open = drm_open, - .mmap = vgem_mmap, - .poll = drm_poll, - .read = drm_read, - .unlocked_ioctl = drm_ioctl, - .compat_ioctl = drm_compat_ioctl, - .release = drm_release, -}; - -static struct page **vgem_pin_pages(struct drm_vgem_gem_object *bo) -{ - mutex_lock(&bo->pages_lock); - if (bo->pages_pin_count++ == 0) { - struct page **pages; - - pages = drm_gem_get_pages(&bo->base); - if (IS_ERR(pages)) { - bo->pages_pin_count--; - mutex_unlock(&bo->pages_lock); - return pages; - } - - bo->pages = pages; - } - mutex_unlock(&bo->pages_lock); - - return bo->pages; -} - -static void vgem_unpin_pages(struct drm_vgem_gem_object *bo) -{ - mutex_lock(&bo->pages_lock); - if (--bo->pages_pin_count == 0) { - drm_gem_put_pages(&bo->base, bo->pages, true, true); - bo->pages = NULL; - } - mutex_unlock(&bo->pages_lock); -} - -static int vgem_prime_pin(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - long n_pages = obj->size >> PAGE_SHIFT; - struct page **pages; - - pages = vgem_pin_pages(bo); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - /* Flush the object from the CPU cache so that importers can rely - * on coherent indirect access via the exported dma-address. - */ - drm_clflush_pages(pages, n_pages); - - return 0; -} - -static void vgem_prime_unpin(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - - vgem_unpin_pages(bo); -} - -static struct sg_table *vgem_prime_get_sg_table(struct drm_gem_object *obj) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - - return drm_prime_pages_to_sg(obj->dev, bo->pages, bo->base.size >> PAGE_SHIFT); -} - -static struct drm_gem_object* vgem_prime_import(struct drm_device *dev, - struct dma_buf *dma_buf) -{ - struct vgem_device *vgem = container_of(dev, typeof(*vgem), drm); - - return drm_gem_prime_import_dev(dev, dma_buf, &vgem->platform->dev); -} - -static struct drm_gem_object *vgem_prime_import_sg_table(struct drm_device *dev, - struct dma_buf_attachment *attach, struct sg_table *sg) -{ - struct drm_vgem_gem_object *obj; - int npages; - - obj = __vgem_gem_create(dev, attach->dmabuf->size); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - npages = PAGE_ALIGN(attach->dmabuf->size) / PAGE_SIZE; - - obj->table = sg; - obj->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (!obj->pages) { - __vgem_gem_destroy(obj); - return ERR_PTR(-ENOMEM); - } - - obj->pages_pin_count++; /* perma-pinned */ - drm_prime_sg_to_page_array(obj->table, obj->pages, npages); - return &obj->base; -} - -static int vgem_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{ - struct drm_vgem_gem_object *bo = to_vgem_bo(obj); - long n_pages = obj->size >> PAGE_SHIFT; - struct page **pages; - void *vaddr; - - pages = vgem_pin_pages(bo); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - vaddr = vmap(pages, n_pages, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!vaddr) - return -ENOMEM; - dma_buf_map_set_vaddr(map, vaddr); - - return 0; -} - -static void vgem_prime_vunmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{ - struct drm_vgem_gem_object *bo = 
to_vgem_bo(obj); - - vunmap(map->vaddr); - vgem_unpin_pages(bo); -} - -static int vgem_prime_mmap(struct drm_gem_object *obj, - struct vm_area_struct *vma) -{ - int ret; - - if (obj->size < vma->vm_end - vma->vm_start) - return -EINVAL; - - if (!obj->filp) - return -ENODEV; - - ret = call_mmap(obj->filp, vma); - if (ret) - return ret; - - vma_set_file(vma, obj->filp); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); - - return 0; -} - -static const struct drm_gem_object_funcs vgem_gem_object_funcs = { - .free = vgem_gem_free_object, - .pin = vgem_prime_pin, - .unpin = vgem_prime_unpin, - .get_sg_table = vgem_prime_get_sg_table, - .vmap = vgem_prime_vmap, - .vunmap = vgem_prime_vunmap, - .vm_ops = &vgem_gem_vm_ops, -}; +DEFINE_DRM_GEM_FOPS(vgem_driver_fops);
static const struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER, @@ -427,13 +99,7 @@ static const struct drm_driver vgem_driver = { .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops,
- .dumb_create = vgem_gem_dumb_create, - - .prime_handle_to_fd = drm_gem_prime_handle_to_fd, - .prime_fd_to_handle = drm_gem_prime_fd_to_handle, - .gem_prime_import = vgem_prime_import, - .gem_prime_import_sg_table = vgem_prime_import_sg_table, - .gem_prime_mmap = vgem_prime_mmap, + DRM_GEM_SHMEM_DRIVER_OPS,
.name = DRIVER_NAME, .desc = DRIVER_DESC,
Hi
On 25.02.21 at 11:23, Daniel Vetter wrote:
Aside from deleting lots of code the real motivation here is to switch the mmap over to VM_PFNMAP, to be more consistent with what real gpu drivers do. They're all VM_PFNMAP, which means get_user_pages doesn't work, and even if you try and there's a struct page behind that, touching it and mucking around with its refcount can upset drivers real bad.
v2: Review from Thomas:
- sort #include
- drop more dead code that I didn't spot somehow
v3: select DRM_GEM_SHMEM_HELPER to make it build (intel-gfx-ci)
Since you're working on it, could you move the config item into a Kconfig file under vgem?
Best regards Thomas
Cc: Thomas Zimmermann tzimmermann@suse.de
Acked-by: Thomas Zimmermann tzimmermann@suse.de
Cc: John Stultz john.stultz@linaro.org
Cc: Sumit Semwal sumit.semwal@linaro.org
Cc: "Christian König" christian.koenig@amd.com
Signed-off-by: Daniel Vetter daniel.vetter@intel.com
Cc: Melissa Wen melissa.srw@gmail.com
Cc: Chris Wilson chris@chris-wilson.co.uk

 drivers/gpu/drm/Kconfig         |   1 +
 drivers/gpu/drm/vgem/vgem_drv.c | 340 +-------------------------------
 2 files changed, 4 insertions(+), 337 deletions(-)
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 8e73311de583..94e4ac830283 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -274,6 +274,7 @@ source "drivers/gpu/drm/kmb/Kconfig" config DRM_VGEM tristate "Virtual GEM provider" depends on DRM
- select DRM_GEM_SHMEM_HELPER help Choose this option to get a virtual graphics memory manager, as used by Mesa's software renderer for enhanced performance.
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index a0e75f1d5d01..b1b3a5ffc542 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -38,6 +38,7 @@
#include <drm/drm_drv.h> #include <drm/drm_file.h> +#include <drm/drm_gem_shmem_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_prime.h> @@ -50,87 +51,11 @@ #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0
-static const struct drm_gem_object_funcs vgem_gem_object_funcs;
- static struct vgem_device { struct drm_device drm; struct platform_device *platform; } *vgem_device;
-static void vgem_gem_free_object(struct drm_gem_object *obj) -{
- struct drm_vgem_gem_object *vgem_obj = to_vgem_bo(obj);
- kvfree(vgem_obj->pages);
- mutex_destroy(&vgem_obj->pages_lock);
- if (obj->import_attach)
drm_prime_gem_destroy(obj, vgem_obj->table);
- drm_gem_object_release(obj);
- kfree(vgem_obj);
-}
-static vm_fault_t vgem_gem_fault(struct vm_fault *vmf) -{
- struct vm_area_struct *vma = vmf->vma;
- struct drm_vgem_gem_object *obj = vma->vm_private_data;
- /* We don't use vmf->pgoff since that has the fake offset */
- unsigned long vaddr = vmf->address;
- vm_fault_t ret = VM_FAULT_SIGBUS;
- loff_t num_pages;
- pgoff_t page_offset;
- page_offset = (vaddr - vma->vm_start) >> PAGE_SHIFT;
- num_pages = DIV_ROUND_UP(obj->base.size, PAGE_SIZE);
- if (page_offset >= num_pages)
return VM_FAULT_SIGBUS;
- mutex_lock(&obj->pages_lock);
- if (obj->pages) {
get_page(obj->pages[page_offset]);
vmf->page = obj->pages[page_offset];
ret = 0;
- }
- mutex_unlock(&obj->pages_lock);
- if (ret) {
struct page *page;
page = shmem_read_mapping_page(
file_inode(obj->base.filp)->i_mapping,
page_offset);
if (!IS_ERR(page)) {
vmf->page = page;
ret = 0;
} else switch (PTR_ERR(page)) {
case -ENOSPC:
case -ENOMEM:
ret = VM_FAULT_OOM;
break;
case -EBUSY:
ret = VM_FAULT_RETRY;
break;
case -EFAULT:
case -EINVAL:
ret = VM_FAULT_SIGBUS;
break;
default:
WARN_ON(PTR_ERR(page));
ret = VM_FAULT_SIGBUS;
break;
}
- }
- return ret;
-}
-static const struct vm_operations_struct vgem_gem_vm_ops = {
- .fault = vgem_gem_fault,
- .open = drm_gem_vm_open,
- .close = drm_gem_vm_close,
-};
- static int vgem_open(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile;
@@ -159,265 +84,12 @@ static void vgem_postclose(struct drm_device *dev, struct drm_file *file) kfree(vfile); }
-static struct drm_vgem_gem_object *__vgem_gem_create(struct drm_device *dev,
unsigned long size)
-{
- struct drm_vgem_gem_object *obj;
- int ret;
- obj = kzalloc(sizeof(*obj), GFP_KERNEL);
- if (!obj)
return ERR_PTR(-ENOMEM);
- obj->base.funcs = &vgem_gem_object_funcs;
- ret = drm_gem_object_init(dev, &obj->base, roundup(size, PAGE_SIZE));
- if (ret) {
kfree(obj);
return ERR_PTR(ret);
- }
- mutex_init(&obj->pages_lock);
- return obj;
-}
-static void __vgem_gem_destroy(struct drm_vgem_gem_object *obj) -{
- drm_gem_object_release(&obj->base);
- kfree(obj);
-}
-static struct drm_gem_object *vgem_gem_create(struct drm_device *dev,
struct drm_file *file,
unsigned int *handle,
unsigned long size)
-{
- struct drm_vgem_gem_object *obj;
- int ret;
- obj = __vgem_gem_create(dev, size);
- if (IS_ERR(obj))
return ERR_CAST(obj);
- ret = drm_gem_handle_create(file, &obj->base, handle);
- if (ret) {
drm_gem_object_put(&obj->base);
return ERR_PTR(ret);
- }
- return &obj->base;
-}
-static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
struct drm_mode_create_dumb *args)
-{
- struct drm_gem_object *gem_object;
- u64 pitch, size;
- pitch = args->width * DIV_ROUND_UP(args->bpp, 8);
- size = args->height * pitch;
- if (size == 0)
return -EINVAL;
- gem_object = vgem_gem_create(dev, file, &args->handle, size);
- if (IS_ERR(gem_object))
return PTR_ERR(gem_object);
- args->size = gem_object->size;
- args->pitch = pitch;
- drm_gem_object_put(gem_object);
- DRM_DEBUG("Created object of size %llu\n", args->size);
- return 0;
-}
- static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW), };
-static int vgem_mmap(struct file *filp, struct vm_area_struct *vma) -{
- unsigned long flags = vma->vm_flags;
- int ret;
- ret = drm_gem_mmap(filp, vma);
- if (ret)
return ret;
- /* Keep the WC mmaping set by drm_gem_mmap() but our pages
* are ordinary and not special.
*/
- vma->vm_flags = flags | VM_DONTEXPAND | VM_DONTDUMP;
- return 0;
-}
-static const struct file_operations vgem_driver_fops = {
- .owner = THIS_MODULE,
- .open = drm_open,
- .mmap = vgem_mmap,
- .poll = drm_poll,
- .read = drm_read,
- .unlocked_ioctl = drm_ioctl,
- .compat_ioctl = drm_compat_ioctl,
- .release = drm_release,
-};
-static struct page **vgem_pin_pages(struct drm_vgem_gem_object *bo) -{
- mutex_lock(&bo->pages_lock);
- if (bo->pages_pin_count++ == 0) {
struct page **pages;
pages = drm_gem_get_pages(&bo->base);
if (IS_ERR(pages)) {
bo->pages_pin_count--;
mutex_unlock(&bo->pages_lock);
return pages;
}
bo->pages = pages;
- }
- mutex_unlock(&bo->pages_lock);
- return bo->pages;
-}
-static void vgem_unpin_pages(struct drm_vgem_gem_object *bo) -{
- mutex_lock(&bo->pages_lock);
- if (--bo->pages_pin_count == 0) {
drm_gem_put_pages(&bo->base, bo->pages, true, true);
bo->pages = NULL;
- }
- mutex_unlock(&bo->pages_lock);
-}
-static int vgem_prime_pin(struct drm_gem_object *obj) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- long n_pages = obj->size >> PAGE_SHIFT;
- struct page **pages;
- pages = vgem_pin_pages(bo);
- if (IS_ERR(pages))
return PTR_ERR(pages);
- /* Flush the object from the CPU cache so that importers can rely
* on coherent indirect access via the exported dma-address.
*/
- drm_clflush_pages(pages, n_pages);
- return 0;
-}
-static void vgem_prime_unpin(struct drm_gem_object *obj) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- vgem_unpin_pages(bo);
-}
-static struct sg_table *vgem_prime_get_sg_table(struct drm_gem_object *obj) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- return drm_prime_pages_to_sg(obj->dev, bo->pages, bo->base.size >> PAGE_SHIFT);
-}
-static struct drm_gem_object* vgem_prime_import(struct drm_device *dev,
struct dma_buf *dma_buf)
-{
- struct vgem_device *vgem = container_of(dev, typeof(*vgem), drm);
- return drm_gem_prime_import_dev(dev, dma_buf, &vgem->platform->dev);
-}
-static struct drm_gem_object *vgem_prime_import_sg_table(struct drm_device *dev,
struct dma_buf_attachment *attach, struct sg_table *sg)
-{
- struct drm_vgem_gem_object *obj;
- int npages;
- obj = __vgem_gem_create(dev, attach->dmabuf->size);
- if (IS_ERR(obj))
return ERR_CAST(obj);
- npages = PAGE_ALIGN(attach->dmabuf->size) / PAGE_SIZE;
- obj->table = sg;
- obj->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
- if (!obj->pages) {
__vgem_gem_destroy(obj);
return ERR_PTR(-ENOMEM);
- }
- obj->pages_pin_count++; /* perma-pinned */
- drm_prime_sg_to_page_array(obj->table, obj->pages, npages);
- return &obj->base;
-}
-static int vgem_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- long n_pages = obj->size >> PAGE_SHIFT;
- struct page **pages;
- void *vaddr;
- pages = vgem_pin_pages(bo);
- if (IS_ERR(pages))
return PTR_ERR(pages);
- vaddr = vmap(pages, n_pages, 0, pgprot_writecombine(PAGE_KERNEL));
- if (!vaddr)
return -ENOMEM;
- dma_buf_map_set_vaddr(map, vaddr);
- return 0;
-}
-static void vgem_prime_vunmap(struct drm_gem_object *obj, struct dma_buf_map *map) -{
- struct drm_vgem_gem_object *bo = to_vgem_bo(obj);
- vunmap(map->vaddr);
- vgem_unpin_pages(bo);
-}
-static int vgem_prime_mmap(struct drm_gem_object *obj,
struct vm_area_struct *vma)
-{
- int ret;
- if (obj->size < vma->vm_end - vma->vm_start)
return -EINVAL;
- if (!obj->filp)
return -ENODEV;
- ret = call_mmap(obj->filp, vma);
- if (ret)
return ret;
- vma_set_file(vma, obj->filp);
- vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
- return 0;
-}
-static const struct drm_gem_object_funcs vgem_gem_object_funcs = {
- .free = vgem_gem_free_object,
- .pin = vgem_prime_pin,
- .unpin = vgem_prime_unpin,
- .get_sg_table = vgem_prime_get_sg_table,
- .vmap = vgem_prime_vmap,
- .vunmap = vgem_prime_vunmap,
- .vm_ops = &vgem_gem_vm_ops,
-}; +DEFINE_DRM_GEM_FOPS(vgem_driver_fops);
static const struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER, @@ -427,13 +99,7 @@ static const struct drm_driver vgem_driver = { .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops,
- .dumb_create = vgem_gem_dumb_create,
- .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
- .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
- .gem_prime_import = vgem_prime_import,
- .gem_prime_import_sg_table = vgem_prime_import_sg_table,
- .gem_prime_mmap = vgem_prime_mmap,
DRM_GEM_SHMEM_DRIVER_OPS,
.name = DRIVER_NAME, .desc = DRIVER_DESC,
On Fri, Feb 26, 2021 at 10:19 AM Thomas Zimmermann tzimmermann@suse.de wrote:
Hi
On 25.02.21 at 11:23, Daniel Vetter wrote:
Aside from deleting lots of code the real motivation here is to switch the mmap over to VM_PFNMAP, to be more consistent with what real gpu drivers do. They're all VM_PFNMAP, which means get_user_pages doesn't work, and even if you try and there's a struct page behind that, touching it and mucking around with its refcount can upset drivers real bad.
v2: Review from Thomas:
- sort #include
- drop more dead code that I didn't spot somehow
v3: select DRM_GEM_SHMEM_HELPER to make it build (intel-gfx-ci)
Since you're working on it, could you move the config item into a Kconfig file under vgem?
We have a lot of drivers still without their own Kconfig. I thought we're only doing that for drivers which have multiple options, or otherwise would clutter up the main drm/Kconfig file?
Not opposed to this, just feels like if we do this, should do it for all of them. -Daniel
Hi
Am 26.02.21 um 14:30 schrieb Daniel Vetter:
On Fri, Feb 26, 2021 at 10:19 AM Thomas Zimmermann tzimmermann@suse.de wrote:
Since you're working on it, could you move the config item into a Kconfig file under vgem?
We have a lot of drivers still without their own Kconfig. I thought we're only doing that for drivers which have multiple options, or otherwise would clutter up the main drm/Kconfig file?
Not opposed to this, just feels like if we do this, should do it for all of them.
I didn't know that there was a rule for how to handle this. I just didn't like to have driver config rules in the main Kconfig file.
But yeah, maybe let's change this consistently in a separate patchset.
Best regards Thomas
On Fri, Feb 26, 2021 at 02:51:58PM +0100, Thomas Zimmermann wrote:
Hi
Am 26.02.21 um 14:30 schrieb Daniel Vetter:
On Fri, Feb 26, 2021 at 10:19 AM Thomas Zimmermann tzimmermann@suse.de wrote:
Since you're working on it, could you move the config item into a Kconfig file under vgem?
We have a lot of drivers still without their own Kconfig. I thought we're only doing that for drivers which have multiple options, or otherwise would clutter up the main drm/Kconfig file?
Not opposed to this, just feels like if we do this, should do it for all of them.
I didn't know that there was a rule for how to handle this. I just didn't like to have driver config rules in the main Kconfig file.
I don't think it is an actual rule, just how the driver Kconfig files started out.
But yeah, maybe let's change this consistently in a separate patchset.
Yeah, I looked; we should also put all the driver files at the bottom, and maybe sort them alphabetically or something like that. It's a bit of a mess right now. -Daniel
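To make the suggestion concrete, the split would look roughly like this (a hypothetical sketch only, no such patch was sent in this thread): the DRM_VGEM entry moves into its own file and the main Kconfig merely sources it.

# drivers/gpu/drm/vgem/Kconfig (hypothetical)
config DRM_VGEM
        tristate "Virtual GEM provider"
        depends on DRM
        select DRM_GEM_SHMEM_HELPER
        help
          Choose this option to get a virtual graphics memory manager,
          as used by Mesa's software renderer for enhanced performance.

# drivers/gpu/drm/Kconfig
source "drivers/gpu/drm/vgem/Kconfig"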
On 2/23/21 11:59 AM, Daniel Vetter wrote:
From auditing the various functions to insert pfn pte entires (vm_insert_pfn_prot, remap_pfn_range and all it's callers like dma_mmap_wc) it looks like VM_PFNMAP is already required anyway, so this should be the correct flag to check for.
If we require VM_PFNMAP for ordinary page mappings, we also need to disallow COW mappings, since those will not work on architectures that don't have CONFIG_ARCH_HAS_PTE_SPECIAL (see the docs for vm_normal_page()).
Also worth noting is the comment in ttm_bo_mmap_vma_setup() with possible performance implications with x86 + PAT + VM_PFNMAP + normal pages. That's a very old comment, though, and might not be valid anymore.
/Thomas
On Wed, Feb 24, 2021 at 8:46 AM Thomas Hellström (Intel) thomas_os@shipmail.org wrote:
If we require VM_PFNMAP for ordinary page mappings, we also need to disallow COW mappings, since those will not work on architectures that don't have CONFIG_ARCH_HAS_PTE_SPECIAL (see the docs for vm_normal_page()).
Hm I figured everyone just uses MAP_SHARED for buffer objects since COW really makes absolutely no sense. How would we enforce this?
Also worth noting is the comment in ttm_bo_mmap_vma_setup() with possible performance implications with x86 + PAT + VM_PFNMAP + normal pages. That's a very old comment, though, and might not be valid anymore.
I think that's why ttm has a page cache for these, because it indeed sucks. The PAT changes on pages are rather expensive.
There is still an issue for iomem mappings, because the PAT validation does a linear walk of the resource tree (lol) for every vm_insert_pfn. But for i915 at least this is fixed by using the io_mapping infrastructure, which does the PAT reservation only once when you set up the mapping area at driver load.
Also TTM uses VM_PFNMAP right now for everything, so it can't be a problem that hurts much :-) -Daniel
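As a sketch of the io_mapping approach mentioned above (the pci_dev and BAR index are assumptions for illustration, not any particular driver's code): the memtype/PAT reservation is paid once for the whole aperture at load time instead of once per vm_insert_pfn at fault time.

#include <linux/io-mapping.h>
#include <linux/pci.h>

static struct io_mapping *lmem_iomap;

static int example_setup_iomap(struct pci_dev *pdev)
{
        /* Reserve the write-combined memtype for the whole BAR once. */
        lmem_iomap = io_mapping_create_wc(pci_resource_start(pdev, 2),
                                          pci_resource_len(pdev, 2));
        return lmem_iomap ? 0 : -ENOMEM;
}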
On 2/24/21 9:45 AM, Daniel Vetter wrote:
On Wed, Feb 24, 2021 at 8:46 AM Thomas Hellström (Intel) thomas_os@shipmail.org wrote:
If we require VM_PFNMAP for ordinary page mappings, we also need to disallow COW mappings, since those will not work on architectures that don't have CONFIG_ARCH_HAS_PTE_SPECIAL (see the docs for vm_normal_page()).
Hm I figured everyone just uses MAP_SHARED for buffer objects since COW really makes absolutely no sense. How would we enforce this?
Perhaps returning -EINVAL on is_cow_mapping() at mmap time. Either that or allowing MIXEDMAP.
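A minimal sketch of that check, assuming it lives in the exporter's mmap path (is_cow_mapping() is mm-internal at this point, so the equivalent flag test is open-coded here):

#include <linux/mm.h>

/* A COW mapping is a writable private mapping: VM_MAYWRITE set but
 * VM_SHARED clear; this is the same test mm's is_cow_mapping() uses. */
static bool sketch_is_cow_mapping(vm_flags_t flags)
{
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

static int sketch_reject_cow_mmap(struct vm_area_struct *vma)
{
        /* Without ARCH_HAS_PTE_SPECIAL a COW'd VM_PFNMAP pte cannot be
         * told apart from a normal page, so refuse it up front. */
        if (sketch_is_cow_mapping(vma->vm_flags))
                return -EINVAL;
        return 0;
}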
Also worth noting is the comment in ttm_bo_mmap_vma_setup() with possible performance implications with x86 + PAT + VM_PFNMAP + normal pages. That's a very old comment, though, and might not be valid anymore.
I think that's why ttm has a page cache for these, because it indeed sucks. The PAT changes on pages are rather expensive.
IIRC the page cache was implemented because of the slowness of the caching mode transition itself, more specifically the wbinvd() call + global TLB flush.
There is still an issue for iomem mappings, because the PAT validation does a linear walk of the resource tree (lol) for every vm_insert_pfn. But for i915 at least this is fixed by using the io_mapping infrastructure, which does the PAT reservation only once when you set up the mapping area at driver load.
Yes, I guess that was the issue that the comment describes, but the issue wasn't there with vm_insert_mixed() + VM_MIXEDMAP.
Also TTM uses VM_PFNMAP right now for everything, so it can't be a problem that hurts much :-)
Hmm, both 5.11 and drm-tip appear to still use MIXEDMAP?
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/ttm/ttm_bo_vm...
-Daniel
/Thomas
On Wed, Feb 24, 2021 at 10:16 AM Thomas Hellström (Intel) thomas_os@shipmail.org wrote:
On 2/24/21 9:45 AM, Daniel Vetter wrote:
Also TTM uses VM_PFNMAP right now for everything, so it can't be a problem that hurts much :-)
Hmm, both 5.11 and drm-tip appear to still use MIXEDMAP?
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/ttm/ttm_bo_vm...
Uh that's bad, because mixed maps pointing at struct page won't stop gup. At least afaik.
Christian, do we need to patch this up, and maybe fix up ttm fault handler to use io_mapping so the vm_insert_pfn stuff is fast? -Daniel
Am 24.02.21 um 10:31 schrieb Daniel Vetter:
On Wed, Feb 24, 2021 at 10:16 AM Thomas Hellström (Intel) thomas_os@shipmail.org wrote:
IIRC the page cache was implemented because of the slowness of the caching mode transition itself, more specifically the wbinvd() call + global TLB flush.
Yes, exactly that. The global TLB flush is what really breaks our neck here from a performance perspective.
Uh that's bad, because mixed maps pointing at struct page won't stop gup. At least afaik.
Hui? I'm pretty sure MIXEDMAP stops gup as well. Otherwise we would have already seen tons of problems with the page cache.
Regards, Christian.
On Thu, Feb 25, 2021 at 11:28:31AM +0100, Christian König wrote:
Am 24.02.21 um 10:31 schrieb Daniel Vetter:
Uh that's bad, because mixed maps pointing at struct page won't stop gup. At least afaik.
Hui? I'm pretty sure MIXEDMAP stops gup as well. Otherwise we would have already seen tons of problems with the page cache.
On any architecture which has CONFIG_ARCH_HAS_PTE_SPECIAL, vm_insert_mixed boils down to vm_insert_pfn as far as gup is concerned, and the special pte stops the gup fast path.
But if you don't have VM_IO or VM_PFNMAP set, then I'm not seeing how you're stopping the gup slow path. See check_vma_flags() in mm/gup.c.
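Illustrated as a helper (a paraphrase of the effect, not the kernel source): the slow path refuses VM_IO/VM_PFNMAP vmas outright, while a vma that is merely VM_MIXEDMAP never hits that rejection.

#include <linux/mm.h>

/* What the check in mm/gup.c boils down to for this discussion: gup's
 * slow path bails on VM_IO and VM_PFNMAP; MIXEDMAP alone is not enough. */
static bool gup_slow_path_rejects(const struct vm_area_struct *vma)
{
        return vma->vm_flags & (VM_IO | VM_PFNMAP);
}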
Also, if you don't have CONFIG_ARCH_HAS_PTE_SPECIAL, then I don't think vm_insert_mixed even works on iomem pfns. There's the devmap exception, but we're not devmap. Worse, ttm abuses some accidental codepath to smuggle in hugepte support by intentionally not being devmap.
So I'm really not sure this works as we think it should. Maybe good to do a quick test program on amdgpu with a buffer in system memory only and try to do direct io into it. If it works, you have a problem, and a bad one. -Daniel
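Something along these lines would exercise it from userspace (a sketch only; bo_map/bo_size are assumed to come from mmap()ing the buffer object, and O_DIRECT alignment requirements are glossed over):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* O_DIRECT reads pin the destination pages via get_user_pages, so this
 * should fail with EFAULT on a VM_PFNMAP/VM_IO mapping and go through
 * if gup can get at the underlying struct pages. */
static void try_direct_io_into_bo(void *bo_map, size_t bo_size)
{
        int fd = open("scratch.bin", O_RDONLY | O_DIRECT);

        if (fd < 0)
                return;
        if (read(fd, bo_map, bo_size) < 0 && errno == EFAULT)
                printf("gup refused the mapping, as expected\n");
        else
                printf("direct I/O into the BO worked: gup got at the pages\n");
        close(fd);
}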
On Thu, Feb 25, 2021 at 11:44 AM Daniel Vetter daniel@ffwll.ch wrote:
So I'm really not sure this works as we think it should. Maybe good to do a quick test program on amdgpu with a buffer in system memory only and try to do direct io into it. If it works, you have a problem, and a bad one.
That's probably impossible, since a quick git grep shows that pretty much anything reasonable has special ptes: arc, arm, arm64, powerpc, riscv, s390, sh, sparc, x86. I don't think you'll have a platform where you can plug an amdgpu in and actually exercise the bug :-)
So maybe we should just switch over to VM_PFNMAP for ttm for more clarity? -Daniel
Regards, Christian.
Christian, do we need to patch this up, and maybe fix up ttm fault handler to use io_mapping so the vm_insert_pfn stuff is fast? -Daniel
-- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
On 25.02.21 at 16:49, Daniel Vetter wrote:
> So maybe we should just switch over to VM_PFNMAP for ttm for more clarity?
Maybe yes, but not sure.
I once had a request to do this from some Google guys, but rejected it because I wasn't sure of the consequences.
Christian.
On 2/25/21 4:49 PM, Daniel Vetter wrote:
> That's probably impossible, since a quick git grep shows that pretty much anything reasonable has special ptes: arc, arm, arm64, powerpc, riscv, s390, sh, sparc, x86. I don't think you'll have a platform where you can plug an amdgpu in and actually exercise the bug :-)
Hm. AFAIK _insert_mixed() doesn't set PTE_SPECIAL on system pages, so I don't see what should be stopping gup to those?
/Thomas
On Fri, Feb 26, 2021 at 10:41 AM Thomas Hellström (Intel) thomas_os@shipmail.org wrote:
> Hm. AFAIK _insert_mixed() doesn't set PTE_SPECIAL on system pages, so I don't see what should be stopping gup to those?
If you have an arch with pte special we use insert_pfn(), which afaict will use pte_mkspecial for the !devmap case. And ttm isn't devmap (otherwise our hugepte abuse of devmap hugeptes would go rather wrong).
So I think it stops gup. But I haven't verified at all. Would be good if Christian can check this with some direct io to a buffer in system memory. -Daniel
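For reference, a trimmed-down sketch of the insert_pfn() behaviour described above (mm/memory.c, simplified; locking and error handling omitted):

	/* With CONFIG_ARCH_HAS_PTE_SPECIAL the inserted pte is marked special
	 * unless the pfn is devmap; special ptes are what gup fast refuses.
	 */
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));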
On 2/26/21 2:28 PM, Daniel Vetter wrote:
> So I think it stops gup. But I haven't verified at all. Would be good if Christian can check this with some direct io to a buffer in system memory.
Hmm,
Docs (again, vm_normal_page()) say:

 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
but it's true that __vm_insert_mixed() ends up in the insert_pfn() path, so the above isn't really accurate, which makes me wonder whether, and in that case why, there could still ever be a significant performance difference between MIXEDMAP and PFNMAP.
BTW regarding the TTM hugeptes, I don't think we ever landed that devmap hack, so they are (for the non-gup case) relying on vma_is_special_huge(). For the gup case, I think the bug is still there.
/Thomas
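The helper Thomas mentions looks roughly like this (include/linux/huge_mm.h; quoted from memory, so possibly simplified):

	/* A huge entry is "special" (no struct page the core mm may touch)
	 * for DAX vmas and for file-backed VM_PFNMAP/VM_MIXEDMAP vmas such
	 * as TTM's.
	 */
	static inline bool vma_is_special_huge(struct vm_area_struct *vma)
	{
		return vma_is_dax(vma) ||
		       (vma->vm_file &&
			(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
	}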
On Sat, Feb 27, 2021 at 9:06 AM Thomas Hellström (Intel) thomas_os@shipmail.org wrote:
Yeah it's definitely confusing. I guess I'll hack up a patch and see what sticks.
> BTW regarding the TTM hugeptes, I don't think we ever landed that devmap hack, so they are (for the non-gup case) relying on vma_is_special_huge(). For the gup case, I think the bug is still there.
Maybe there's another devmap hack, but the ttm_vm_insert functions do use PFN_DEV and all that. And I think that stops gup_fast from trying to find the underlying page. -Daniel
Hi,
On 3/1/21 9:28 AM, Daniel Vetter wrote:
> Maybe there's another devmap hack, but the ttm_vm_insert functions do use PFN_DEV and all that. And I think that stops gup_fast from trying to find the underlying page. -Daniel
Hmm, perhaps it might, but I don't think so. The fix I tried out was to set PFN_DEV | PFN_MAP for huge PTEs, which causes pfn_devmap() to be true, and then follow_devmap_pmd()->get_dev_pagemap() returns NULL and gup_fast() backs off. In the end that would mean setting in stone that "if there is a huge devmap page table entry for which we haven't registered any devmap struct pages (get_dev_pagemap returns NULL), we should treat that as a 'special' huge page table entry".
From what I can tell, all code calling get_dev_pagemap() already does that, it's just a question of getting it accepted and formalizing it.
/Thomas
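A rough sketch of the fast-path behaviour Thomas describes, modelled on the devmap handling in mm/gup.c (variable names are illustrative):

	/* For a devmap huge entry, gup_fast looks up the registered pagemap;
	 * if none is registered (get_dev_pagemap() returns NULL) it bails
	 * out without taking any page references.
	 */
	struct dev_pagemap *pgmap = get_dev_pagemap(pmd_pfn(orig_pmd), NULL);

	if (!pgmap)
		return 0;	/* back off, nothing pinned */
	put_dev_pagemap(pgmap);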
On Mon, Mar 01, 2021 at 09:39:53AM +0100, Thomas Hellström (Intel) wrote:
Oh, I thought that's already how it works, since I didn't spot anything else that would block gup_fast from falling over. I guess we'd really need some testcases to make sure direct i/o (that's the easiest to test) fails like we expect. -Daniel
On 3/1/21 10:05 AM, Daniel Vetter wrote:
> Oh, I thought that's already how it works, since I didn't spot anything else that would block gup_fast from falling over. I guess we'd really need some testcases to make sure direct i/o (that's the easiest to test) fails like we expect.
Yeah, IIRC the "| PFN_MAP" is the missing piece for TTM huge ptes. Otherwise pmd_devmap() will not return true and since there is no pmd_special() things break.
/Thomas
On 01.03.21 at 10:21, Thomas Hellström (Intel) wrote:
> Yeah, IIRC the "| PFN_MAP" is the missing piece for TTM huge ptes. Otherwise pmd_devmap() will not return true and since there is no pmd_special() things break.
Is that maybe the issue we have seen with amdgpu and huge pages?
Apart from that I'm lost guys, that devmap and gup stuff is not something I have a good knowledge of apart from a one mile high view.
Christian.
On Mon, Mar 1, 2021 at 11:17 AM Christian König christian.koenig@amd.com wrote:
> Is that maybe the issue we have seen with amdgpu and huge pages?
Yeah, essentially when you have a hugepte inserted by ttm, and it happens to point at system memory, then gup will work on that. And create all kinds of havoc.
> Apart from that I'm lost guys, that devmap and gup stuff is not something I have a good knowledge of apart from a one mile high view.
I'm not really better, hence it would be good to do a testcase and see. This should provoke it:
- allocate a nicely aligned bo in system memory
- mmap it, again nicely aligned to 2M
- do some direct io from a filesystem into that mmap, that should trigger gup
- before the gup completes, free the mmap and bo so that ttm recycles the pages, which should trip up on the elevated refcount
If you wait until the direct io is complete, then I think nothing bad can be observed.
Ofc if your amdgpu+hugepte issue is something else, then maybe we have another issue.
Also usual caveat: I'm not an mm hacker either, so might be completely wrong. -Daniel
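A rough userspace sketch of the first three steps of that recipe (how buf_fd is obtained is driver-specific and not shown; the racy free in the last step would need a second thread and is left out; sizes must respect O_DIRECT alignment):

	#define _GNU_SOURCE	/* for O_DIRECT */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* buf_fd: an mmap-able handle to a bo in system memory (for example
	 * a dma-buf exported by the driver); scratch: an existing file.
	 */
	static void try_direct_io(int buf_fd, size_t size, const char *scratch)
	{
		void *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
				 MAP_SHARED, buf_fd, 0);
		int fd = open(scratch, O_RDONLY | O_DIRECT);
		ssize_t n;

		if (map == MAP_FAILED || fd < 0)
			return;

		/* O_DIRECT makes the kernel pin the destination pages via gup;
		 * read() failing with EFAULT means gup was blocked.
		 */
		n = read(fd, map, size);
		printf("direct read: %zd (%s)\n", n,
		       n < 0 ? "gup blocked, good" : "gup worked, bad");

		close(fd);
		munmap(map, size);
	}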
On 3/1/21 3:09 PM, Daniel Vetter wrote:
So I did the following quick experiment on vmwgfx, and it turns out that with it, fast gup never succeeds. Without the "| PFN_MAP", it typically succeeds.
I should probably craft an RFC formalizing this.
/Thomas
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 6dc96cf66744..72b6fb17c984 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -195,6 +195,7 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
 	pfn_t pfnt;
 	struct ttm_tt *ttm = bo->ttm;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	struct dev_pagemap *pagemap;
 
 	/* Fault should not cross bo boundary. */
 	page_offset &= ~(fault_page_size - 1);
@@ -210,6 +211,17 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
 	if ((pfn & (fault_page_size - 1)) != 0)
 		goto out_fallback;
 
+	/*
+	 * Huge entries must be special, that is marking them as devmap
+	 * with no backing device map range. If there is a backing
+	 * range, Don't insert a huge entry.
+	 */
+	pagemap = get_dev_pagemap(pfn, NULL);
+	if (pagemap) {
+		put_dev_pagemap(pagemap);
+		goto out_fallback;
+	}
+
 	/* Check that memory is contiguous. */
 	if (!bo->mem.bus.is_iomem) {
 		for (i = 1; i < fault_page_size; ++i) {
@@ -223,7 +235,7 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
 		}
 	}
 
-	pfnt = __pfn_to_pfn_t(pfn, PFN_DEV);
+	pfnt = __pfn_to_pfn_t(pfn, PFN_DEV | PFN_MAP);
 	if (fault_page_size == (HPAGE_PMD_SIZE >> PAGE_SHIFT))
 		ret = vmf_insert_pfn_pmd_prot(vmf, pfnt, pgprot, write);
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
@@ -236,6 +248,21 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
 	if (ret != VM_FAULT_NOPAGE)
 		goto out_fallback;
 
+#if 1
+	{
+		int npages;
+		struct page *page;
+
+		npages = get_user_pages_fast_only(vmf->address, 1, 0, &page);
+		if (npages == 1) {
+			DRM_WARN("Fast gup succeeded. Bad.\n");
+			put_page(page);
+		} else {
+			DRM_INFO("Fast gup failed. Good.\n");
+		}
+	}
+#endif
+
 	return VM_FAULT_NOPAGE;
 out_fallback:
 	count_vm_event(THP_FAULT_FALLBACK);
On Thu, Mar 11, 2021 at 11:22:06AM +0100, Thomas Hellström (Intel) wrote:
> So I did the following quick experiment on vmwgfx, and it turns out that with it, fast gup never succeeds. Without the "| PFN_MAP", it typically succeeds.
> I should probably craft an RFC formalizing this.
Yeah I think that would be good. Maybe even more formalized if we also switch over to VM_PFNMAP, since afaiui these pte flags here only stop the fast gup path. And slow gup can still peek through VM_MIXEDMAP. Or something like that.
Otoh your description of when it only sometimes succeeds would indicate my understanding of VM_PFNMAP vs VM_MIXEDMAP is wrong here.
Christian, what's your take? -Daniel
Hi!
On 3/11/21 2:00 PM, Daniel Vetter wrote:
> Yeah I think that would be good. Maybe even more formalized if we also switch over to VM_PFNMAP, since afaiui these pte flags here only stop the fast gup path. And slow gup can still peek through VM_MIXEDMAP. Or something like that.
> Otoh your description of when it only sometimes succeeds would indicate my understanding of VM_PFNMAP vs VM_MIXEDMAP is wrong here.
My understanding from reading the vmf_insert_mixed() code is that iff the arch has pte_special(), VM_MIXEDMAP should be harmless. But that's not consistent with the vm_normal_page() doc. For architectures without pte_special, VM_PFNMAP must be used, and then we must also block COW mappings.
If we can get someone to commit to verifying that the potential PAT WC performance issue is gone with PFNMAP, I can put together a series with that included.
As for existing userspace using COW TTM mappings, I once had a couple of test cases to verify that it actually worked, in particular together with huge PMDs and PUDs where breaking COW would imply splitting those, but I can't think of anything else actually wanting to do that other than by mistake.
/Thomas
Christian, what's your take? -Daniel
/Thomas
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 6dc96cf66744..72b6fb17c984 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -195,6 +195,7 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
 	pfn_t pfnt;
 	struct ttm_tt *ttm = bo->ttm;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	struct dev_pagemap *pagemap;
 
 	/* Fault should not cross bo boundary. */
 	page_offset &= ~(fault_page_size - 1);
@@ -210,6 +211,17 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
 	if ((pfn & (fault_page_size - 1)) != 0)
 		goto out_fallback;
 
+	/*
+	 * Huge entries must be special, that is marking them as devmap
+	 * with no backing device map range. If there is a backing
+	 * range, don't insert a huge entry.
+	 */
+	pagemap = get_dev_pagemap(pfn, NULL);
+	if (pagemap) {
+		put_dev_pagemap(pagemap);
+		goto out_fallback;
+	}
+
 	/* Check that memory is contiguous. */
 	if (!bo->mem.bus.is_iomem) {
 		for (i = 1; i < fault_page_size; ++i) {
@@ -223,7 +235,7 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
 		}
 	}
 
-	pfnt = __pfn_to_pfn_t(pfn, PFN_DEV);
+	pfnt = __pfn_to_pfn_t(pfn, PFN_DEV | PFN_MAP);
 	if (fault_page_size == (HPAGE_PMD_SIZE >> PAGE_SHIFT))
 		ret = vmf_insert_pfn_pmd_prot(vmf, pfnt, pgprot, write);
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
@@ -236,6 +248,21 @@ static vm_fault_t ttm_bo_vm_insert_huge(struct vm_fault *vmf,
 	if (ret != VM_FAULT_NOPAGE)
 		goto out_fallback;
 
+#if 1
+	{
+		int npages;
+		struct page *page;
+
+		npages = get_user_pages_fast_only(vmf->address, 1, 0, &page);
+		if (npages == 1) {
+			DRM_WARN("Fast gup succeeded. Bad.\n");
+			put_page(page);
+		} else {
+			DRM_INFO("Fast gup failed. Good.\n");
+		}
+	}
+#endif
+
 	return VM_FAULT_NOPAGE;
 out_fallback:
 	count_vm_event(THP_FAULT_FALLBACK);
On Thu, Mar 11, 2021 at 2:12 PM Thomas Hellström (Intel) thomas_os@shipmail.org wrote:
Hi!
On 3/11/21 2:00 PM, Daniel Vetter wrote:
On Thu, Mar 11, 2021 at 11:22:06AM +0100, Thomas Hellström (Intel) wrote:
On 3/1/21 3:09 PM, Daniel Vetter wrote:
On Mon, Mar 1, 2021 at 11:17 AM Christian König christian.koenig@amd.com wrote:
On 01.03.21 at 10:21, Thomas Hellström (Intel) wrote:
[SNIP]
My understanding from reading the vmf_insert_mixed() code is that iff the arch has pte_special(), VM_MIXEDMAP should be harmless. But that's not consistent with the vm_normal_page() doc. For architectures without pte_special, VM_PFNMAP must be used, and then we must also block COW mappings.
If we can get someone to commit to verifying that the potential PAT WC performance issue is gone with PFNMAP, I can put together a series with that included.
Iirc when I checked there's not much archs without pte_special, so I guess that's why we luck out. Hopefully.
As for existing userspace using COW TTM mappings, I once had a couple of test cases to verify that it actually worked, in particular together with huge PMDs and PUDs where breaking COW would imply splitting those, but I can't think of anything else actually wanting to do that other than by mistake.
Yeah disallowing MAP_PRIVATE mappings would be another good thing to lock down. Really doesn't make much sense. -Daniel
On 3/11/21 2:17 PM, Daniel Vetter wrote:
[SNIP]
Yeah disallowing MAP_PRIVATE mappings would be another good thing to lock down. Really doesn't make much sense. -Daniel
Yes, we can't allow them with PFNMAP + a non-linear address space...
/Thomas
On 11.03.21 at 14:17, Daniel Vetter wrote:
[SNIP]
So I did the following quick experiment on vmwgfx, and it turns out that with it, fast gup never succeeds. Without the "| PFN_MAP", it typically succeeds
I should probably craft an RFC formalizing this.
Yeah I think that would be good. Maybe even more formalized if we also switch over to VM_PFNMAP, since afaiui these pte flags here only stop the fast gup path, and slow gup can still peek through VM_MIXEDMAP. Or something like that.
Otoh your description of when it only sometimes succeeds would indicate my understanding of VM_PFNMAP vs VM_MIXEDMAP is wrong here.
My understanding from reading the vmf_insert_mixed() code is that iff the arch has pte_special(), VM_MIXEDMAP should be harmless. But that's not consistent with the vm_normal_page() doc. For architectures without pte_special, VM_PFNMAP must be used, and then we must also block COW mappings.
If we can get someone to commit to verifying that the potential PAT WC performance issue is gone with PFNMAP, I can put together a series with that included.
Iirc when I checked there's not much archs without pte_special, so I guess that's why we luck out. Hopefully.
I still need to read up a bit on what you guys are discussing here, but it's starting to form a picture. Especially my understanding of what VM_MIXEDMAP means seems to have been slightly off.
I would say just go ahead and provide patches to always use VM_PFNMAP in TTM and we can test it and see if there are still some issues.
As for existing userspace using COW TTM mappings, I once had a couple of test cases to verify that it actually worked, in particular together with huge PMDs and PUDs where breaking COW would imply splitting those, but I can't think of anything else actually wanting to do that other than by mistake.
Yeah disallowing MAP_PRIVATE mappings would be another good thing to lock down. Really doesn't make much sense.
Completely agree. That sounds like something we should try to avoid.
Regards, Christian.
-Daniel
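For illustration, a minimal sketch of the pattern Christian is asking for above, i.e. a driver mmap path that commits to VM_PFNMAP and only inserts special ptes at fault time. This is not the actual TTM conversion; struct my_bo, my_bo_pfn() and my_bo_mmap() are stand-ins for the driver's buffer object and its offset-to-pfn lookup:

#include <linux/mm.h>

struct my_bo;
unsigned long my_bo_pfn(struct my_bo *bo, pgoff_t pgoff);

static vm_fault_t my_bo_vm_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct my_bo *bo = vma->vm_private_data;

	/* Insert a raw pfn: special pte, no refcounting, no gup. */
	return vmf_insert_pfn(vma, vmf->address,
			      my_bo_pfn(bo, vmf->pgoff));
}

static const struct vm_operations_struct my_bo_vm_ops = {
	.fault = my_bo_vm_fault,
};

int my_bo_mmap(struct my_bo *bo, struct vm_area_struct *vma)
{
	/* PFN map: no struct page semantics behind this vma. */
	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &my_bo_vm_ops;
	vma->vm_private_data = bo;
	return 0;
}

With the vma flagged like this, both fast and slow gup bounce off the mapping regardless of whether the pfn happens to have a struct page behind it, which is the property the dma-buf side wants to rely on.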
On Wed, Feb 24, 2021 at 09:45:51AM +0100, Daniel Vetter wrote:
Hm I figured everyone just uses MAP_SHARED for buffer objects since COW really makes absolutely no sense. How would we enforce this?
In RDMA we test
drivers/infiniband/core/ib_core_uverbs.c: if (!(vma->vm_flags & VM_SHARED))
During mmap to reject use of MAP_PRIVATE on BAR pages.
Jason
Am 24.02.21 um 19:46 schrieb Jason Gunthorpe:
On Wed, Feb 24, 2021 at 09:45:51AM +0100, Daniel Vetter wrote:
Hm I figured everyone just uses MAP_SHARED for buffer objects since COW really makes absolutely no sense. How would we enforce this?
In RDMA we test
drivers/infiniband/core/ib_core_uverbs.c: if (!(vma->vm_flags & VM_SHARED))
During mmap to reject use of MAP_PRIVATE on BAR pages.
That's a really good idea. MAP_PRIVATE combined with any driver mapping doesn't really work at all.
Christian.
Jason
On Thu, Feb 25, 2021 at 11:30:23AM +0100, Christian König wrote:
Am 24.02.21 um 19:46 schrieb Jason Gunthorpe:
On Wed, Feb 24, 2021 at 09:45:51AM +0100, Daniel Vetter wrote:
Hm I figured everyone just uses MAP_SHARED for buffer objects since COW really makes absolutely no sense. How would we enforce this?
In RDMA we test
drivers/infiniband/core/ib_core_uverbs.c: if (!(vma->vm_flags & VM_SHARED))
During mmap to reject use of MAP_PRIVATE on BAR pages.
That's a really good idea. MAP_PRIVATE combined with any driver mapping doesn't really work at all.
Yeah I feel like this is the next patch we need to add on top of this little series of locking down dma-buf mmap semantics. Probably should also push these checks into the drm gem mmap code (and maybe ttm can switch over to that, it's really the same).
One at a time. -Daniel
Christian.
Jason
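A minimal sketch of what the check Jason describes could look like on the dma-buf side, assuming it gets wired into the shared mmap path as Daniel suggests; dma_buf_mmap_checked() is a made-up name, not an existing helper:

#include <linux/mm.h>
#include <linux/dma-buf.h>

static int dma_buf_mmap_checked(struct dma_buf *dmabuf,
				struct vm_area_struct *vma)
{
	/* COW of device/driver pages makes no sense, so require MAP_SHARED. */
	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	return dmabuf->ops->mmap(dmabuf, vma);
}

Rejecting MAP_PRIVATE up front in one shared wrapper means every exporter gets the same behaviour, mirroring the VM_SHARED test RDMA already applies to BAR mappings.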
On Tue, Feb 23, 2021 at 3:00 AM Daniel Vetter daniel.vetter@ffwll.ch wrote:
[SNIP]
So I gave this a spin in a few of my environments, and with the current dmabuf heaps it spews a lot of warnings.
I'm testing some simple fixes to add: vma->vm_flags |= VM_PFNMAP;
to the dmabuf heap mmap ops, which we might want to queue alongside this.
So assuming those can land together. Acked-by: John Stultz john.stultz@linaro.org
thanks -john
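One wrinkle worth noting: just OR-ing VM_PFNMAP into the vma while the heap keeps calling vm_insert_page() won't work, since vm_insert_page() refuses (BUG_ON) VM_PFNMAP vmas; the insertion has to become pfn-based as well. A rough sketch of one shape such a heap mmap could take, where struct heap_buffer and heap_buffer_page() are stand-ins for the heap's own bookkeeping (this is not John's actual patch):

#include <linux/mm.h>
#include <linux/dma-buf.h>

struct heap_buffer;
struct page *heap_buffer_page(struct heap_buffer *buffer, pgoff_t pgoff);

static vm_fault_t heap_vm_fault(struct vm_fault *vmf)
{
	struct heap_buffer *buffer = vmf->vma->vm_private_data;
	struct page *page = heap_buffer_page(buffer, vmf->pgoff);

	if (!page)
		return VM_FAULT_SIGBUS;

	/* Insert as a raw pfn: no refcounting, pte_special where available. */
	return vmf_insert_pfn(vmf->vma, vmf->address, page_to_pfn(page));
}

static const struct vm_operations_struct heap_vm_ops = {
	.fault = heap_vm_fault,
};

static int heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
	struct heap_buffer *buffer = dmabuf->priv;

	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_ops = &heap_vm_ops;
	vma->vm_private_data = buffer;
	return 0;
}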