Doing this in vmw_ttm_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
Signed-off-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index b0973c27e774..904031d03dbe 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) struct vmw_ttm_tt *vmw_be = container_of(ttm, struct vmw_ttm_tt, dma_ttm);
- vmw_ttm_unbind(bdev, ttm); ttm_tt_destroy_common(bdev, ttm); vmw_ttm_unmap_dma(vmw_be); - if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent) - ttm_tt_fini(&vmw_be->dma_ttm); - else - ttm_tt_fini(ttm); - + ttm_tt_fini(ttm); if (vmw_be->mob) vmw_mob_destroy(vmw_be->mob);
@@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev, dma_ttm); unsigned int i;
+ vmw_ttm_unbind(bdev, ttm); + if (vmw_tt->mob) { vmw_mob_destroy(vmw_tt->mob); vmw_tt->mob = NULL;
Doing this in amdgpu_ttm_backend_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
Signed-off-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index b46726e47bce..42b5162814b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1070,7 +1070,6 @@ static void amdgpu_ttm_backend_destroy(struct ttm_device *bdev, { struct amdgpu_ttm_tt *gtt = (void *)ttm;
- amdgpu_ttm_backend_unbind(bdev, ttm); ttm_tt_destroy_common(bdev, ttm); if (gtt->usertask) put_task_struct(gtt->usertask); @@ -1154,6 +1153,8 @@ static void amdgpu_ttm_tt_unpopulate(struct ttm_device *bdev, struct amdgpu_ttm_tt *gtt = (void *)ttm; struct amdgpu_device *adev;
+ amdgpu_ttm_backend_unbind(bdev, ttm); + if (gtt && gtt->userptr) { amdgpu_ttm_tt_set_user_pages(ttm, NULL); kfree(ttm->sg);
Doing this in nouveau_ttm_tt_destroy()/nouveau_sgdma_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
Signed-off-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/nouveau/nouveau_bo.c | 3 ++- drivers/gpu/drm/nouveau/nouveau_sgdma.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 085023624fb0..5f309a4ec211 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1276,6 +1276,8 @@ nouveau_ttm_tt_unpopulate(struct ttm_device *bdev, if (slave) return;
+ nouveau_ttm_tt_unbind(bdev, ttm); + drm = nouveau_bdev(bdev); dev = drm->dev->dev;
@@ -1289,7 +1291,6 @@ nouveau_ttm_tt_destroy(struct ttm_device *bdev, #if IS_ENABLED(CONFIG_AGP) struct nouveau_drm *drm = nouveau_bdev(bdev); if (drm->agp.bridge) { - ttm_agp_unbind(ttm); ttm_tt_destroy_common(bdev, ttm); ttm_agp_destroy(ttm); return; diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c index 256ec5b35473..bde92a9dae7a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c +++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c @@ -21,7 +21,6 @@ nouveau_sgdma_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)ttm;
if (ttm) { - nouveau_sgdma_unbind(bdev, ttm); ttm_tt_destroy_common(bdev, ttm); ttm_tt_fini(&nvbe->ttm); kfree(nvbe);
Doing this in radeon_ttm_tt_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
Signed-off-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/radeon/radeon_ttm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index a06d4cc2fb1c..ee343b76db54 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -488,9 +488,7 @@ static void radeon_ttm_backend_destroy(struct ttm_device *bdev, struct ttm_tt *t { struct radeon_ttm_tt *gtt = (void *)ttm;
- radeon_ttm_backend_unbind(bdev, ttm); ttm_tt_destroy_common(bdev, ttm); - ttm_tt_fini(>t->ttm); kfree(gtt); } @@ -574,6 +572,8 @@ static void radeon_ttm_tt_unpopulate(struct ttm_device *bdev, struct ttm_tt *ttm struct radeon_ttm_tt *gtt = radeon_ttm_tt_to_gtt(rdev, ttm); bool slave = !!(ttm->page_flags & TTM_PAGE_FLAG_SG);
+ radeon_ttm_tt_unbind(bdev, ttm); + if (gtt && gtt->userptr) { kfree(ttm->sg); ttm->page_flags &= ~TTM_PAGE_FLAG_SG; @@ -651,7 +651,6 @@ static void radeon_ttm_tt_destroy(struct ttm_device *bdev, struct radeon_device *rdev = radeon_get_rdev(bdev);
if (rdev->flags & RADEON_IS_AGP) { - ttm_agp_unbind(ttm); ttm_tt_destroy_common(bdev, ttm); ttm_agp_destroy(ttm); return;
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
Instead change the drivers to unbind their page tables during unpopulate and merge those things back into ttm_tt_destroy() again.
This reverts commit 7626168fd132009c79a0457bccc58014abc738f5.
Signed-off-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 - drivers/gpu/drm/drm_gem_vram_helper.c | 1 - drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 1 - drivers/gpu/drm/nouveau/nouveau_bo.c | 1 - drivers/gpu/drm/nouveau/nouveau_sgdma.c | 1 - drivers/gpu/drm/qxl/qxl_ttm.c | 1 - drivers/gpu/drm/radeon/radeon_ttm.c | 2 -- drivers/gpu/drm/ttm/ttm_tt.c | 7 +------ drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 1 - include/drm/ttm/ttm_tt.h | 7 ------- 10 files changed, 1 insertion(+), 22 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 42b5162814b1..6a6e04b64a80 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1070,7 +1070,6 @@ static void amdgpu_ttm_backend_destroy(struct ttm_device *bdev, { struct amdgpu_ttm_tt *gtt = (void *)ttm;
- ttm_tt_destroy_common(bdev, ttm); if (gtt->usertask) put_task_struct(gtt->usertask);
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c index 1e9b82e51a07..cc81fbac1a13 100644 --- a/drivers/gpu/drm/drm_gem_vram_helper.c +++ b/drivers/gpu/drm/drm_gem_vram_helper.c @@ -846,7 +846,6 @@ static const struct drm_gem_object_funcs drm_gem_vram_object_funcs = {
static void bo_driver_ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *tt) { - ttm_tt_destroy_common(bdev, tt); ttm_tt_fini(tt); kfree(tt); } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index bf33724bed5c..e646aac9d7a4 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -118,7 +118,6 @@ static void i915_ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) { struct i915_ttm_tt *i915_tt = container_of(ttm, typeof(*i915_tt), ttm);
- ttm_tt_destroy_common(bdev, ttm); kfree(i915_tt); }
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 5f309a4ec211..3177f89cf000 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1291,7 +1291,6 @@ nouveau_ttm_tt_destroy(struct ttm_device *bdev, #if IS_ENABLED(CONFIG_AGP) struct nouveau_drm *drm = nouveau_bdev(bdev); if (drm->agp.bridge) { - ttm_tt_destroy_common(bdev, ttm); ttm_agp_destroy(ttm); return; } diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c index bde92a9dae7a..85c03c83259b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c +++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c @@ -21,7 +21,6 @@ nouveau_sgdma_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) struct nouveau_sgdma_be *nvbe = (struct nouveau_sgdma_be *)ttm;
if (ttm) { - ttm_tt_destroy_common(bdev, ttm); ttm_tt_fini(&nvbe->ttm); kfree(nvbe); } diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c index 19fd39d9a00c..e91d8154143e 100644 --- a/drivers/gpu/drm/qxl/qxl_ttm.c +++ b/drivers/gpu/drm/qxl/qxl_ttm.c @@ -101,7 +101,6 @@ int qxl_ttm_io_mem_reserve(struct ttm_device *bdev, */ static void qxl_ttm_backend_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) { - ttm_tt_destroy_common(bdev, ttm); ttm_tt_fini(ttm); kfree(ttm); } diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index ee343b76db54..7793249bc549 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -488,7 +488,6 @@ static void radeon_ttm_backend_destroy(struct ttm_device *bdev, struct ttm_tt *t { struct radeon_ttm_tt *gtt = (void *)ttm;
- ttm_tt_destroy_common(bdev, ttm); ttm_tt_fini(>t->ttm); kfree(gtt); } @@ -651,7 +650,6 @@ static void radeon_ttm_tt_destroy(struct ttm_device *bdev, struct radeon_device *rdev = radeon_get_rdev(bdev);
if (rdev->flags & RADEON_IS_AGP) { - ttm_tt_destroy_common(bdev, ttm); ttm_agp_destroy(ttm); return; } diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index 24031a8acd2d..41f38d9c3e1c 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -123,7 +123,7 @@ static int ttm_sg_tt_alloc_page_directory(struct ttm_tt *ttm) return 0; }
-void ttm_tt_destroy_common(struct ttm_device *bdev, struct ttm_tt *ttm) +void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) { ttm_tt_unpopulate(bdev, ttm);
@@ -131,11 +131,6 @@ void ttm_tt_destroy_common(struct ttm_device *bdev, struct ttm_tt *ttm) fput(ttm->swap_storage);
ttm->swap_storage = NULL; -} -EXPORT_SYMBOL(ttm_tt_destroy_common); - -void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) -{ bdev->funcs->ttm_tt_destroy(bdev, ttm); }
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index 904031d03dbe..f35bdc1cb197 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -526,7 +526,6 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) struct vmw_ttm_tt *vmw_be = container_of(ttm, struct vmw_ttm_tt, dma_ttm);
- ttm_tt_destroy_common(bdev, ttm); vmw_ttm_unmap_dma(vmw_be); ttm_tt_fini(ttm); if (vmw_be->mob) diff --git a/include/drm/ttm/ttm_tt.h b/include/drm/ttm/ttm_tt.h index 818680c6a8ed..e402dab1d0f6 100644 --- a/include/drm/ttm/ttm_tt.h +++ b/include/drm/ttm/ttm_tt.h @@ -134,13 +134,6 @@ void ttm_tt_fini(struct ttm_tt *ttm); */ void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt *ttm);
-/** - * ttm_tt_destroy_common: - * - * Called from driver to destroy common path. - */ -void ttm_tt_destroy_common(struct ttm_device *bdev, struct ttm_tt *ttm); - /** * ttm_tt_swapin: *
On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
Doing this in vmw_ttm_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
So I wanted to review this series, and I can't reconcile your claim here with the demidlayering Dave has done. The driver patches here don't ouright undo what Dave has done, but that means the bug has been preexisting since forever (or is due to some other change?), and your commit message is a bit confusing here.
The final patch just undoes the demidlayering from Dave, and I really don't see where there's even a functional change there.
And even these patches here don't really change a hole lot with the calling sequence for at least final teardown: ttm_tt_destroy_common calls ttm_tt_unpopulate as the first thing, so at least there there's no change.
Can you pls elaborate more clearly what exactly you're fixing and what exactly needs to be reordered and where this bug is from (commit sha1)? As is I'm playing detective and the evidence presented is extremely since and I can't reconcile it at all.
I mean I know you don't like typing commit message and documentation, but it does get occasionally rather frustrating on the reviewer side if I have to interpolate between some very sparse hints for this stuff :-/ -Daniel
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index b0973c27e774..904031d03dbe 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) struct vmw_ttm_tt *vmw_be = container_of(ttm, struct vmw_ttm_tt, dma_ttm);
- vmw_ttm_unbind(bdev, ttm); ttm_tt_destroy_common(bdev, ttm); vmw_ttm_unmap_dma(vmw_be);
- if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
ttm_tt_fini(&vmw_be->dma_ttm);
- else
ttm_tt_fini(ttm);
- ttm_tt_fini(ttm); if (vmw_be->mob) vmw_mob_destroy(vmw_be->mob);
@@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev, dma_ttm); unsigned int i;
- vmw_ttm_unbind(bdev, ttm);
- if (vmw_tt->mob) { vmw_mob_destroy(vmw_tt->mob); vmw_tt->mob = NULL;
-- 2.25.1
Am 23.07.21 um 10:47 schrieb Daniel Vetter:
On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
Doing this in vmw_ttm_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
So I wanted to review this series, and I can't reconcile your claim here with the demidlayering Dave has done. The driver patches here don't ouright undo what Dave has done, but that means the bug has been preexisting since forever (or is due to some other change?), and your commit message is a bit confusing here.
The final patch just undoes the demidlayering from Dave, and I really don't see where there's even a functional change there.
And even these patches here don't really change a hole lot with the calling sequence for at least final teardown: ttm_tt_destroy_common calls ttm_tt_unpopulate as the first thing, so at least there there's no change.
Can you pls elaborate more clearly what exactly you're fixing and what exactly needs to be reordered and where this bug is from (commit sha1)? As is I'm playing detective and the evidence presented is extremely since and I can't reconcile it at all.
I mean I know you don't like typing commit message and documentation, but it does get occasionally rather frustrating on the reviewer side if I have to interpolate between some very sparse hints for this stuff :-/
Yeah, when have seen the history it's rather obvious what's wrong here and I expected Dave to review it himself.
Previously we had three states in TTM for a tt object: Allocated -> Populated -> Bound which on destruction where done in the order unbind -> unpopulate -> free.
Dave moved handling of the bound state into the drivers since it is basically a driver decision and not a TTM decision what should be bound and what not (that part perfectly makes sense).
The problem is that he also moved doing the unbind into the free callback instead of the unpopulate callback. This result in stale page pointers in the GART if that unpopulate operation isn't immediately followed by a free.
Thinking more about it if we do populated->unpopulated->populated then we would also have stale pointers to the old pages which is even worse.
This is also not de-midlayering since we already have a proper ttm_tt_init()/ttm_tt_fini() functions which should work nicely for the tt object.
Christian.
-Daniel
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index b0973c27e774..904031d03dbe 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) struct vmw_ttm_tt *vmw_be = container_of(ttm, struct vmw_ttm_tt, dma_ttm);
- vmw_ttm_unbind(bdev, ttm); ttm_tt_destroy_common(bdev, ttm); vmw_ttm_unmap_dma(vmw_be);
- if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
ttm_tt_fini(&vmw_be->dma_ttm);
- else
ttm_tt_fini(ttm);
- ttm_tt_fini(ttm); if (vmw_be->mob) vmw_mob_destroy(vmw_be->mob);
@@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev, dma_ttm); unsigned int i;
- vmw_ttm_unbind(bdev, ttm);
- if (vmw_tt->mob) { vmw_mob_destroy(vmw_tt->mob); vmw_tt->mob = NULL;
-- 2.25.1
On Fri, Jul 23, 2021 at 11:13 AM Christian König ckoenig.leichtzumerken@gmail.com wrote:
Am 23.07.21 um 10:47 schrieb Daniel Vetter:
On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
Doing this in vmw_ttm_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
So I wanted to review this series, and I can't reconcile your claim here with the demidlayering Dave has done. The driver patches here don't ouright undo what Dave has done, but that means the bug has been preexisting since forever (or is due to some other change?), and your commit message is a bit confusing here.
The final patch just undoes the demidlayering from Dave, and I really don't see where there's even a functional change there.
And even these patches here don't really change a hole lot with the calling sequence for at least final teardown: ttm_tt_destroy_common calls ttm_tt_unpopulate as the first thing, so at least there there's no change.
Can you pls elaborate more clearly what exactly you're fixing and what exactly needs to be reordered and where this bug is from (commit sha1)? As is I'm playing detective and the evidence presented is extremely since and I can't reconcile it at all.
I mean I know you don't like typing commit message and documentation, but it does get occasionally rather frustrating on the reviewer side if I have to interpolate between some very sparse hints for this stuff :-/
Yeah, when have seen the history it's rather obvious what's wrong here and I expected Dave to review it himself.
Previously we had three states in TTM for a tt object: Allocated -> Populated -> Bound which on destruction where done in the order unbind -> unpopulate -> free.
Dave moved handling of the bound state into the drivers since it is basically a driver decision and not a TTM decision what should be bound and what not (that part perfectly makes sense).
I haven't reviewed all the patches from Dave, only the one you pointed at (in the last patch). And that one I still can't match up with your description. If there's other commits relevant, can you pls dig them out?
Like it all makes sense what you're saying and matches the code, I just can't match it up with the commit you're referencing.
The problem is that he also moved doing the unbind into the free callback instead of the unpopulate callback. This result in stale page pointers in the GART if that unpopulate operation isn't immediately followed by a free.
Thinking more about it if we do populated->unpopulated->populated then we would also have stale pointers to the old pages which is even worse.
This is also not de-midlayering since we already have a proper ttm_tt_init()/ttm_tt_fini() functions which should work nicely for the tt object.
Well you're last patch moves the ttm_tt_destroy_common stuff back into ttm, which kinda is de-demidlayering. So I'm confused.
Other bit: I think it'd be good to document this properly in the callbacks, and maybe ideally go about and kerneldoc-ify the entire ttm_tt.h header. Otherwise when we eventually (never?) get around to that, everyone has forgotten these semantic details and issues again. -Daniel
Christian.
-Daniel
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index b0973c27e774..904031d03dbe 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) struct vmw_ttm_tt *vmw_be = container_of(ttm, struct vmw_ttm_tt, dma_ttm);
- vmw_ttm_unbind(bdev, ttm); ttm_tt_destroy_common(bdev, ttm); vmw_ttm_unmap_dma(vmw_be);
- if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
ttm_tt_fini(&vmw_be->dma_ttm);
- else
ttm_tt_fini(ttm);
- ttm_tt_fini(ttm); if (vmw_be->mob) vmw_mob_destroy(vmw_be->mob);
@@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev, dma_ttm); unsigned int i;
- vmw_ttm_unbind(bdev, ttm);
- if (vmw_tt->mob) { vmw_mob_destroy(vmw_tt->mob); vmw_tt->mob = NULL;
-- 2.25.1
Am 23.07.21 um 11:21 schrieb Daniel Vetter:
On Fri, Jul 23, 2021 at 11:13 AM Christian König ckoenig.leichtzumerken@gmail.com wrote:
Am 23.07.21 um 10:47 schrieb Daniel Vetter:
On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
Doing this in vmw_ttm_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
So I wanted to review this series, and I can't reconcile your claim here with the demidlayering Dave has done. The driver patches here don't ouright undo what Dave has done, but that means the bug has been preexisting since forever (or is due to some other change?), and your commit message is a bit confusing here.
The final patch just undoes the demidlayering from Dave, and I really don't see where there's even a functional change there.
And even these patches here don't really change a hole lot with the calling sequence for at least final teardown: ttm_tt_destroy_common calls ttm_tt_unpopulate as the first thing, so at least there there's no change.
Can you pls elaborate more clearly what exactly you're fixing and what exactly needs to be reordered and where this bug is from (commit sha1)? As is I'm playing detective and the evidence presented is extremely since and I can't reconcile it at all.
I mean I know you don't like typing commit message and documentation, but it does get occasionally rather frustrating on the reviewer side if I have to interpolate between some very sparse hints for this stuff :-/
Yeah, when have seen the history it's rather obvious what's wrong here and I expected Dave to review it himself.
Previously we had three states in TTM for a tt object: Allocated -> Populated -> Bound which on destruction where done in the order unbind -> unpopulate -> free.
Dave moved handling of the bound state into the drivers since it is basically a driver decision and not a TTM decision what should be bound and what not (that part perfectly makes sense).
I haven't reviewed all the patches from Dave, only the one you pointed at (in the last patch). And that one I still can't match up with your description. If there's other commits relevant, can you pls dig them out?
Like it all makes sense what you're saying and matches the code, I just can't match it up with the commit you're referencing.
That is the patch directly following the one I've mentioned:
commit 37bff6542c4e140a11657406c1bab50a40329cc1 Author: Dave Airlie airlied@redhat.com Date: Thu Sep 17 13:24:50 2020 +1000
drm/ttm: move unbind into the tt destroy.
This moves unbind into the driver side on destroy paths.
I will add a Fixes tag to make that clear.
But this patch also just moves the undbind from the TTM destroy path to the driver destroy path.
To be honest I'm not 100% sure either when the when the unbind moved from the unpopulate path into the destroy path, but I think that this wasn't always the case. Let me try to dig that up.
The problem is that he also moved doing the unbind into the free callback instead of the unpopulate callback. This result in stale page pointers in the GART if that unpopulate operation isn't immediately followed by a free.
Thinking more about it if we do populated->unpopulated->populated then we would also have stale pointers to the old pages which is even worse.
This is also not de-midlayering since we already have a proper ttm_tt_init()/ttm_tt_fini() functions which should work nicely for the tt object.
Well you're last patch moves the ttm_tt_destroy_common stuff back into ttm, which kinda is de-demidlayering. So I'm confused.
Ah, yes that is correct. I've also considered to move this in ttm_tt_fini instead of there.
But that would be a larger change and I wanted to fix the problem at hand first, potentially even adding a CC stable tag.
Other bit: I think it'd be good to document this properly in the callbacks, and maybe ideally go about and kerneldoc-ify the entire ttm_tt.h header. Otherwise when we eventually (never?) get around to that, everyone has forgotten these semantic details and issues again.
Already working towards including more of the TTM headers and code files in kerneldoc. But not quite there yet.
But you know, normal human: Only equipped with one head and two hands and not cloneable.
Cheers, Christian.
-Daniel
Christian.
-Daniel
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index b0973c27e774..904031d03dbe 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -526,14 +526,9 @@ static void vmw_ttm_destroy(struct ttm_device *bdev, struct ttm_tt *ttm) struct vmw_ttm_tt *vmw_be = container_of(ttm, struct vmw_ttm_tt, dma_ttm);
- vmw_ttm_unbind(bdev, ttm); ttm_tt_destroy_common(bdev, ttm); vmw_ttm_unmap_dma(vmw_be);
- if (vmw_be->dev_priv->map_mode == vmw_dma_alloc_coherent)
ttm_tt_fini(&vmw_be->dma_ttm);
- else
ttm_tt_fini(ttm);
- ttm_tt_fini(ttm); if (vmw_be->mob) vmw_mob_destroy(vmw_be->mob);
@@ -578,6 +573,8 @@ static void vmw_ttm_unpopulate(struct ttm_device *bdev, dma_ttm); unsigned int i;
- vmw_ttm_unbind(bdev, ttm);
if (vmw_tt->mob) { vmw_mob_destroy(vmw_tt->mob); vmw_tt->mob = NULL;
-- 2.25.1
On Fri, 23 Jul 2021 at 19:40, Christian König ckoenig.leichtzumerken@gmail.com wrote:
Am 23.07.21 um 11:21 schrieb Daniel Vetter:
On Fri, Jul 23, 2021 at 11:13 AM Christian König ckoenig.leichtzumerken@gmail.com wrote:
Am 23.07.21 um 10:47 schrieb Daniel Vetter:
On Thu, Jul 22, 2021 at 02:41:23PM +0200, Christian König wrote:
Doing this in vmw_ttm_destroy() is to late.
It turned out that this is not a good idea at all because it leaves pointers to freed up system memory pages in the GART tables of the drivers.
So I wanted to review this series, and I can't reconcile your claim here with the demidlayering Dave has done. The driver patches here don't ouright undo what Dave has done, but that means the bug has been preexisting since forever (or is due to some other change?), and your commit message is a bit confusing here.
The final patch just undoes the demidlayering from Dave, and I really don't see where there's even a functional change there.
And even these patches here don't really change a hole lot with the calling sequence for at least final teardown: ttm_tt_destroy_common calls ttm_tt_unpopulate as the first thing, so at least there there's no change.
Can you pls elaborate more clearly what exactly you're fixing and what exactly needs to be reordered and where this bug is from (commit sha1)? As is I'm playing detective and the evidence presented is extremely since and I can't reconcile it at all.
I mean I know you don't like typing commit message and documentation, but it does get occasionally rather frustrating on the reviewer side if I have to interpolate between some very sparse hints for this stuff :-/
Yeah, when have seen the history it's rather obvious what's wrong here and I expected Dave to review it himself.
Previously we had three states in TTM for a tt object: Allocated -> Populated -> Bound which on destruction where done in the order unbind -> unpopulate -> free.
Dave moved handling of the bound state into the drivers since it is basically a driver decision and not a TTM decision what should be bound and what not (that part perfectly makes sense).
I haven't reviewed all the patches from Dave, only the one you pointed at (in the last patch). And that one I still can't match up with your description. If there's other commits relevant, can you pls dig them out?
Like it all makes sense what you're saying and matches the code, I just can't match it up with the commit you're referencing.
That is the patch directly following the one I've mentioned:
commit 37bff6542c4e140a11657406c1bab50a40329cc1 Author: Dave Airlie airlied@redhat.com Date: Thu Sep 17 13:24:50 2020 +1000
drm/ttm: move unbind into the tt destroy. This moves unbind into the driver side on destroy paths.
I will add a Fixes tag to make that clear.
But this patch also just moves the undbind from the TTM destroy path to the driver destroy path.
To be honest I'm not 100% sure either when the when the unbind moved from the unpopulate path into the destroy path, but I think that this wasn't always the case. Let me try to dig that up.
The problem is that he also moved doing the unbind into the free callback instead of the unpopulate callback. This result in stale page pointers in the GART if that unpopulate operation isn't immediately followed by a free.
Thinking more about it if we do populated->unpopulated->populated then we would also have stale pointers to the old pages which is even worse.
This is also not de-midlayering since we already have a proper ttm_tt_init()/ttm_tt_fini() functions which should work nicely for the tt object.
Well you're last patch moves the ttm_tt_destroy_common stuff back into ttm, which kinda is de-demidlayering. So I'm confused.
Ah, yes that is correct. I've also considered to move this in ttm_tt_fini instead of there.
But that would be a larger change and I wanted to fix the problem at hand first, potentially even adding a CC stable tag.
Other bit: I think it'd be good to document this properly in the callbacks, and maybe ideally go about and kerneldoc-ify the entire ttm_tt.h header. Otherwise when we eventually (never?) get around to that, everyone has forgotten these semantic details and issues again.
Already working towards including more of the TTM headers and code files in kerneldoc. But not quite there yet.
But you know, normal human: Only equipped with one head and two hands and not cloneable.
I'm the same, but I'm not seeing where this problem happens at all, do we have any backtraces or demos for this?
I split bind/unbind into the driver, but the driver should still always be moving things to unbound states before an unpopulate is called, is there a driver doing something unexpected here?
at worst I'd like to see a WARN_ON put in first and a test in igt that triggers it, because right now I'm not see that path through the drivers/ttm that leads to unpopulated pages ending up happening while bound.
From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in
non-ghost path and there is no unbind, pipeline gutting is called from evict/validate, when there is no placement suggested for the object, is this case getting hit somewhere without the driver having previously unbound things?
ttm_tt_destroy_common: calls unpopulate, everyone calls this directly after unbinding ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT directly, we should always unbind first, this used to have an assert against that, ttm_tt_populate: call unpopulate in failure path
So any place I can see unpopulate getting called with a bound TT should be a bug, and fixed, we could protect against it better but I'm not seeing the need for this series to outright revert things back as helping.
Dave.
Am 26.07.21 um 02:03 schrieb Dave Airlie:
[SNIP]
But you know, normal human: Only equipped with one head and two hands and not cloneable.
I'm the same, but I'm not seeing where this problem happens at all, do we have any backtraces or demos for this?
I've stumbled over this while working on some patches which accidentally broke delayed delete and caused random memory corruption and was wondering how that even happened in the first place.
Having stale PTEs in the GART isn't a problem unless you break other things. Which is also the reason why I haven't added a CC stable yet.
I split bind/unbind into the driver, but the driver should still always be moving things to unbound states before an unpopulate is called, is there a driver doing something unexpected here?
Possible, I was only testing with amdgpu and the GART handling is rather special there (which was one of the reasons why we did that in the first place).
at worst I'd like to see a WARN_ON put in first and a test in igt that triggers it, because right now I'm not see that path through the drivers/ttm that leads to unpopulated pages ending up happening while bound.
From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in non-ghost path and there is no unbind, pipeline gutting is called from evict/validate, when there is no placement suggested for the object, is this case getting hit somewhere without the driver having previously unbound things?
Yes, that will hit absolutely. Most likely with VM page tables for amdgpu which uses this.
ttm_tt_destroy_common: calls unpopulate, everyone calls this directly after unbinding ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT directly, we should always unbind first, this used to have an assert against that, ttm_tt_populate: call unpopulate in failure path
unbinding was moved into the driver, so ttm_tt_swapout() won't unbind anything before calling unpopulate as far as I can see.
So any place I can see unpopulate getting called with a bound TT should be a bug, and fixed, we could protect against it better but I'm not seeing the need for this series to outright revert things back as helping.
I'm not reverting this because it is offhand wrong, but making sure the GART is clear before unpopulating the TT object sounds like the much more natural thing to do and the state machine is something I certainly don't see in the backend.
Regards, Christian.
Dave.
On Tue, 27 Jul 2021 at 05:35, Christian König ckoenig.leichtzumerken@gmail.com wrote:
Am 26.07.21 um 02:03 schrieb Dave Airlie:
[SNIP]
But you know, normal human: Only equipped with one head and two hands and not cloneable.
I'm the same, but I'm not seeing where this problem happens at all, do we have any backtraces or demos for this?
I've stumbled over this while working on some patches which accidentally broke delayed delete and caused random memory corruption and was wondering how that even happened in the first place.
Having stale PTEs in the GART isn't a problem unless you break other things. Which is also the reason why I haven't added a CC stable yet.
I split bind/unbind into the driver, but the driver should still always be moving things to unbound states before an unpopulate is called, is there a driver doing something unexpected here?
Possible, I was only testing with amdgpu and the GART handling is rather special there (which was one of the reasons why we did that in the first place).
at worst I'd like to see a WARN_ON put in first and a test in igt that triggers it, because right now I'm not see that path through the drivers/ttm that leads to unpopulated pages ending up happening while bound.
From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in non-ghost path and there is no unbind, pipeline gutting is called from evict/validate, when there is no placement suggested for the object, is this case getting hit somewhere without the driver having previously unbound things?
Yes, that will hit absolutely. Most likely with VM page tables for amdgpu which uses this.
My thing is here we moved binding/unbinding to the driver, if the driver has a bug I'd expect the fix to be in the driver side here. I think giving drivers control over something and enforcing it in the core/helpers is fine, but I don't think we should be adding back the midlayering.
ttm_tt_destroy_common: calls unpopulate, everyone calls this directly after unbinding ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT directly, we should always unbind first, this used to have an assert against that, ttm_tt_populate: call unpopulate in failure path
unbinding was moved into the driver, so ttm_tt_swapout() won't unbind anything before calling unpopulate as far as I can see.
But we won't call swapout on BOs that aren't in SYSTEM and to be in SYSTEM, the bo would have to go through the driver move function which will unbind it.
So any place I can see unpopulate getting called with a bound TT should be a bug, and fixed, we could protect against it better but I'm not seeing the need for this series to outright revert things back as helping.
I'm not reverting this because it is offhand wrong, but making sure the GART is clear before unpopulating the TT object sounds like the much more natural thing to do and the state machine is something I certainly don't see in the backend.
I don't think that's the core's responsibility anymore, I'm fine with adding code and even an flag that says if the the TT is bound/unbound that unpopulate can WARN_ON() against so we get a backtrace and track down where something is getting unpopulated without going through the driver move function to be unbound.
Dave.
On Tue, Jul 27, 2021 at 06:03:12AM +1000, Dave Airlie wrote:
On Tue, 27 Jul 2021 at 05:35, Christian König ckoenig.leichtzumerken@gmail.com wrote:
Am 26.07.21 um 02:03 schrieb Dave Airlie:
[SNIP]
But you know, normal human: Only equipped with one head and two hands and not cloneable.
I'm the same, but I'm not seeing where this problem happens at all, do we have any backtraces or demos for this?
I've stumbled over this while working on some patches which accidentally broke delayed delete and caused random memory corruption and was wondering how that even happened in the first place.
Having stale PTEs in the GART isn't a problem unless you break other things. Which is also the reason why I haven't added a CC stable yet.
I split bind/unbind into the driver, but the driver should still always be moving things to unbound states before an unpopulate is called, is there a driver doing something unexpected here?
Possible, I was only testing with amdgpu and the GART handling is rather special there (which was one of the reasons why we did that in the first place).
at worst I'd like to see a WARN_ON put in first and a test in igt that triggers it, because right now I'm not see that path through the drivers/ttm that leads to unpopulated pages ending up happening while bound.
From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in non-ghost path and there is no unbind, pipeline gutting is called from evict/validate, when there is no placement suggested for the object, is this case getting hit somewhere without the driver having previously unbound things?
Yes, that will hit absolutely. Most likely with VM page tables for amdgpu which uses this.
My thing is here we moved binding/unbinding to the driver, if the driver has a bug I'd expect the fix to be in the driver side here. I think giving drivers control over something and enforcing it in the core/helpers is fine, but I don't think we should be adding back the midlayering.
ttm_tt_destroy_common: calls unpopulate, everyone calls this directly after unbinding ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT directly, we should always unbind first, this used to have an assert against that, ttm_tt_populate: call unpopulate in failure path
unbinding was moved into the driver, so ttm_tt_swapout() won't unbind anything before calling unpopulate as far as I can see.
But we won't call swapout on BOs that aren't in SYSTEM and to be in SYSTEM, the bo would have to go through the driver move function which will unbind it.
So any place I can see unpopulate getting called with a bound TT should be a bug, and fixed, we could protect against it better but I'm not seeing the need for this series to outright revert things back as helping.
I'm not reverting this because it is offhand wrong, but making sure the GART is clear before unpopulating the TT object sounds like the much more natural thing to do and the state machine is something I certainly don't see in the backend.
I don't think that's the core's responsibility anymore, I'm fine with adding code and even an flag that says if the the TT is bound/unbound that unpopulate can WARN_ON() against so we get a backtrace and track down where something is getting unpopulated without going through the driver move function to be unbound.
Yeah I think sprinkling a few WARN_ON around to make sure drivers use the functions correctly is the right thing. Re-midlayering because we don't trust drivers to do things correctly isn't.
Aside from this principle I'll let you two duke out what's actually going on :-) -Daniel
Am 26.07.21 um 22:03 schrieb Dave Airlie:
On Tue, 27 Jul 2021 at 05:35, Christian König ckoenig.leichtzumerken@gmail.com wrote:
Am 26.07.21 um 02:03 schrieb Dave Airlie:
[SNIP]
But you know, normal human: Only equipped with one head and two hands and not cloneable.
I'm the same, but I'm not seeing where this problem happens at all, do we have any backtraces or demos for this?
I've stumbled over this while working on some patches which accidentally broke delayed delete and caused random memory corruption and was wondering how that even happened in the first place.
Having stale PTEs in the GART isn't a problem unless you break other things. Which is also the reason why I haven't added a CC stable yet.
I split bind/unbind into the driver, but the driver should still always be moving things to unbound states before an unpopulate is called, is there a driver doing something unexpected here?
Possible, I was only testing with amdgpu and the GART handling is rather special there (which was one of the reasons why we did that in the first place).
at worst I'd like to see a WARN_ON put in first and a test in igt that triggers it, because right now I'm not see that path through the drivers/ttm that leads to unpopulated pages ending up happening while bound.
From 5.14-rc3 unpopulate is called from ttm_bo_pipeline_gutting in non-ghost path and there is no unbind, pipeline gutting is called from evict/validate, when there is no placement suggested for the object, is this case getting hit somewhere without the driver having previously unbound things?
Yes, that will hit absolutely. Most likely with VM page tables for amdgpu which uses this.
My thing is here we moved binding/unbinding to the driver, if the driver has a bug I'd expect the fix to be in the driver side here. I think giving drivers control over something and enforcing it in the core/helpers is fine, but I don't think we should be adding back the midlayering.
Ok, then we are pretty much already on the same page here.
I've just reverted the patch because I thought it would be cleaner for eventually backporting it.
ttm_tt_destroy_common: calls unpopulate, everyone calls this directly after unbinding ttm_tt_swapout: calls unpopulate, we don't swapout objects from TT directly, we should always unbind first, this used to have an assert against that, ttm_tt_populate: call unpopulate in failure path
unbinding was moved into the driver, so ttm_tt_swapout() won't unbind anything before calling unpopulate as far as I can see.
But we won't call swapout on BOs that aren't in SYSTEM and to be in SYSTEM, the bo would have to go through the driver move function which will unbind it.
Ah, good point.
So any place I can see unpopulate getting called with a bound TT should be a bug, and fixed, we could protect against it better but I'm not seeing the need for this series to outright revert things back as helping.
I'm not reverting this because it is offhand wrong, but making sure the GART is clear before unpopulating the TT object sounds like the much more natural thing to do and the state machine is something I certainly don't see in the backend.
I don't think that's the core's responsibility anymore, I'm fine with adding code and even an flag that says if the the TT is bound/unbound that unpopulate can WARN_ON() against so we get a backtrace and track down where something is getting unpopulated without going through the driver move function to be unbound.
I was not talking about bound/unbound, but rather unpopulating before destroying.
The requirement we have is that the tt object is unpopulated before it is destroyed and the state machine of that object is essentially controlled by its BO and not the object itself.
I will prepare a patch making that cleaner.
Thanks, Christian.
Dave.
dri-devel@lists.freedesktop.org