From: Jerome Glisse jglisse@redhat.com
After an unrecovered GPU lockup, avoid any GPU activity so that kernel segfaults and the like cannot happen in any of the paths that assume the hardware is working.
cc: stable@vger.kernel.org
Signed-off-by: Jerome Glisse <jglisse@redhat.com>
---
 drivers/gpu/drm/radeon/radeon_device.c |  9 ++++---
 drivers/gpu/drm/radeon/radeon_object.c |  7 ++++++
 drivers/gpu/drm/radeon/radeon_ttm.c    | 41 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/ttm/ttm_tt.c           |  1 +
 4 files changed, 55 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 066c98b..653f352 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -993,16 +993,19 @@ int radeon_gpu_reset(struct radeon_device *rdev) /* block TTM */ resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev); radeon_suspend(rdev); + rdev->accel_working = false;
r = radeon_asic_reset(rdev); if (!r) { dev_info(rdev->dev, "GPU reset succeed\n"); radeon_resume(rdev); - radeon_restore_bios_scratch_regs(rdev); - drm_helper_resume_force_mode(rdev->ddev); - ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched); }
+ /* no matter what restore video mode */ + radeon_restore_bios_scratch_regs(rdev); + drm_helper_resume_force_mode(rdev->ddev); + ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched); + if (r) { /* bad news, how to tell it to userspace ? */ dev_info(rdev->dev, "GPU reset failed\n"); diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 830f1a7..27e8e53 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -89,6 +89,13 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) rbo->placement.lpfn = 0; rbo->placement.placement = rbo->placements; rbo->placement.busy_placement = rbo->placements; + if (!rbo->rdev->accel_working) { + /* for new bo to system ram when GPU is not working */ + rbo->placements[c++] = TTM_PL_MASK_CACHING | TTM_PL_FLAG_SYSTEM; + rbo->placement.num_placement = c; + rbo->placement.num_busy_placement = c; + return; + } if (domain & RADEON_GEM_DOMAIN_VRAM) rbo->placements[c++] = TTM_PL_FLAG_WC | TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_VRAM; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index c94a225..0994d1e 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -215,6 +215,25 @@ static void radeon_move_null(struct ttm_buffer_object *bo, new_mem->mm_node = NULL; }
+static void radeon_move_noop(struct ttm_buffer_object *bo, + struct ttm_mem_reg *new_mem) +{ + struct ttm_bo_device *bdev = bo->bdev; + struct ttm_mem_type_manager *man = &bdev->man[new_mem->mem_type]; + struct ttm_mem_reg *old_mem = &bo->mem; + struct ttm_mem_reg old_copy = *old_mem; + + *old_mem = *new_mem; + new_mem->mm_node = NULL; + + if ((man->flags & TTM_MEMTYPE_FLAG_FIXED) && (bo->ttm != NULL)) { + ttm_tt_destroy(bo->ttm); + bo->ttm = NULL; + } + + ttm_bo_mem_put(bo, &old_copy); +} + static int radeon_move_blit(struct ttm_buffer_object *bo, bool evict, int no_wait_reserve, bool no_wait_gpu, struct ttm_mem_reg *new_mem, @@ -399,6 +418,14 @@ static int radeon_bo_move(struct ttm_buffer_object *bo, radeon_move_null(bo, new_mem); return 0; } + if (!rdev->accel_working) { + /* when accel is not working GPU is in broken state just + * do nothing for any ttm operation to avoid making the + * situation worst than it's + */ + radeon_move_noop(bo, new_mem); + return 0; + } if ((old_mem->mem_type == TTM_PL_TT && new_mem->mem_type == TTM_PL_SYSTEM) || (old_mem->mem_type == TTM_PL_SYSTEM && @@ -545,6 +572,13 @@ static int radeon_ttm_backend_bind(struct ttm_tt *ttm, WARN(1, "nothing to bind %lu pages for mreg %p back %p!\n", ttm->num_pages, bo_mem, ttm); } + if (!gtt->rdev->accel_working) { + /* when accel is not working GPU is in broken state just + * do nothing for any ttm operation to avoid making the + * situation worst than it's + */ + return 0; + } r = radeon_gart_bind(gtt->rdev, gtt->offset, ttm->num_pages, ttm->pages, gtt->ttm.dma_address); if (r) { @@ -559,6 +593,13 @@ static int radeon_ttm_backend_unbind(struct ttm_tt *ttm) { struct radeon_ttm_tt *gtt = (void *)ttm;
+ if (!gtt->rdev->accel_working) { + /* when accel is not working GPU is in broken state just + * do nothing for any ttm operation to avoid making the + * situation worst than it's + */ + return 0; + } radeon_gart_unbind(gtt->rdev, gtt->offset, ttm->num_pages); return 0; } diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index fa09daf..f7bdb04 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -181,6 +181,7 @@ void ttm_tt_destroy(struct ttm_tt *ttm) ttm->swap_storage = NULL; ttm->func->destroy(ttm); } +EXPORT_SYMBOL(ttm_tt_destroy);
int ttm_tt_init(struct ttm_tt *ttm, struct ttm_bo_device *bdev, unsigned long size, uint32_t page_flags,
On Die, 2012-06-26 at 17:04 -0400, j.glisse@gmail.com wrote:
From: Jerome Glisse jglisse@redhat.com
After unrecovered GPU lockup avoid any GPU activities to avoid things like kernel segfault and alike to happen in any of the path that assume hw is working.
Has the patch been tested and confirmed to actually fix such a problem?
r = radeon_asic_reset(rdev); if (!r) { dev_info(rdev->dev, "GPU reset succeed\n"); radeon_resume(rdev);
- radeon_restore_bios_scratch_regs(rdev);
- drm_helper_resume_force_mode(rdev->ddev);
- ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
}
+ /* no matter what restore video mode */
+ radeon_restore_bios_scratch_regs(rdev);
+ drm_helper_resume_force_mode(rdev->ddev);
+ ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
Maybe this should be in a separate patch.
@@ -399,6 +418,14 @@ static int radeon_bo_move(struct ttm_buffer_object *bo, radeon_move_null(bo, new_mem); return 0; }
+ if (!rdev->accel_working) {
+ /* when accel is not working GPU is in broken state just
+ * do nothing for any ttm operation to avoid making the
+ * situation worst than it's
'worse than it is', same in the following two hunks.
On Wed, Jun 27, 2012 at 5:19 AM, Michel Dänzer michel@daenzer.net wrote:
On Die, 2012-06-26 at 17:04 -0400, j.glisse@gmail.com wrote:
From: Jerome Glisse jglisse@redhat.com
After unrecovered GPU lockup avoid any GPU activities to avoid things like kernel segfault and alike to happen in any of the path that assume hw is working.
Has the patch been tested and confirmed to actually fix such a problem?
Yes, it has been tested; I don't send untested patches to the ML.
r = radeon_asic_reset(rdev); if (!r) { dev_info(rdev->dev, "GPU reset succeed\n"); radeon_resume(rdev);
- radeon_restore_bios_scratch_regs(rdev);
- drm_helper_resume_force_mode(rdev->ddev);
- ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
}
+ /* no matter what restore video mode */
+ radeon_restore_bios_scratch_regs(rdev);
+ drm_helper_resume_force_mode(rdev->ddev);
+ ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
Maybe this should be in a separate patch.
The idea is to send this patch to stable, thus having one patch that has it all.
@@ -399,6 +418,14 @@ static int radeon_bo_move(struct ttm_buffer_object *bo, radeon_move_null(bo, new_mem); return 0; }
+ if (!rdev->accel_working) {
+ /* when accel is not working GPU is in broken state just
+ * do nothing for any ttm operation to avoid making the
+ * situation worst than it's
'worse than it is', same in the following two hunks.
Cheers, Jerome
On Mit, 2012-06-27 at 10:49 -0400, Jerome Glisse wrote:
On Wed, Jun 27, 2012 at 5:19 AM, Michel Dänzer michel@daenzer.net wrote:
On Die, 2012-06-26 at 17:04 -0400, j.glisse@gmail.com wrote:
From: Jerome Glisse jglisse@redhat.com
After unrecovered GPU lockup avoid any GPU activities to avoid things like kernel segfault and alike to happen in any of the path that assume hw is working.
Has the patch been tested and confirmed to actually fix such a problem?
Yes it has been tested i dont send untested patch to ml.
I didn't expect (or mean to suggest) otherwise. I think I misread the related IRC conversation from last night: I thought you basically whipped up this patch in response to a report of such problems. But on re-reading now, I guess you wrote this patch a while ago and are just sending it now in response to the report on IRC.
r = radeon_asic_reset(rdev); if (!r) { dev_info(rdev->dev, "GPU reset succeed\n"); radeon_resume(rdev);
- radeon_restore_bios_scratch_regs(rdev);
- drm_helper_resume_force_mode(rdev->ddev);
- ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
}
+ /* no matter what restore video mode */
+ radeon_restore_bios_scratch_regs(rdev);
+ drm_helper_resume_force_mode(rdev->ddev);
+ ttm_bo_unlock_delayed_workqueue(&rdev->mman.bdev, resched);
Maybe this should be in a separate patch.
Idea is to send this patch to stable thus having one patch that have it all.
That doesn't make sense. Either the changes belong into a single patch (but then the commit log should describe all of them) or not. They can be sent to stable[0] either way.
[0] Actually, patches with Cc: stable are picked up automagically once they hit mainline, there's no point in sending them there directly.
@@ -399,6 +418,14 @@ static int radeon_bo_move(struct ttm_buffer_object *bo, radeon_move_null(bo, new_mem); return 0; }
+ if (!rdev->accel_working) {
+ /* when accel is not working GPU is in broken state just
+ * do nothing for any ttm operation to avoid making the
+ * situation worst than it's
'worse than it is', same in the following two hunks.
Are you gonna fix these typos?
dri-devel@lists.freedesktop.org