+dri-devel
Please be sure to Cc dri-devel when you send out GPU scheduler patches.
On Thu, Mar 11, 2021 at 10:57 PM Jack Zhang <Jack.Zhang1@amd.com> wrote:
Re-insert bailing jobs into the scheduler's job list to avoid a memory leak.
Signed-off-by: Jack Zhang <Jack.Zhang1@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 8 ++++++--
drivers/gpu/drm/scheduler/sched_main.c     | 8 +++++++-
include/drm/gpu_scheduler.h                | 1 +
4 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 79b9cc73763f..86463b0f936e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4815,8 +4815,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, job ? job->base.id : -1);
/* even we skipped this reset, still need to set the job to guilty */
if (job)
if (job) { drm_sched_increase_karma(&job->base);
r = DRM_GPU_SCHED_STAT_BAILING;
} goto skip_recovery; }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 759b34799221..41390bdacd9e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -34,6 +34,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) struct amdgpu_job *job = to_amdgpu_job(s_job); struct amdgpu_task_info ti; struct amdgpu_device *adev = ring->adev;
int ret; memset(&ti, 0, sizeof(struct amdgpu_task_info));
@@ -52,8 +53,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ti.process_name, ti.tgid, ti.task_name, ti.pid);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
amdgpu_device_gpu_recover(ring->adev, job);
return DRM_GPU_SCHED_STAT_NOMINAL;
ret = amdgpu_device_gpu_recover(ring->adev, job);
if (ret == DRM_GPU_SCHED_STAT_BAILING)
return DRM_GPU_SCHED_STAT_BAILING;
else
return DRM_GPU_SCHED_STAT_NOMINAL; } else { drm_sched_suspend_timeout(&ring->sched); if (amdgpu_sriov_vf(adev))
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 92d8de24d0a1..a44f621fb5c4 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work) { struct drm_gpu_scheduler *sched; struct drm_sched_job *job;
int ret; sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
@@ -331,8 +332,13 @@ static void drm_sched_job_timedout(struct work_struct *work) list_del_init(&job->list); spin_unlock(&sched->job_list_lock);
job->sched->ops->timedout_job(job);
ret = job->sched->ops->timedout_job(job);
if (ret == DRM_GPU_SCHED_STAT_BAILING) {
spin_lock(&sched->job_list_lock);
list_add(&job->node, &sched->ring_mirror_list);
spin_unlock(&sched->job_list_lock);
} /* * Guilty job did complete and hence needs to be manually removed * See drm_sched_stop doc.
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 4ea8606d91fe..8093ac2427ef 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -210,6 +210,7 @@ enum drm_gpu_sched_stat { DRM_GPU_SCHED_STAT_NONE, /* Reserve 0 */ DRM_GPU_SCHED_STAT_NOMINAL, DRM_GPU_SCHED_STAT_ENODEV,
DRM_GPU_SCHED_STAT_BAILING,
};
/**
2.25.1
amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx