[AMD Official Use Only]
-----Original Message----- From: Koenig, Christian Christian.Koenig@amd.com Sent: Tuesday, December 14, 2021 4:01 PM To: Huang, Ray Ray.Huang@amd.com; dri-devel@lists.freedesktop.org; Daniel Vetter daniel.vetter@ffwll.ch; Sumit Semwal sumit.semwal@linaro.org Cc: amd-gfx@lists.freedesktop.org; linux-media@vger.kernel.org; Deucher, Alexander Alexander.Deucher@amd.com; Liu, Monk Monk.Liu@amd.com Subject: Re: [PATCH 2/3] drm/amdgpu: fix the fence timeline null pointer
Am 13.12.21 um 07:34 schrieb Huang Rui:
Initialize the flags for embedded fence in the job at dma_fence_init(). Otherwise, we will go a wrong way in amdgpu_fence_get_timeline_name callback and trigger a null pointer panic once we enabled the trace event here.
[ 156.131790] BUG: kernel NULL pointer dereference, address: 00000000000002a0 [ 156.131804] #PF: supervisor read access in kernel mode [ 156.131811] #PF: error_code(0x0000) - not-present page [ 156.131817] PGD 0 P4D 0 [ 156.131824] Oops: 0000 [#1] PREEMPT SMP PTI [ 156.131832] CPU: 6 PID: 1404 Comm: sdma0 Tainted: G OE 5.16.0-
rc1-custom #1
[ 156.131842] Hardware name: Gigabyte Technology Co., Ltd. Z170XP-SLI/Z170XP-SLI-CF, BIOS F20 11/04/2016 [ 156.131848] RIP: 0010:strlen+0x0/0x20 [ 156.131859] Code: 89 c0 c3 0f 1f 80 00 00 00 00 48 01 fe eb 0f 0f b6 07 38 d0 74 10 48 83 c7 01 84 c0 74 05 48 39 f7 75 ec 31 c0 c3 48 89 f8 c3 <80> 3f 00 74 10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 31 [ 156.131872] RSP: 0018:ffff9bd0018dbcf8 EFLAGS: 00010206 [ 156.131880] RAX: 00000000000002a0 RBX: ffff8d0305ef01b0 RCX: 000000000000000b [ 156.131888] RDX: ffff8d03772ab924 RSI: ffff8d0305ef01b0 RDI: 00000000000002a0 [ 156.131895] RBP: ffff9bd0018dbd60 R08: ffff8d03002094d0 R09: 0000000000000000 [ 156.131901] R10: 000000000000005e R11: 0000000000000065 R12: ffff8d03002094d0 [ 156.131907] R13: 000000000000001f R14: 0000000000070018 R15: 0000000000000007 [ 156.131914] FS: 0000000000000000(0000) GS:ffff8d062ed80000(0000) knlGS:0000000000000000 [ 156.131923] CS: 0010 DS: 0000 ES: 0000 CR0:
0000000080050033 [ 156.131929] CR2: 00000000000002a0 CR3: 000000001120a005 CR4: 00000000003706e0 [ 156.131937] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 156.131942] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 156.131949] Call Trace:
[ 156.131953] <TASK> [ 156.131957] ? trace_event_raw_event_dma_fence+0xcc/0x200 [ 156.131973] ? ring_buffer_unlock_commit+0x23/0x130 [ 156.131982] dma_fence_init+0x92/0xb0 [ 156.131993] amdgpu_fence_emit+0x10d/0x2b0 [amdgpu] [ 156.132302] amdgpu_ib_schedule+0x2f9/0x580 [amdgpu] [ 156.132586] amdgpu_job_run+0xed/0x220 [amdgpu]
Signed-off-by: Huang Rui ray.huang@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 3b7e86ea7167..e2aa71904278 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -76,7 +76,7 @@ void amdgpu_fence_slab_fini(void) /*
- Cast helper
*/ -static const struct dma_fence_ops amdgpu_fence_ops; +static struct dma_fence_ops amdgpu_fence_ops; static inline struct amdgpu_fence *to_amdgpu_fence(struct dma_fence
*f)
{ struct amdgpu_fence *__f = container_of(f, struct amdgpu_fence, base); @@ -158,21 +158,22 @@ int amdgpu_fence_emit(struct
amdgpu_ring *ring, struct dma_fence **f, struct amd
}
seq = ++ring->fence_drv.sync_seq;
- if (job != NULL && job->job_run_counter) {
- if (job && job->job_run_counter) { /* reinit seq for resubmitted jobs */ fence->seqno = seq;
set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT,
&fence->flags);
} else {
amdgpu_fence_ops.init_flags = 0;
if (job)
set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT,
&amdgpu_fence_ops.init_flags);
That is utterly nonsense. The amdgpu_fence_ops are global and can't be modified like that.
Please check the reply in patch1, we need initialize the fence and assign the flags together, otherwise it will trigger the panic once the trace event is enabled.
Thanks, Ray
Christian.
- dma_fence_init(fence, &amdgpu_fence_ops, &ring->fence_drv.lock, adev->fence_context + ring->idx, seq); }
- if (job != NULL) {
/* mark this fence has a parent job */
set_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT,
&fence->flags);
- }
- amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, seq, flags | AMDGPU_FENCE_FLAG_INT); pm_runtime_get_noresume(adev_to_drm(adev)->dev);
@@ -720,7 +721,7 @@ static void amdgpu_fence_release(struct
dma_fence *f)
call_rcu(&f->rcu, amdgpu_fence_free); }
-static const struct dma_fence_ops amdgpu_fence_ops = { +static struct dma_fence_ops amdgpu_fence_ops = { .get_driver_name = amdgpu_fence_get_driver_name, .get_timeline_name = amdgpu_fence_get_timeline_name, .enable_signaling = amdgpu_fence_enable_signaling,