The first tracepoint for a request is trace_dma_fence_init which is called in the ctor before we have properly setup the request->engine. So if it's a non-recycled request the rq->engine might be NULL, or some garbage value, which leads to a crash.
Since we are not permitted to use kmem_cache_zalloc() here with SLAB_TYPESAFE_BY_RCU, one approach is simply to return DRIVER_NAME. We can then revisit this later if we decide to get rid of SLAB_TYPESAFE_BY_RCU.
Fixes: 855e39e65cfc ("drm/i915: Initialise basic fence before acquiring seqno") Signed-off-by: Matthew Auld matthew.auld@intel.com Cc: Michael Mason michael.w.mason@intel.com Cc: Daniel Vetter daniel@ffwll.ch --- drivers/gpu/drm/i915/i915_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 1014c71cf7f5..55fa94bde22e 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -61,7 +61,7 @@ static struct i915_global_request {
static const char *i915_fence_get_driver_name(struct dma_fence *fence) { - return dev_name(to_request(fence)->engine->i915->drm.dev); + return DRIVER_NAME; }
static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
On Wed, Jun 16, 2021 at 01:28:33PM +0100, Matthew Auld wrote:
The first tracepoint for a request is trace_dma_fence_init which is called in the ctor before we have properly setup the request->engine. So if it's a non-recycled request the rq->engine might be NULL, or some garbage value, which leads to a crash.
I'd hammer it more in here that we cannot move the dma_fence_init out of the slab constructor (aka ctor), because that breaks the magic request recycling through SLAB_TYPESAFE_BY_RCU. You do explain it all, but in a rather dense fashion. More verbosity here would be good.
Since we are not permitted to use kmem_cache_zalloc() here with
This part I'm not really clear on ... what would zalloc break? Ideally references to the commits/bugs/functions in the code that would blow up would be good here.
SLAB_TYPESAFE_BY_RCU, one approach is simply to return DRIVER_NAME. We can then revisit this later if we decide to get rid of SLAB_TYPESAFE_BY_RCU.
Fixes: 855e39e65cfc ("drm/i915: Initialise basic fence before acquiring seqno") Signed-off-by: Matthew Auld matthew.auld@intel.com Cc: Michael Mason michael.w.mason@intel.com Cc: Daniel Vetter daniel@ffwll.ch
As a stop-gap the code change looks reasonable, but I think the commit message needs work and should include much more of the history/background of what we're hitting here.
Thanks, Daniel
drivers/gpu/drm/i915/i915_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 1014c71cf7f5..55fa94bde22e 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -61,7 +61,7 @@ static struct i915_global_request {
static const char *i915_fence_get_driver_name(struct dma_fence *fence) {
- return dev_name(to_request(fence)->engine->i915->drm.dev);
+ return DRIVER_NAME;
}
static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
2.26.3
This fixes the NULL pointer dereference in i915_fence_get_driver_name(), but now it crashes in i915_fence_get_timeline_name() with another NULL pointer dereference. It attempts to use i915_gem_context.name, which apparently also hasn't been initialized:
static const char *i915_fence_get_timeline_name(struct dma_fence *fence) { const struct i915_gem_context *ctx;
/* * The timeline struct (as part of the ppgtt underneath a context) * may be freed when the request is no longer in use by the GPU. * We could extend the life of a context to beyond that of all * fences, possibly keeping the hw resource around indefinitely, * or we just give them a false name. Since * dma_fence_ops.get_timeline_name is a debug feature, the occasional * lie seems justifiable. */ if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) return "signaled";
ctx = i915_request_gem_context(to_request(fence)); if (!ctx) return "[" DRIVER_NAME "]";
return ctx->name; }
<1>[ 414.327761] BUG: kernel NULL pointer dereference, address: 0000000000000020 <1>[ 414.327766] #PF: supervisor read access in kernel mode <1>[ 414.327768] #PF: error_code(0x0000) - not-present page <6>[ 414.327769] PGD 0 P4D 0 <4>[ 414.327772] Oops: 0000 [#1] PREEMPT SMP NOPTI <4>[ 414.327774] CPU: 3 PID: 1866 Comm: chrome Tainted: G U 5.4.125 #2 <4>[ 414.327776] Hardware name: Google Voxel/Voxel, BIOS Google_Voxel.13913.0.0 04/12/2021 <4>[ 414.327781] RIP: 0010:i915_fence_get_timeline_name+0x1d/0x37 <4>[ 414.327783] Code: 55 48 89 e5 48 c7 c0 ee d1 72 bd 5d c3 0f 1f 44 00 00 55 48 89 e5 48 8b 4f 30 48 c7 c0 b4 49 72 bd f6 c1 01 75 1c 48 8b 47 60 <48> 8b 40 20 48 85 c0 74 08 48 05 30 01 00 00 eb 07 48 c7 c0 79 7b <4>[ 414.327785] RSP: 0018:ffffa4a300d177b8 EFLAGS: 00010246 <4>[ 414.327787] RAX: 0000000000000000 RBX: ffffffffbd7a8680 RCX: 0000000000000000 <4>[ 414.327788] RDX: ffff9c27b3b0f4c0 RSI: ffff9c27b3b0f480 RDI: ffff9c27b3b0f480 <4>[ 414.327789] RBP: ffffa4a300d177b8 R08: 0000000000000000 R09: 0000000000000004 <4>[ 414.327791] R10: 0000000000000001 R11: ffffffffbca45536 R12: 0000000000000005 <4>[ 414.327792] R13: 0000000000000004 R14: ffff9c27b6c79bb0 R15: ffff9c27b3b0f480 <4>[ 414.327794] FS: 00007898a2965e00(0000) GS:ffff9c27b7f80000(0000) knlGS:0000000000000000 <4>[ 414.327795] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[ 414.327797] CR2: 0000000000000020 CR3: 000000022ded2006 CR4: 0000000000762ee0 <4>[ 414.327798] PKRU: 55555554 <4>[ 414.327799] Call Trace: <4>[ 414.327804] trace_event_raw_event_dma_fence+0xc9/0x1e5 <4>[ 414.327807] dma_fence_init+0xa6/0xca <4>[ 414.327809] __i915_request_ctor+0x7f/0xa9 <4>[ 414.327812] setup_object+0x88/0x8a <4>[ 414.327815] new_slab+0x22e/0x429 <4>[ 414.327817] ___slab_alloc+0x2c8/0x42e <4>[ 414.327820] ? __i915_request_create+0x68/0x238 <4>[ 414.327822] ? 
__i915_request_create+0x68/0x238 <4>[ 414.327824] __slab_alloc+0x3c/0x5f <4>[ 414.327826] kmem_cache_alloc+0x19b/0x201 <4>[ 414.327828] __i915_request_create+0x68/0x238 <4>[ 414.327830] i915_request_create+0x8a/0xca <4>[ 414.327833] i915_gem_do_execbuffer+0x12f9/0x1830 <4>[ 414.327837] i915_gem_execbuffer2_ioctl+0x157/0x398 <4>[ 414.327839] ? i915_gem_do_execbuffer+0x1830/0x1830 <4>[ 414.327840] drm_ioctl_kernel+0x94/0xf6 <4>[ 414.327842] drm_ioctl+0x276/0x39b <4>[ 414.327845] ? i915_gem_do_execbuffer+0x1830/0x1830 <4>[ 414.327847] do_vfs_ioctl+0x4f4/0x771 <4>[ 414.327849] ksys_ioctl+0x58/0x83 <4>[ 414.327851] __x64_sys_ioctl+0x1a/0x1e <4>[ 414.327853] do_syscall_64+0x54/0x7e <4>[ 414.327856] entry_SYSCALL_64_after_hwframe+0x44/0xa9 <4>[ 414.327859] RIP: 0033:0x7898a2f88f07
-----Original Message----- From: Auld, Matthew matthew.auld@intel.com Sent: Wednesday, June 16, 2021 5:29 AM To: intel-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org; Mason, Michael W michael.w.mason@intel.com; Daniel Vetter daniel@ffwll.ch Subject: [PATCH] drm/i915: return DRIVER_NAME for the fence driver name
The first tracepoint for a request is trace_dma_fence_init which is called in the ctor before we have properly setup the request->engine. So if it's a non-recycled request the rq->engine might be NULL, or some garbage value, which leads to a crash.
Since we are not permitted to use kmem_cache_zalloc() here with SLAB_TYPESAFE_BY_RCU, one approach is simply to return DRIVER_NAME. We can then revisit this later if we decide to get rid of SLAB_TYPESAFE_BY_RCU.
Fixes: 855e39e65cfc ("drm/i915: Initialise basic fence before acquiring seqno") Signed-off-by: Matthew Auld matthew.auld@intel.com Cc: Michael Mason michael.w.mason@intel.com Cc: Daniel Vetter daniel@ffwll.ch
drivers/gpu/drm/i915/i915_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 1014c71cf7f5..55fa94bde22e 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -61,7 +61,7 @@ static struct i915_global_request {
static const char *i915_fence_get_driver_name(struct dma_fence *fence) {
- return dev_name(to_request(fence)->engine->i915->drm.dev);
+ return DRIVER_NAME;
}
static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
2.26.3
Hi
Am 16.06.21 um 14:28 schrieb Matthew Auld:
The first tracepoint for a request is trace_dma_fence_init which is called in the ctor before we have properly setup the request->engine. So if it's a non-recycled request the rq->engine might be NULL, or some garbage value, which leads to a crash.
Since we are not permitted to use kmem_cache_zalloc() here with SLAB_TYPESAFE_BY_RCU, one approach is simply to return DRIVER_NAME. We can then revisit this later if we decide to get rid of SLAB_TYPESAFE_BY_RCU.
Fixes: 855e39e65cfc ("drm/i915: Initialise basic fence before acquiring seqno") Signed-off-by: Matthew Auld matthew.auld@intel.com Cc: Michael Mason michael.w.mason@intel.com Cc: Daniel Vetter daniel@ffwll.ch
drivers/gpu/drm/i915/i915_request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 1014c71cf7f5..55fa94bde22e 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -61,7 +61,7 @@ static struct i915_global_request {
static const char *i915_fence_get_driver_name(struct dma_fence *fence) {
- return dev_name(to_request(fence)->engine->i915->drm.dev);
+ return DRIVER_NAME;
There was recently a discussion about using struct drm_driver.name consistently throughout the source code. I'd suggest doing that here as well.
Best regards Thomas
}
static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
dri-devel@lists.freedesktop.org