When using e8860(gcn1) on arm64, the kernel crashed on drm/radeon:
[ 11.240414] pc : __memset+0x4c/0x188
[ 11.244101] lr : radeon_uvd_get_create_msg+0x114/0x1d0 [radeon]
[ 11.249995] sp : ffff00000d7eb700
[ 11.253295] x29: ffff00000d7eb700 x28: ffff8001f632a868
[ 11.258585] x27: 0000000000040000 x26: ffff00000de00000
[ 11.263875] x25: 0000000000000125 x24: 0000000000000001
[ 11.269168] x23: 0000000000000000 x22: 0000000000000005
[ 11.274459] x21: ffff00000df24000 x20: ffff8001f74b4000
[ 11.279753] x19: 0000000000124000 x18: 0000000000000020
[ 11.285043] x17: 0000000000000000 x16: 0000000000000000
[ 11.290336] x15: ffff000009309000 x14: ffffffffffffffff
[ 11.290340] x13: ffff0000094b6f88 x12: ffff0000094b6bd2
[ 11.290343] x11: ffff00000d7eb700 x10: ffff00000d7eb700
[ 11.306246] x9 : ffff00000d7eb700 x8 : ffff00000df2402c
[ 11.306254] x7 : 0000000000000000 x6 : ffff0000094b626a
[ 11.306257] x5 : 0000000000000000 x4 : 0000000000000004
[ 11.306262] x3 : ffffffffffffffff x2 : 0000000000000fd4
[ 11.306265] x1 : 0000000000000000 x0 : ffff00000df2402c
[ 11.306272] Call trace:
[ 11.306316]  __memset+0x4c/0x188
[ 11.306638]  uvd_v1_0_ib_test+0x70/0x1c0 [radeon]
[ 11.306758]  radeon_ib_ring_tests+0x54/0xe0 [radeon]
[ 11.309961] IPv6: ADDRCONF(NETDEV_UP): enp5s0f0: link is not ready
[ 11.354628]  radeon_device_init+0x53c/0xbdc [radeon]
[ 11.354693]  radeon_driver_load_kms+0x6c/0x1b0 [radeon]
[ 11.364788]  drm_dev_register+0x130/0x1c0
[ 11.364794]  drm_get_pci_dev+0x8c/0x14c
[ 11.372704]  radeon_pci_probe+0xb0/0x110 [radeon]
[ 11.372715]  local_pci_probe+0x3c/0xb0
[ 11.381129]  pci_device_probe+0x114/0x1b0
[ 11.385121]  really_probe+0x23c/0x400
[ 11.388757]  driver_probe_device+0xdc/0x130
[ 11.392921]  __driver_attach+0x128/0x150
[ 11.396826]  bus_for_each_dev+0x70/0xbc
[ 11.400643]  driver_attach+0x20/0x2c
[ 11.404201]  bus_add_driver+0x160/0x260
[ 11.408019]  driver_register+0x74/0x120
[ 11.411837]  __pci_register_driver+0x40/0x50
[ 11.416149]  radeon_init+0x78/0x1000 [radeon]
[ 11.420489]  do_one_initcall+0x54/0x154
[ 11.424310]  do_init_module+0x54/0x260
[ 11.428041]  load_module+0x1ccc/0x20b0
[ 11.431773]  __se_sys_finit_module+0xac/0x10c
[ 11.436109]  __arm64_sys_finit_module+0x18/0x20
[ 11.440622]  el0_svc_common+0x70/0x164
[ 11.444353]  el0_svc_handler+0x2c/0x80
[ 11.448084]  el0_svc+0x8/0xc
[ 11.450954] Code: d65f03c0 cb0803e4 f2400c84 54000080 (a9001d07)
The __memset call is obviously generated by gcc (8.3.1): it optimizes this for loop into a memset. But this breaks here, because the destination is a cpu_addr mapped to I/O memory. So just invoke `memset_io` directly, which does solve the problem.
Signed-off-by: chenli <chenli@uniontech.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 6 ++----
 drivers/gpu/drm/radeon/radeon_uvd.c     | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index 7c5b60e53482..4dccde7a9e83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -1187,8 +1187,7 @@ int amdgpu_uvd_get_create_msg(struct amdgpu_ring *ring, uint32_t handle,
 	msg[8] = cpu_to_le32(0x00000440);
 	msg[9] = cpu_to_le32(0x00000000);
 	msg[10] = cpu_to_le32(0x01b37000);
-	for (i = 11; i < 1024; ++i)
-		msg[i] = cpu_to_le32(0x0);
+	memset_io(&msg[11], 0x0, 1013 * sizeof(uint32_t));
 
 	return amdgpu_uvd_send_msg(ring, bo, true, fence);
 }
@@ -1212,8 +1211,7 @@ int amdgpu_uvd_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
 	msg[1] = cpu_to_le32(0x00000002);
 	msg[2] = cpu_to_le32(handle);
 	msg[3] = cpu_to_le32(0x00000000);
-	for (i = 4; i < 1024; ++i)
-		msg[i] = cpu_to_le32(0x0);
+	memset_io(&msg[4], 0x0, 1020 * sizeof(uint32_t));
 
 	return amdgpu_uvd_send_msg(ring, bo, direct, fence);
 }
diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c b/drivers/gpu/drm/radeon/radeon_uvd.c
index 57fb3eb3a4b4..2e2e737c4706 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -802,8 +802,7 @@ int radeon_uvd_get_create_msg(struct radeon_device *rdev, int ring,
 	msg[8] = cpu_to_le32(0x00000440);
 	msg[9] = cpu_to_le32(0x00000000);
 	msg[10] = cpu_to_le32(0x01b37000);
-	for (i = 11; i < 1024; ++i)
-		msg[i] = cpu_to_le32(0x0);
+	memset_io(&msg[11], 0x0, 1013 * sizeof(uint32_t));
 
 	r = radeon_uvd_send_msg(rdev, ring, addr, fence);
 	radeon_bo_unreserve(rdev->uvd.vcpu_bo);
@@ -831,8 +830,7 @@ int radeon_uvd_get_destroy_msg(struct radeon_device *rdev, int ring,
 	msg[1] = cpu_to_le32(0x00000002);
 	msg[2] = cpu_to_le32(handle);
 	msg[3] = cpu_to_le32(0x00000000);
-	for (i = 4; i < 1024; ++i)
-		msg[i] = cpu_to_le32(0x0);
+	memset_io(&msg[4], 0x0, 1020 * sizeof(uint32_t));
 
 	r = radeon_uvd_send_msg(rdev, ring, addr, fence);
 	radeon_bo_unreserve(rdev->uvd.vcpu_bo);
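To make the failure mode concrete, the two patterns differ roughly like this (an illustrative sketch with made-up names, not the driver code; `msg` stands for the kmap'ed UVD message buffer):

    #include <linux/io.h>
    #include <linux/kernel.h>

    #define MSG_DWORDS 1024

    /* Broken on some platforms: msg points into ioremap()ed VRAM, but the
     * compiler is free to turn this loop into a call to __memset(), which
     * on arm64 may use instructions (e.g. DC ZVA) that are invalid on
     * Device-type mappings. */
    static void clear_msg_loop(u32 __iomem *msg)
    {
            int i;

            for (i = 11; i < MSG_DWORDS; ++i)
                    msg[i] = cpu_to_le32(0x0);  /* direct store through __iomem */
    }

    /* Safe: memset_io() is defined to work on I/O mappings. */
    static void clear_msg_io(u32 __iomem *msg)
    {
            memset_io(&msg[11], 0x0, (MSG_DWORDS - 11) * sizeof(u32));
    }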
Am 16.12.20 um 06:41 schrieb Chen Li:
When using e8860(gcn1) on arm64, the kernel crashed on drm/radeon:
[SNIP]
The __memset call is obviously generated by gcc (8.3.1): it optimizes this for loop into a memset. But this breaks here, because the destination is a cpu_addr mapped to I/O memory. So just invoke `memset_io` directly, which does solve the problem.
Well interesting problem you stumbled over here, but the solution is quite a hack.
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
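For illustration, such an allocation could look roughly like this (a sketch against amdgpu's amdgpu_bo_create_kernel() helper; the real UVD code paths differ in detail):

    /* Put the 4KiB UVD message in GTT (system memory) instead of VRAM,
     * so the CPU-side writes are plain cacheable stores. */
    r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
                                AMDGPU_GEM_DOMAIN_GTT,
                                &bo, &gpu_addr, (void **)&msg);
    if (r)
            return r;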
For radeon I think the better approach would be to convert the direct memory writes into calls to writel().
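A minimal sketch of that conversion, assuming msg is annotated __iomem (writel() stores little-endian, matching the cpu_to_le32() in the original loop):

    for (i = 11; i < 1024; ++i)
            writel(0x0, &msg[i]);   /* value first, then the __iomem address */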
BTW: How does userspace work on arm64 then? The driver stack usually only works if mmio can be mapped directly.
Regards, Christian.
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
Am 16.12.20 um 06:41 schrieb Chen Li:
When using e8860(gcn1) on arm64, the kernel crashed on drm/radeon:
[SNIP]
The __memset call is obviously generated by gcc (8.3.1): it optimizes this for loop into a memset. But this breaks here, because the destination is a cpu_addr mapped to I/O memory. So just invoke `memset_io` directly, which does solve the problem.
Well interesting problem you stumbled over here, but the solution is quite a hack.
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
Thanks, I will try to dig in deeper. But what does "hardware restriction" mean here? I'm not familiar with the video driver stack and amd gpu, sorry.
For radeon I think the better approach would be to convert the direct memory writes into calls to writel().
Ok, so you mean the more proper way is to use writel instead of memset_io?
BTW: How does userspace work on arm64 then? The driver stack usually only works if mmio can be mapped directly.
I also posted two userspace issues on mesa, and you may be interested in them: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3954 https://gitlab.freedesktop.org/mesa/mesa/-/issues/3951 I pasted some virtual memory maps in userspace there. (and the two problems have bothered me for quite a long time.)
Regards, Christian.
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
Am 16.12.20 um 06:41 schrieb Chen Li:
When using e8860(gcn1) on arm64, the kernel crashed on drm/radeon: [SNIP] The __memset call is obviously generated by gcc (8.3.1): it optimizes this for loop into a memset. But this breaks here, because the destination is a cpu_addr mapped to I/O memory. So just invoke `memset_io` directly, which does solve the problem.
Well interesting problem you stumbled over here, but the solution is quite a hack.
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
Thanks, I will try to dig in deeper. But what does "hardware restriction" mean here? I'm not familiar with the video driver stack and amd gpu, sorry.
On older hardware (AGP days) the buffer had to be in VRAM (MMIO) memory, but on modern systems GTT (system memory) works as well.
For radeon I think the better approach would be to convert the direct memory writes into calls to writel().
Ok, so you mean the more proper way is to use writel instead of memset_io?
Well, it is a start. But I'm not sure if you will ever get that hardware working with this CPU.
BTW: How does userspace work on arm64 then? The driver stack usually only works if mmio can be mapped directly.
I also posted two userspace issues on mesa, and you may be interested in them: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3954 https://gitlab.freedesktop.org/mesa/mesa/-/issues/3951 I pasted some virtual memory maps in userspace there. (and the two problems have bothered me for quite a long time.)
I don't really see a solution for those problems.
See it is perfectly valid for an application to memset/memcpy on mmaped MMIO space which comes from OpenGL or Vulkan.
So your CPU simply won't work with the hardware. We could work around that with a couple of hacks, but this is pretty much a general problem.
Regards, Christian.
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
Am 16.12.20 um 06:41 schrieb Chen Li:
When using e8860(gcn1) on arm64, the kernel crashed on drm/radeon: [SNIP] The __memset call is obviously generated by gcc (8.3.1): it optimizes this for loop into a memset. But this breaks here, because the destination is a cpu_addr mapped to I/O memory. So just invoke `memset_io` directly, which does solve the problem.
Well interesting problem you stumbled over here, but the solution is quite a hack.
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
Thanks, I will try to dig in deeper. But what does "hardware restriction" mean here? I'm not familiar with the video driver stack and amd gpu, sorry.
On older hardware (AGP days) the buffer had to be in VRAM (MMIO) memory, but on modern systems GTT (system memory) works as well.
IIUC, e8860 can use amdgpu (I use radeon now) because its device id 6822 is in amdgpu's table. But I cannot tell whether e8860 has an iommu, and I cannot find an iommu from lspci, so the graphics translation table may not work here?
For radeon I think the better approach would be to convert the direct memory writes into calls to writel().
Ok, so you mean the more proper way is to use writel instead of memset_io?
Well, it is a start. But I'm not sure if you will ever get that hardware working with this CPU.
BTW: How does userspace work on arm64 then? The driver stack usually only works if mmio can be mapped directly.
I also posted two userspace issues on mesa, and you may be interested in them: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3954 https://gitlab.freedesktop.org/mesa/mesa/-/issues/3951 I pasted some virtual memory maps in userspace there. (and the two problems have bothered me for quite a long time.)
I don't really see a solution for those problems.
See it is perfectly valid for an application to memset/memcpy on mmaped MMIO space which comes from OpenGL or Vulkan.
So your CPU simply won't work with the hardware. We could work around that with a couple of hacks, but this is pretty much a general problem.
Regards, Christian.
Thanks! Can you provide some details about these hacks? Should I post another issue on the mailing list?
Am 17.12.20 um 02:07 schrieb Chen Li:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
No, not performance. See, standards like OpenGL and Vulkan as well as VA-API and VDPAU require that you can mmap() device memory and execute memset/memcpy on the memory from userspace.
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
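The userspace pattern those standards rely on is essentially the following (a minimal sketch; the file descriptor and mmap offset would come from a driver-specific GEM ioctl, elided here):

    #include <stddef.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/types.h>

    /* bo_fd/bo_offset/bo_size are placeholders for what a real driver
     * stack gets back from its GEM mmap ioctl. */
    static void *map_and_clear(int bo_fd, off_t bo_offset, size_t bo_size)
    {
            void *ptr = mmap(NULL, bo_size, PROT_READ | PROT_WRITE,
                             MAP_SHARED, bo_fd, bo_offset);
            if (ptr == MAP_FAILED)
                    return NULL;
            /* Legal per the OpenGL/Vulkan contract -- but if this is VRAM
             * behind a PCI BAR on a problematic Arm board, the wide or
             * unaligned accesses inside an optimized memset() can fault. */
            memset(ptr, 0, bo_size);
            return ptr;
    }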
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
Thanks, I will try to dig in deeper. But what does "hardware restriction" mean here? I'm not familiar with the video driver stack and amd gpu, sorry.
On older hardware (AGP days) the buffer had to be in VRAM (MMIO) memory, but on modern systems GTT (system memory) works as well.
IIUC, e8860 can use amdgpu (I use radeon now) because its device id 6822 is in amdgpu's table. But I cannot tell whether e8860 has an iommu, and I cannot find an iommu from lspci, so the graphics translation table may not work here?
That is not related to IOMMU. IOMMU is a feature of the CPU/motherboard. This is implemented using the GTT, i.e. the VM page tables inside the GPU.
And yes it should work I will prepare a patch for it.
BTW: How does userspace work on arm64 then? The driver stack usually only works if mmio can be mapped directly.
I also posted two userspace issues on mesa, and you may be interested in them: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3954 https://gitlab.freedesktop.org/mesa/mesa/-/issues/3951 I pasted some virtual memory maps in userspace there. (and the two problems have bothered me for quite a long time.)
I don't really see a solution for those problems.
See it is perfectly valid for an application to memset/memcpy on mmaped MMIO space which comes from OpenGL or Vulkan.
So your CPU simply won't work with the hardware. We could work around that with a couple of hacks, but this is pretty much a general problem.
Regards, Christian.
Thanks! Can you provide some details about these hacks? Should I post another issue on the mailing list?
Adjust the kernel and/or user space to never map VRAM to the CPU.
This violates the OpenGL/Vulkan specification in some ways. So not sure if that will work or not.
Regards, Christian.
On Thu, 17 Dec 2020 18:25:11 +0800, Christian König wrote:
Am 17.12.20 um 02:07 schrieb Chen Li:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
What's the userspace restriction here? mmap device memory?
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
No, not performance. See, standards like OpenGL and Vulkan as well as VA-API and VDPAU require that you can mmap() device memory and execute memset/memcpy on the memory from userspace.
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
Good to know, thanks! BTW, have you ever seen or heard of boards like mine which cannot mmap device memory from userspace correctly?
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
Thanks, I will try to dig in deeper. But what does "hardware restriction" mean here? I'm not familiar with the video driver stack and amd gpu, sorry.
On older hardware (AGP days) the buffer had to be in VRAM (MMIO) memory, but on modern systems GTT (system memory) works as well.
IIUC, e8860 can use amdgpu (I use radeon now) because its device id 6822 is in amdgpu's table. But I cannot tell whether e8860 has an iommu, and I cannot find an iommu from lspci, so the graphics translation table may not work here?
That is not related to IOMMU. IOMMU is a feature of the CPU/motherboard. This is implemented using the GTT, i.e. the VM page tables inside the GPU.
And yes it should work I will prepare a patch for it.
I think you mean mmu :) Refer to wikipedia: https://en.wikipedia.org/wiki/Input%E2%80%93output_memory_management_unit
In computing, an input–output memory management unit (IOMMU) is a memory management unit (MMU) that connects a direct-memory-access–capable (DMA-capable) I/O bus to the main memory. Like a traditional MMU, which translates CPU-visible virtual addresses to physical addresses, the IOMMU maps device-visible virtual addresses (also called device addresses or I/O addresses in this context) to physical addresses. Some units also provide memory protection from faulty or malicious devices. An example IOMMU is the graphics address remapping table (GART) used by AGP and PCI Express graphics cards on Intel Architecture and AMD computers.
GART should be another abbreviation of GTT (https://en.wikipedia.org/wiki/Graphics_address_remapping_table):
The graphics address remapping table (GART),[1] also known as the graphics aperture remapping table,[2] or graphics translation table (GTT),[3] is an I/O memory management unit (IOMMU) used by Accelerated Graphics Port (AGP) and PCI Express (PCIe) graphics cards.
BTW: How does userspace work on arm64 then? The driver stack usually only works if mmio can be mapped directly.
I also posted two userspace issues on mesa, and you may be interested in them: https://gitlab.freedesktop.org/mesa/mesa/-/issues/3954 https://gitlab.freedesktop.org/mesa/mesa/-/issues/3951 I pasted some virtual memory maps in userspace there. (and the two problems have bothered me for quite a long time.)
I don't really see a solution for those problems.
See it is perfectly valid for an application to memset/memcpy on mmaped MMIO space which comes from OpenGL or Vulkan.
So your CPU simply won't work with the hardware. We could work around that with a couple of hacks, but this is pretty much a general problem.
Regards, Christian.
Thanks! Can you provide some details about these hacks? Should I post another issue on the mailing list?
Adjust the kernel and/or user space to never map VRAM to the CPU.
This violates the OpenGL/Vulkan specification in some ways. So not sure if that will work or not.
Regards, Christian.
Well, if I never map vram to the cpu, then render nodes like renderD128 cannot be exposed to userspace any more, and opencl also can't take effect, right?
The final thing I still cannot figure out is why smplayer can play video with vaapi but cannot with vdpau? Both of them should use renderD128 to render.
Am 17.12.20 um 14:37 schrieb Chen Li:
On Thu, 17 Dec 2020 18:25:11 +0800, Christian König wrote:
Am 17.12.20 um 02:07 schrieb Chen Li:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
What's the userspace restriction here? mmap device memory?
Yes, exactly that.
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
No, not performance. See, standards like OpenGL and Vulkan as well as VA-API and VDPAU require that you can mmap() device memory and execute memset/memcpy on the memory from userspace.
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
Good to know, thanks! BTW, have you ever seen or heard of boards like mine which cannot mmap device memory from userspace correctly?
Unfortunately yes. We haven't been able to figure out what exactly goes wrong in those cases.
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
Thanks, I will try to dig in deeper. But what does "hardware restriction" mean here? I'm not familiar with the video driver stack and amd gpu, sorry.
On older hardware (AGP days) the buffer had to be in VRAM (MMIO) memory, but on modern systems GTT (system memory) works as well.
IIUC, e8860 can use amdgpu (I use radeon now) because its device id 6822 is in amdgpu's table. But I cannot tell whether e8860 has an iommu, and I cannot find an iommu from lspci, so the graphics translation table may not work here?
That is not related to IOMMU. IOMMU is a feature of the CPU/motherboard. This is implemented using the GTT, i.e. the VM page tables inside the GPU.
And yes it should work I will prepare a patch for it.
I think you mean mmu :)
No, I really meant IOMMU.
Refer to wikipedia: https://en.wikipedia.org/wiki/Input%E2%80%93output_memory_management_unit
In computing, an input–output memory management unit (IOMMU) is a memory management unit (MMU) that connects a direct-memory-access–capable (DMA-capable) I/O bus to the main memory. Like a traditional MMU, which translates CPU-visible virtual addresses to physical addresses, the IOMMU maps device-visible virtual addresses (also called device addresses or I/O addresses in this context) to physical addresses. Some units also provide memory protection from faulty or malicious devices. An example IOMMU is the graphics address remapping table (GART) used by AGP and PCI Express graphics cards on Intel Architecture and AMD computers.
Maybe somebody should clarify the wikipedia article a bit since this is too general and misleading.
The key difference is that today IOMMU usually refers to the MMU block in the PCIe root complex of the CPU.
GART should be another abbreviation of GTT (https://en.wikipedia.org/wiki/Graphics_address_remapping_table):
The graphics address remapping table (GART),[1] also known as the graphics aperture remapping table,[2] or graphics translation table (GTT),[3] is an I/O memory management unit (IOMMU) used by Accelerated Graphics Port (AGP) and PCI Express (PCIe) graphics cards.
GART or GTT refers to the translation tables graphics hardware use to access system memory.
Something like 15 years ago we used the IOMMU functionality from AGP to implement that. But modern hardware (PCIe) uses some specialized hardware in the GPU for that.
Regards, Christian.
On Thu, 17 Dec 2020 22:16:59 +0800, Christian König wrote:
Am 17.12.20 um 14:37 schrieb Chen Li:
On Thu, 17 Dec 2020 18:25:11 +0800, Christian König wrote:
Am 17.12.20 um 02:07 schrieb Chen Li:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
What's the userspace restriction here? mmap device memory?
Yes, exactly that.
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
No, not performance. See, standards like OpenGL and Vulkan as well as VA-API and VDPAU require that you can mmap() device memory and execute memset/memcpy on the memory from userspace.
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
Good to know, thanks! BTW, have you ever seen or heard of boards like mine which cannot mmap device memory from userspace correctly?
Unfortunately yes. We haven't been able to figure out what exactly goes wrong in those cases.
Ok, one more question: does only the e8860 have this issue, or do all radeon cards?
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
Thanks, I will try to dig in deeper. But what does "hardware restriction" mean here? I'm not familiar with the video driver stack and amd gpu, sorry.
On older hardware (AGP days) the buffer had to be in VRAM (MMIO) memory, but on modern systems GTT (system memory) works as well.
IIUC, e8860 can use amdgpu (I use radeon now) because its device id 6822 is in amdgpu's table. But I cannot tell whether e8860 has an iommu, and I cannot find an iommu from lspci, so the graphics translation table may not work here?
That is not related to IOMMU. IOMMU is a feature of the CPU/motherboard. This is implemented using the GTT, i.e. the VM page tables inside the GPU.
And yes it should work I will prepare a patch for it.
I think you mean mmu :)
No, I really meant IOMMU.
Refer to wikipedia: https://en.wikipedia.org/wiki/Input%E2%80%93output_memory_management_unit
In computing, an input–output memory management unit (IOMMU) is a memory management unit (MMU) that connects a direct-memory-access–capable (DMA-capable) I/O bus to the main memory. Like a traditional MMU, which translates CPU-visible virtual addresses to physical addresses, the IOMMU maps device-visible virtual addresses (also called device addresses or I/O addresses in this context) to physical addresses. Some units also provide memory protection from faulty or malicious devices. An example IOMMU is the graphics address remapping table (GART) used by AGP and PCI Express graphics cards on Intel Architecture and AMD computers.
Maybe somebody should clarify the wikipedia article a bit since this is too general and misleading.
The key difference is that today IOMMU usually refers to the MMU block in the PCIe root complex of the CPU.
GART should be another abbreviation of GTT (https://en.wikipedia.org/wiki/Graphics_address_remapping_table):
The graphics address remapping table (GART),[1] also known as the graphics aperture remapping table,[2] or graphics translation table (GTT),[3] is an I/O memory management unit (IOMMU) used by Accelerated Graphics Port (AGP) and PCI Express (PCIe) graphics cards.
GART or GTT refers to the translation tables graphics hardware use to access system memory.
Something like 15 years ago we used the IOMMU functionality from AGP to implement that. But modern hardware (PCIe) uses some specialized hardware in the GPU for that.
Regards, Christian.
Good to know, thanks! So the modern GART/GTT is like a TLB, and the iommu is focused on translating addresses and does not manage the TLB.
Am 18.12.20 um 04:51 schrieb Chen Li:
[SNIP]
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
Good to know, thanks! BTW, have you ever seen or heard of boards like mine which cannot mmap device memory from userspace correctly?
Unfortunately yes. We haven't been able to figure out what exactly goes wrong in those cases.
Ok, one more question: does only the e8860 have this issue, or do all radeon cards?
This applies to all hardware with dedicated memory which needs to be mapped to userspace.
That includes all graphics hardware from AMD as well as NVidia and probably a whole bunch of other PCIe devices.
The graphics address remapping table (GART),[1] also known as the graphics aperture remapping table,[2] or graphics translation table (GTT),[3] is an I/O memory management unit (IOMMU) used by Accelerated Graphics Port (AGP) and PCI Express (PCIe) graphics cards.
GART or GTT refers to the translation tables graphics hardware use to access system memory.
Something like 15 years ago we used the IOMMU functionality from AGP to implement that. But modern hardware (PCIe) uses some specialized hardware in the GPU for that.
Regards, Christian.
Good to know, thanks! So the modern GART/GTT is like a TLB, and the iommu is focused on translating addresses and does not manage the TLB.
You are getting closer in your understanding, but the TLB is the Translation Lookaside Buffer. Basically a cache of recent VM translations which is present in all page table translations (GART, IOMMU, CPU etc...).
The key difference is where the page table translation happens on modern hardware:
1. For the GART/GTT it is inside the GPU to translate between GPU internal and bus addresses.
2. For IOMMU it is inside the root complex of the PCIe to translate between bus addresses and physical addresses.
3. For CPU page tables it is inside the CPU core to translate between virtual addresses and physical addresses.
Regards, Christian.
On Fri, 18 Dec 2020 16:10:12 +0800, Christian König wrote:
Am 18.12.20 um 04:51 schrieb Chen Li:
[SNIP]
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
Good to know, thanks! BTW, have you ever seen or heard of boards like mine which cannot mmap device memory from userspace correctly?
Unfortunately yes. We haven't been able to figure out what exactly goes wrong in those cases.
Ok, one more question: does only the e8860 have this issue, or do all radeon cards?
This applies to all hardware with dedicated memory which needs to be mapped to userspace.
That includes all graphics hardware from AMD as well as NVidia and probably a whole bunch of other PCIe devices.
Can mmio on these devices work fine in kernel space? I cannot see the difference here, except that user space should use an uncacheable mmap to map virtual memory to device space (though I don't know how to use an uncacheable mmap), while the kernel uses an uncached ioremap.
The graphics address remapping table (GART),[1] also known as the graphics aperture remapping table,[2] or graphics translation table (GTT),[3] is an I/O memory management unit (IOMMU) used by Accelerated Graphics Port (AGP) and PCI Express (PCIe) graphics cards.
GART or GTT refers to the translation tables graphics hardware use to access system memory.
Something like 15 years ago we used the IOMMU functionality from AGP to implement that. But modern hardware (PCIe) uses some specialized hardware in the GPU for that.
Regards, Christian.
Good to know, thanks! So the modern GART/GTT is like a TLB, and the iommu is focused on translating addresses and does not manage the TLB.
You are getting closer in your understanding, but the TLB is the Translation Lookaside Buffer. Basically a cache of recent VM translations which is present in all page table translations (GART, IOMMU, CPU etc...).
The key difference is where the page table translation happens on modern hardware:
1. For the GART/GTT it is inside the GPU to translate between GPU internal and bus addresses.
2. For IOMMU it is inside the root complex of the PCIe to translate between bus addresses and physical addresses.
3. For CPU page tables it is inside the CPU core to translate between virtual addresses and physical addresses.
Regards, Christian.
Awesome explanation! Thanks a ton!
Am 18.12.20 um 09:52 schrieb Chen Li:
On Fri, 18 Dec 2020 16:10:12 +0800, Christian König wrote:
Am 18.12.20 um 04:51 schrieb Chen Li:
[SNIP]
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
Good to know, thanks! BTW, have you ever seen or heard of boards like mine which cannot mmap device memory from userspace correctly?
Unfortunately yes. We haven't been able to figure out what exactly goes wrong in those cases.
Ok, one more question: does only the e8860 have this issue, or do all radeon cards?
This applies to all hardware with dedicated memory which needs to be mapped to userspace.
That includes all graphics hardware from AMD as well as NVidia and probably a whole bunch of other PCIe devices.
Can mmio on these devices work fine in kernel space?
The kernel drivers know that this is MMIO and can use special instructions/functions like writel()/writeq()/memcpy_fromio()/memcpy_toio() etc...
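The firmware upload mentioned earlier in the thread follows roughly this shape (a sketch; radeon's actual code differs in detail):

    #include <linux/io.h>
    #include <linux/types.h>

    /* Copy a CPU-side firmware image into ioremap()ed VRAM: dst is the
     * __iomem mapping, src/len describe the firmware blob. */
    static void upload_fw(void __iomem *dst, const void *src, size_t len)
    {
            memcpy_toio(dst, src, len);     /* never a plain memcpy() on __iomem */
    }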
I cannot see the difference here, except that user space should use an uncacheable mmap to map virtual memory to device space (though I don't know how to use an uncacheable mmap), while the kernel uses an uncached ioremap.
The problem with mmap() of MMIO into the userspace is that this can easily crash the whole system.
When an application uses memset()/memcpy() on the mapped region and the system spontaneously reboots then that's a rather big hardware problem.
Regards, Christian.
On 2020-12-17 10:25, Christian König wrote:
Am 17.12.20 um 02:07 schrieb Chen Li:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
No, not performance. See, standards like OpenGL and Vulkan as well as VA-API and VDPAU require that you can mmap() device memory and execute memset/memcpy on the memory from userspace.
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
If the VRAM lives in a prefetchable PCI bar then on most sane Arm-based systems I believe it should be able to mmap() to userspace with the Normal memory type, where unaligned accesses and such are allowed, as opposed to the Device memory type intended for MMIO mappings, which has more restrictions but stricter ordering guarantees.
Regardless of what happens elsewhere though, if something is mapped *into the kernel* with ioremap(), then it is fundamentally wrong per the kernel memory model to reference that mapping directly without using I/O accessors. That is not specific to any individual architecture, and Sparse should be screaming about it already. I guess in this case the UVD code needs to pay more attention to whether radeon_bo_kmap() ends up going via ttm_bo_ioremap() or not.
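One way the UVD code could pay attention to that is TTM's kmap helper, which reports whether the returned pointer is I/O memory (a sketch against the TTM API of that era, not a tested patch):

    #include <drm/ttm/ttm_bo_api.h>
    #include <linux/io.h>

    static int clear_first_page(struct ttm_buffer_object *bo)
    {
            struct ttm_bo_kmap_obj map;
            bool is_iomem;
            void *ptr;
            int r;

            r = ttm_bo_kmap(bo, 0, 1, &map);        /* map the first page */
            if (r)
                    return r;

            ptr = ttm_kmap_obj_virtual(&map, &is_iomem);
            if (is_iomem)
                    memset_io((void __iomem *)ptr, 0, PAGE_SIZE);
            else
                    memset(ptr, 0, PAGE_SIZE);

            ttm_bo_kunmap(&map);
            return 0;
    }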
(I'm assuming the initial fault was memset() with 0 trying to perform "DC ZVA" on a Device-type mapping from ioremap() - FYI a stacktrace on its own without the rest of the error dump showing what actually triggered it isn't overly useful)
Robin.
Am 17.12.20 um 14:45 schrieb Robin Murphy:
On 2020-12-17 10:25, Christian König wrote:
Am 17.12.20 um 02:07 schrieb Chen Li:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
No, not performance. See, standards like OpenGL and Vulkan as well as VA-API and VDPAU require that you can mmap() device memory and execute memset/memcpy on the memory from userspace.
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
If the VRAM lives in a prefetchable PCI bar then on most sane Arm-based systems I believe it should be able to mmap() to userspace with the Normal memory type, where unaligned accesses and such are allowed, as opposed to the Device memory type intended for MMIO mappings, which has more restrictions but stricter ordering guarantees.
Do you have some background why some ARM boards fail with that?
We had a couple of reports that memset/memcpy fail in userspace (usually system just spontaneously reboots or becomes unresponsive), but so far nobody could tell us why that happens?
Regardless of what happens elsewhere though, if something is mapped *into the kernel* with ioremap(), then it is fundamentally wrong per the kernel memory model to reference that mapping directly without using I/O accessors. That is not specific to any individual architecture, and Sparse should be screaming about it already. I guess in this case the UVD code needs to pay more attention to whether radeon_bo_kmap() ends up going via ttm_bo_ioremap() or not.
Yes, exactly. That's why we already have memcpy_fromio()/memcpy_toio() to upload the firmware and save the state on suspend/resume.
It's just that in this case here we also have IO memory, because some 15+ year old AGP-based hardware doesn't work when you put it in system memory :)
So pointing that out is correct and I'm going to clean that up now.
Regards, Christian.
Am Donnerstag, den 17.12.2020, 15:02 +0100 schrieb Christian König:
Am 17.12.20 um 14:45 schrieb Robin Murphy:
On 2020-12-17 10:25, Christian König wrote:
Am 17.12.20 um 02:07 schrieb Chen Li:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
No, not performance. See, standards like OpenGL and Vulkan as well as VA-API and VDPAU require that you can mmap() device memory and execute memset/memcpy on the memory from userspace.
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
If the VRAM lives in a prefetchable PCI bar then on most sane Arm-based systems I believe it should be able to mmap() to userspace with the Normal memory type, where unaligned accesses and such are allowed, as opposed to the Device memory type intended for MMIO mappings, which has more restrictions but stricter ordering guarantees.
Do you have some background why some ARM boards fail with that?
We had a couple of reports that memset/memcpy fail in userspace (usually system just spontaneously reboots or becomes unresponsive), but so far nobody could tell us why that happens?
Optimized memset/memcpy uses unaligned access in some cases, where handling unaligned start/end addresses would cause more instructions to be used otherwise.
If the device memory isn't mapped at least writecombined (bufferable in ARM speak) into userspace, those unaligned accesses are not allowed and will cause traps on the hardware level. Normally this should just lead to the process making the access getting killed with a SIGBUS, but maybe some systems handle those traps wrong on a firmware level? If the kernel makes such an unaligned access then the kernel will fault, which normally means halting the kernel.
Regards, Lucas
On 2020-12-17 14:02, Christian König wrote:
Am 17.12.20 um 14:45 schrieb Robin Murphy:
On 2020-12-17 10:25, Christian König wrote:
Am 17.12.20 um 02:07 schrieb Chen Li:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
Am 16.12.20 um 14:48 schrieb Chen Li:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/torvalds/linux/commit/ba0b2275a6781b2f3919d931d63329b5548...) needed? (I also notice drm/radeon didn't take this change though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what's the more general problem here :( You mean performance?
No, not performance. See, standards like OpenGL and Vulkan as well as VA-API and VDPAU require that you can mmap() device memory and execute memset/memcpy on the memory from userspace.
If your ARM base board can't do that for some reason then you can't use the hardware with that board.
If the VRAM lives in a prefetchable PCI bar then on most sane Arm-based systems I believe it should be able to mmap() to userspace with the Normal memory type, where unaligned accesses and such are allowed, as opposed to the Device memory type intended for MMIO mappings, which has more restrictions but stricter ordering guarantees.
Do you have some background why some ARM boards fail with that?
We had a couple of reports that memset/memcpy fail in userspace (usually system just spontaneously reboots or becomes unresponsive), but so far nobody could tell us why that happens?
Part of it is that Arm doesn't really have an ideal memory type for mapping RAM behind PCI (much like we also struggle with the vague expectations of what write-combine might mean beyond x86). Device memory can be relaxed to allow gathering, reordering and write-buffering, but is still a bit too restrictive in other ways - aligned, non-speculative, etc. - for something that's really just RAM and expected to be usable as such. Thus to map PCI memory as "write-combine" we use Normal non-cacheable, which means the CPU MMU is going to allow software to do all the things it might expect of RAM, but we're now at the mercy of the menagerie of interconnects and PCI implementations out there.
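(For reference, a minimal kernel sketch of the mapping choice Robin describes; the helper name map_vram_wc() and the BAR handling are illustrative assumptions, not driver code. On arm64, ioremap_wc() yields Normal non-cacheable, while plain ioremap() yields Device memory.)

#include <linux/io.h>
#include <linux/pci.h>

/* Map a prefetchable BAR write-combined: Normal non-cacheable on
 * arm64, so the CPU side tolerates unaligned and speculative
 * accesses the way software expects of "RAM". */
static void __iomem *map_vram_wc(struct pci_dev *pdev, int bar)
{
	resource_size_t start = pci_resource_start(pdev, bar);
	resource_size_t len = pci_resource_len(pdev, bar);

	return ioremap_wc(start, len);
}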
Atomic operations, for example, *might* be resolved by the CPU coherency mechanism or in the interconnect, such that the PCI host bridge only sees regular loads and stores, but more often than not they'll just result in an atomic transaction going all the way to the host bridge. A super-duper-clever host bridge implementation might even support that, but the vast majority are likely to just reject it as invalid.
Similarly, unaligned accesses, cache line fills/evictions, and such will often work, since they're essentially just larger read/write bursts, but some host bridges can be picky and might reject access sizes they don't like (there's at least one where even 64-bit accesses don't work. On a 64-bit system...)
If an invalid transaction does reach the host bridge, it's going to come back to the CPU as an external abort. If we're really lucky that could be taken synchronously, attributable to a specific instruction, and just oops/SIGBUS the relevant kernel/userspace thread. Often though, (particularly with big out-of-order CPUs) it's likely to be asynchronous and no longer attributable, and thus taken as an SError event, which in general roughly translates to "part of the SoC has fallen off". The only reasonable response we have to that is to panic the system.
Robin.
On 18.12.20 at 15:17, Robin Murphy wrote:
On 2020-12-17 14:02, Christian König wrote:
[SNIP] Do you have some background on why some ARM boards fail with that?
We had a couple of reports that memset/memcpy fail in userspace (usually the system just spontaneously reboots or becomes unresponsive), but so far nobody could tell us why that happens.
Part of it is that Arm doesn't really have an ideal memory type for mapping RAM behind PCI (much like we also struggle with the vague expectations of what write-combine might mean beyond x86). Device memory can be relaxed to allow gathering, reordering and write-buffering, but is still a bit too restrictive in other ways - aligned, non-speculative, etc. - for something that's really just RAM and expected to be usable as such. Thus to map PCI memory as "write-combine" we use Normal non-cacheable, which means the CPU MMU is going to allow software to do all the things it might expect of RAM, but we're now at the mercy of the menagerie of interconnects and PCI implementations out there.
I see. As far as I know we already correctly map the RAM from the GPU as "write-combine".
Atomic operations, for example, *might* be resolved by the CPU coherency mechanism or in the interconnect, such that the PCI host bridge only sees regular loads and stores, but more often than not they'll just result in an atomic transaction going all the way to the host bridge. A super-duper-clever host bridge implementation might even support that, but the vast majority are likely to just reject it as invalid.
Support for atomics is actually specified by a PCIe extension. As far as I know that extension is even necessary for full KFD support on AMD and full CUDA support on NVIDIA GPUs.
Similarly, unaligned accesses, cache line fills/evictions, and such will often work, since they're essentially just larger read/write bursts, but some host bridges can be picky and might reject access sizes they don't like (there's at least one where even 64-bit accesses don't work. On a 64-bit system...)
This is breaking our neck here. We need 64-bit writes on 64-bit systems to end up as one 64-bit write at the hardware, not two 32-bit writes, otherwise the doorbells won't work correctly.
Larger writes are pretty much unproblematic; for P2P our bus interface even supports really large multi-byte transfers.
If an invalid transaction does reach the host bridge, it's going to come back to the CPU as an external abort. If we're really lucky that could be taken synchronously, attributable to a specific instruction, and just oops/SIGBUS the relevant kernel/userspace thread. Often though, (particularly with big out-of-order CPUs) it's likely to be asynchronous and no longer attributable, and thus taken as an SError event, which in general roughly translates to "part of the SoC has fallen off". The only reasonable response we have to that is to panic the system.
Yeah, that sounds exactly like what we see on some of the ARM boards out there. At least we have an explanation for that behavior now.
Going to talk about this with our hardware engineers. We might be able to work around some of that stuff, but that is rather tricky to get working under those conditions.
Thanks, Christian.
Robin.
On 2020-12-18 14:33, Christian König wrote:
On 18.12.20 at 15:17, Robin Murphy wrote:
On 2020-12-17 14:02, Christian König wrote:
[SNIP] Do you have some background on why some ARM boards fail with that?
We had a couple of reports that memset/memcpy fail in userspace (usually the system just spontaneously reboots or becomes unresponsive), but so far nobody could tell us why that happens.
Part of it is that Arm doesn't really have an ideal memory type for mapping RAM behind PCI (much like we also struggle with the vague expectations of what write-combine might mean beyond x86). Device memory can be relaxed to allow gathering, reordering and write-buffering, but is still a bit too restrictive in other ways - aligned, non-speculative, etc. - for something that's really just RAM and expected to be usable as such. Thus to map PCI memory as "write-combine" we use Normal non-cacheable, which means the CPU MMU is going to allow software to do all the things it might expect of RAM, but we're now at the mercy of the menagerie of interconnects and PCI implementations out there.
I see. As far as I know we already correctly map the RAM from the GPU as "write-combine".
Atomic operations, for example, *might* be resolved by the CPU coherency mechanism or in the interconnect, such that the PCI host bridge only sees regular loads and stores, but more often than not they'll just result in an atomic transaction going all the way to the host bridge. A super-duper-clever host bridge implementation might even support that, but the vast majority are likely to just reject it as invalid.
Support for atomics is actually specified by a PCIe extension. As far as I know that extension is even necessary for full KFD support on AMD and full CUDA support on NVIDIA GPUs.
Similarly, unaligned accesses, cache line fills/evictions, and such will often work, since they're essentially just larger read/write bursts, but some host bridges can be picky and might reject access sizes they don't like (there's at least one where even 64-bit accesses don't work. On a 64-bit system...)
This is breaking our neck here. We need 64-bit writes on 64-bit systems to end up as one 64-bit write at the hardware, not two 32-bit writes, otherwise the doorbells won't work correctly.
Just to clarify, that particular case *is* considered catastrophically broken ;)
In general you can assume that on AArch64, any aligned 64-bit load or store is atomic (64-bit accesses on 32-bit Arm are less well-defined, but hopefully nobody cares by now).
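(A minimal sketch of what that guarantee buys in practice; ring_doorbell() and the doorbell layout are hypothetical, not taken from the driver. On 64-bit kernels writeq() is a single aligned 64-bit MMIO store, so it cannot be split into two 32-bit writes the way a compiler-lowered plain store might be.)

#include <linux/io.h>

/* Ring a 64-bit doorbell with one aligned 64-bit MMIO store. */
static void ring_doorbell(void __iomem *db_base, u32 index, u64 val)
{
	writeq(val, db_base + (size_t)index * sizeof(u64));
}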
Larger writes are pretty much unproblematic; for P2P our bus interface even supports really large multi-byte transfers.
If an invalid transaction does reach the host bridge, it's going to come back to the CPU as an external abort. If we're really lucky that could be taken synchronously, attributable to a specific instruction, and just oops/SIGBUS the relevant kernel/userspace thread. Often though, (particularly with big out-of-order CPUs) it's likely to be asynchronous and no longer attributable, and thus taken as an SError event, which in general roughly translates to "part of the SoC has fallen off". The only reasonable response we have to that is to panic the system.
Yeah, that sounds exactly like what we see on some of the ARM boards out there. At least we have an explanation for that behavior now.
Going to talk about this with our hardware engineers. We might be able to work around some of that stuff, but that is rather tricky to get working under those conditions.
Yeah, unfortunately there's no easy way to judge the quality of any given SoC's PCI implementation until you throw your required traffic at it and things either break or don't...
Cheers, Robin.
On Thu, 17 Dec 2020 21:45:06 +0800, Robin Murphy wrote:
On 2020-12-17 10:25, Christian König wrote:
On 17.12.20 at 02:07, Chen Li wrote:
On Wed, 16 Dec 2020 22:19:11 +0800, Christian König wrote:
On 16.12.20 at 14:48, Chen Li wrote:
On Wed, 16 Dec 2020 15:59:37 +0800, Christian König wrote:
[SNIP]
Hi, Christian. I'm not sure why this change is a hack here. I cannot see the problem and will be grateful if you give more explanations.
__memset is supposed to work on those addresses, otherwise you can't use the e8860 on your arm64 system.
If __memset is supposed to work on those addresses, why is this commit (https://github.com/...) needed? (I also notice drm/radeon didn't take this change, though.) Just out of curiosity.
We generally accept those patches as cleanup in the kernel with the hope that we can find a way to work around the userspace restrictions.
But when you also have this issue in userspace then there isn't much we can do for you.
Replacing the direct write in the kernel with calls to writel() or memset_io() will fix that temporarily, but you have a more general problem here.
I cannot see what the more general problem is here :( Do you mean performance?
No, not performance. Standards like OpenGL and Vulkan, as well as VA-API and VDPAU, require that you can mmap() device memory and run memset/memcpy on that memory from userspace.
If your ARM-based board can't do that for some reason, then you can't use the hardware with that board.
If the VRAM lives in a prefetchable PCI BAR, then on most sane Arm-based systems I believe it should be possible to mmap() it into userspace with the Normal memory type, where unaligned accesses and the like are allowed, as opposed to the Device memory type intended for MMIO mappings, which has more restrictions but stricter ordering guarantees.
Hi, Robin. I cannot understand why this allows unaligned accesses. A prefetchable PCI BAR should also be MMIO, and accesses to it will end up as Device memory accesses, so why does this allow unaligned access?
Regardless of what happens elsewhere though, if something is mapped *into the kernel* with ioremap(), then it is fundamentally wrong per the kernel memory model to reference that mapping directly without using I/O accessors. That is not specific to any individual architecture, and Sparse should be screaming about it already. I guess in this case the UVD code needs to pay more attention to whether radeon_bo_kmap() ends up going via ttm_bo_ioremap() or not.
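(To make the accessor point concrete, a minimal sketch under the assumption that msg comes from an ioremap()ed mapping; the helper name uvd_msg_clear() and the word layout are illustrative. Sparse flags direct dereferences of __iomem pointers, while writel()/memset_io() are the sanctioned accessors.)

#include <linux/io.h>

/* Write the first header word with writel() (the remaining header
 * words would follow the same pattern) and zero the tail with
 * memset_io(), instead of plain stores that the compiler is free
 * to lower into an out-of-line __memset. */
static void uvd_msg_clear(void __iomem *msg, unsigned int first_word,
			  unsigned int total_words)
{
	writel(0x00000de4, msg);
	memset_io(msg + first_word * sizeof(u32), 0x0,
		  (total_words - first_word) * sizeof(u32));
}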
(I'm assuming the initial fault was memset() with 0 trying to perform "DC ZVA" on a Device-type mapping from ioremap() - FYI a stacktrace on its own without the rest of the error dump showing what actually triggered it isn't overly useful)
Robin.
Why might it be 'DC ZVA'? I'm not sure about the PC of the initial kernel memset fault, but I did capture the userspace crash PC: an stp (128-bit), or an str with NEON (also 128-bit), to the render node (/dev/dri/renderD128).
For amdgpu I suggest that we allocate the UVD message in GTT instead of VRAM since we don't have the hardware restriction for that on the new generations.
Thanks, I will try to dig in deeper. But what does the "hardware restriction" mean here? I'm not familiar with the video driver stack and AMD GPUs, sorry.
On older hardware (AGP days) the buffer had to be in VRAM (MMIO) memory, but on modern systems GTT (system memory) works as well.
IIUC, the e8860 can use amdgpu (I use radeon now) because its device id 6822 is in amdgpu's table. But I cannot tell whether the e8860 has an IOMMU, and I cannot find an IOMMU in lspci, so the graphics translation table may not work here?
That is not related to the IOMMU. An IOMMU is a feature of the CPU/motherboard. This is implemented using GTT, i.e. the VM page tables inside the GPU.
And yes, it should work. I will prepare a patch for it.
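(A rough sketch of what such a patch might look like, using amdgpu's usual BO helper; the exact call site and flags in the real patch may differ, and adev is assumed to be the struct amdgpu_device pointer from the surrounding function.)

/* Allocate the UVD message BO in GTT (system memory) instead of
 * VRAM; GTT pages are ordinary RAM, so plain stores and memset()
 * are safe and no I/O accessors are needed. */
struct amdgpu_bo *bo;
u64 gpu_addr;
uint32_t *msg;
int r;

r = amdgpu_bo_create_reserved(adev, PAGE_SIZE, PAGE_SIZE,
			      AMDGPU_GEM_DOMAIN_GTT,
			      &bo, &gpu_addr, (void **)&msg);
if (r)
	return r;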
BTW: How does userspace work on arm64 then? The driver stack usually only works if MMIO can be mapped directly.
I also posted two userspace issues on Mesa which you may be interested in: https://gitlab.freedesktop.org/... https://gitlab.freedesktop.org/... I pasted some userspace virtual memory maps there. (The two problems have bothered me for quite a long time.)
I don't really see a solution for those problems.
See, it is perfectly valid for an application to memset/memcpy on mmap()ed MMIO space which comes from OpenGL or Vulkan.
So your CPU simply won't work with the hardware. We could work around that with a couple of hacks, but this is pretty much a general problem.
Regards, Christian.
Thanks! Can you provide some details about these hacks? Should I post another issue on the mailing list?
Adjust the kernel and/or user space to never map VRAM to the CPU.
This violates the OpenGL/Vulkan specification in some ways, so I'm not sure whether that will work or not.
Regards, Christian.
On 2020-12-18 06:14, Chen Li wrote: [...]
No, not performance. Standards like OpenGL and Vulkan, as well as VA-API and VDPAU, require that you can mmap() device memory and run memset/memcpy on that memory from userspace.
If your ARM-based board can't do that for some reason, then you can't use the hardware with that board.
If the VRAM lives in a prefetchable PCI BAR, then on most sane Arm-based systems I believe it should be possible to mmap() it into userspace with the Normal memory type, where unaligned accesses and the like are allowed, as opposed to the Device memory type intended for MMIO mappings, which has more restrictions but stricter ordering guarantees.
Hi, Robin. I cannot understand why this allows unaligned accesses. A prefetchable PCI BAR should also be MMIO, and accesses to it will end up as Device memory accesses, so why does this allow unaligned access?
Because even Device-GRE is a bit too restrictive to expose to userspace that's likely to expect it to behave as regular memory, so, for better or worse, we use MT_NORMAL_NC for pgprot_writecombine().
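(For reference, and quoting from memory rather than a checked tree, the arm64 definition around the v5.10 timeframe looked like this; note the Normal non-cacheable attribute index plus the execute-never bits:)

/* arch/arm64/include/asm/pgtable.h (approximate, from memory) */
#define pgprot_writecombine(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, \
			PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)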
Regardless of what happens elsewhere though, if something is mapped *into the kernel* with ioremap(), then it is fundamentally wrong per the kernel memory model to reference that mapping directly without using I/O accessors. That is not specific to any individual architecture, and Sparse should be screaming about it already. I guess in this case the UVD code needs to pay more attention to whether radeon_bo_kmap() ends up going via ttm_bo_ioremap() or not.
(I'm assuming the initial fault was memset() with 0 trying to perform "DC ZVA" on a Device-type mapping from ioremap() - FYI a stacktrace on its own without the rest of the error dump showing what actually triggered it isn't overly useful)
Robin.
Why might it be 'DC ZVA'? I'm not sure about the PC of the initial kernel memset fault, but I did capture the userspace crash PC: an stp (128-bit), or an str with NEON (also 128-bit), to the render node (/dev/dri/renderD128).
As I said it was an assumption. I guessed at it being more likely to be an MMU fault than an external abort, and given the size and the fact that it's a variable initialisation guessed at it being slightly more likely to hit the ZVA special-case rather than being unaligned. Looking again, I guess starting at an odd-numbered 32-bit element might lead to an unaligned store of XZR, but either way it doesn't really matter - what it showed is it clearly *could* be an MMU fault because TTM seems to be a bit careless with iomem pointers.
That said, if you're also getting external aborts from your host bridge not liking 128-bit transactions, then as Christian says you're probably going to have a bad time on this platform either way.
Robin.
Hi Chen,
url: https://github.com/0day-ci/linux/commits/Chen-Li/drm-amdgpu-radeon-fix-memse...
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git d01e7f10dae29eba0f9ada82b65d24e035d5b2f9
config: x86_64-randconfig-m001-20201216 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
If you fix the issue, kindly add following tag as appropriate:
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>

New smatch warnings:
drivers/gpu/drm/radeon/radeon_uvd.c:805 radeon_uvd_get_create_msg() error: uninitialized symbol 'i'.
drivers/gpu/drm/radeon/radeon_uvd.c:833 radeon_uvd_get_destroy_msg() error: uninitialized symbol 'i'.

Old smatch warnings:
drivers/gpu/drm/radeon/radeon_uvd.c:568 radeon_uvd_cs_msg() warn: ignoring unreachable code.
vim +/i +805 drivers/gpu/drm/radeon/radeon_uvd.c
f2ba57b5eab8817 Christian König 2013-04-08  777  int radeon_uvd_get_create_msg(struct radeon_device *rdev, int ring,
f2ba57b5eab8817 Christian König 2013-04-08  778  			      uint32_t handle, struct radeon_fence **fence)
f2ba57b5eab8817 Christian König 2013-04-08  779  {
feba9b0bcf492ba Christian König 2014-08-22  780  	/* we use the last page of the vcpu bo for the UVD message */
feba9b0bcf492ba Christian König 2014-08-22  781  	uint64_t offs = radeon_bo_size(rdev->uvd.vcpu_bo) -
feba9b0bcf492ba Christian König 2014-08-22  782  		RADEON_GPU_PAGE_SIZE;
f2ba57b5eab8817 Christian König 2013-04-08  783  
feba9b0bcf492ba Christian König 2014-08-22  784  	uint32_t *msg = rdev->uvd.cpu_addr + offs;
feba9b0bcf492ba Christian König 2014-08-22  785  	uint64_t addr = rdev->uvd.gpu_addr + offs;
f2ba57b5eab8817 Christian König 2013-04-08  786  
feba9b0bcf492ba Christian König 2014-08-22  787  	int r, i;
f2ba57b5eab8817 Christian König 2013-04-08  788  
feba9b0bcf492ba Christian König 2014-08-22  789  	r = radeon_bo_reserve(rdev->uvd.vcpu_bo, true);
feba9b0bcf492ba Christian König 2014-08-22  790  	if (r)
f2ba57b5eab8817 Christian König 2013-04-08  791  		return r;
f2ba57b5eab8817 Christian König 2013-04-08  792  
f2ba57b5eab8817 Christian König 2013-04-08  793  	/* stitch together an UVD create msg */
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  794  	msg[0] = cpu_to_le32(0x00000de4);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  795  	msg[1] = cpu_to_le32(0x00000000);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  796  	msg[2] = cpu_to_le32(handle);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  797  	msg[3] = cpu_to_le32(0x00000000);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  798  	msg[4] = cpu_to_le32(0x00000000);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  799  	msg[5] = cpu_to_le32(0x00000000);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  800  	msg[6] = cpu_to_le32(0x00000000);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  801  	msg[7] = cpu_to_le32(0x00000780);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  802  	msg[8] = cpu_to_le32(0x00000440);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  803  	msg[9] = cpu_to_le32(0x00000000);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  804  	msg[10] = cpu_to_le32(0x01b37000);
201257d71c519be Chen Li         2020-12-16 @805  	memset_io(&msg[i], 0x0, 1013 * sizeof(uint32_t));
                                                               ^^^^^^^
The "i" variable is never initialized anywhere.
f2ba57b5eab8817 Christian König 2013-04-08  806  
feba9b0bcf492ba Christian König 2014-08-22  807  	r = radeon_uvd_send_msg(rdev, ring, addr, fence);
feba9b0bcf492ba Christian König 2014-08-22  808  	radeon_bo_unreserve(rdev->uvd.vcpu_bo);
feba9b0bcf492ba Christian König 2014-08-22  809  	return r;
f2ba57b5eab8817 Christian König 2013-04-08  810  }
f2ba57b5eab8817 Christian König 2013-04-08  811  
f2ba57b5eab8817 Christian König 2013-04-08  812  int radeon_uvd_get_destroy_msg(struct radeon_device *rdev, int ring,
f2ba57b5eab8817 Christian König 2013-04-08  813  			       uint32_t handle, struct radeon_fence **fence)
f2ba57b5eab8817 Christian König 2013-04-08  814  {
feba9b0bcf492ba Christian König 2014-08-22  815  	/* we use the last page of the vcpu bo for the UVD message */
feba9b0bcf492ba Christian König 2014-08-22  816  	uint64_t offs = radeon_bo_size(rdev->uvd.vcpu_bo) -
feba9b0bcf492ba Christian König 2014-08-22  817  		RADEON_GPU_PAGE_SIZE;
f2ba57b5eab8817 Christian König 2013-04-08  818  
feba9b0bcf492ba Christian König 2014-08-22  819  	uint32_t *msg = rdev->uvd.cpu_addr + offs;
feba9b0bcf492ba Christian König 2014-08-22  820  	uint64_t addr = rdev->uvd.gpu_addr + offs;
f2ba57b5eab8817 Christian König 2013-04-08  821  
feba9b0bcf492ba Christian König 2014-08-22  822  	int r, i;
f2ba57b5eab8817 Christian König 2013-04-08  823  
feba9b0bcf492ba Christian König 2014-08-22  824  	r = radeon_bo_reserve(rdev->uvd.vcpu_bo, true);
feba9b0bcf492ba Christian König 2014-08-22  825  	if (r)
f2ba57b5eab8817 Christian König 2013-04-08  826  		return r;
f2ba57b5eab8817 Christian König 2013-04-08  827  
f2ba57b5eab8817 Christian König 2013-04-08  828  	/* stitch together an UVD destroy msg */
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  829  	msg[0] = cpu_to_le32(0x00000de4);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  830  	msg[1] = cpu_to_le32(0x00000002);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  831  	msg[2] = cpu_to_le32(handle);
9b1be4dc02bb6b9 Alex Deucher    2013-06-07  832  	msg[3] = cpu_to_le32(0x00000000);
201257d71c519be Chen Li         2020-12-16 @833  	memset_io(&msg[i], 0x0, 1020 * sizeof(uint32_t));
                                                               ^^^^^^^
Same.
f2ba57b5eab8817 Christian König 2013-04-08  834  
feba9b0bcf492ba Christian König 2014-08-22  835  	r = radeon_uvd_send_msg(rdev, ring, addr, fence);
feba9b0bcf492ba Christian König 2014-08-22  836  	radeon_bo_unreserve(rdev->uvd.vcpu_bo);
feba9b0bcf492ba Christian König 2014-08-22  837  	return r;
f2ba57b5eab8817 Christian König 2013-04-08  838  }
55b51c88c5167ba Christian König 2013-04-18  839  
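(One way to resolve what the robot flags, sketched here rather than taken from a posted v2: drop the dead loop index and start the zeroing at the first word not written explicitly. The element counts already match, since 1024 - 11 = 1013 and 1024 - 4 = 1020.)

/* create msg: words 0..10 are written above, zero the remainder */
memset_io(&msg[11], 0x0, (1024 - 11) * sizeof(uint32_t));

/* destroy msg: words 0..3 are written above, zero the remainder */
memset_io(&msg[4], 0x0, (1024 - 4) * sizeof(uint32_t));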
Hi Chen,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on linus/master]
[also build test WARNING on v5.10 next-20201215]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Chen-Li/drm-amdgpu-radeon-fix-memse...
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git d01e7f10dae29eba0f9ada82b65d24e035d5b2f9
config: x86_64-randconfig-a002-20201216 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 71601d2ac9954cb59c443cb3ae442cb106df35d4)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://github.com/0day-ci/linux/commit/201257d71c519bef0966e555d95bf820512f...
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Chen-Li/drm-amdgpu-radeon-fix-memset-on-io-mem/20201216-165835
        git checkout 201257d71c519bef0966e555d95bf820512f5a34
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64
If you fix the issue, kindly add following tag as appropriate:
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
   drivers/gpu/drm/radeon/radeon_uvd.c:159:6: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat]
                    version_major, version_minor, family_id);
                    ^~~~~~~~~~~~~
   include/drm/drm_print.h:484:29: note: expanded from macro 'DRM_INFO'
           _DRM_PRINTK(, INFO, fmt, ##__VA_ARGS__)
                               ~~~  ^~~~~~~~~~~
   include/drm/drm_print.h:481:53: note: expanded from macro '_DRM_PRINTK'
           printk##once(KERN_##level "[" DRM_NAME "] " fmt, ##__VA_ARGS__)
                                                       ~~~  ^~~~~~~~~~~
   drivers/gpu/drm/radeon/radeon_uvd.c:159:21: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat]
                    version_major, version_minor, family_id);
                                   ^~~~~~~~~~~~~
   include/drm/drm_print.h:484:29: note: expanded from macro 'DRM_INFO'
           _DRM_PRINTK(, INFO, fmt, ##__VA_ARGS__)
                               ~~~  ^~~~~~~~~~~
   include/drm/drm_print.h:481:53: note: expanded from macro '_DRM_PRINTK'
           printk##once(KERN_##level "[" DRM_NAME "] " fmt, ##__VA_ARGS__)
                                                       ~~~  ^~~~~~~~~~~
   drivers/gpu/drm/radeon/radeon_uvd.c:159:36: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat]
                    version_major, version_minor, family_id);
                                                  ^~~~~~~~~
   include/drm/drm_print.h:484:29: note: expanded from macro 'DRM_INFO'
           _DRM_PRINTK(, INFO, fmt, ##__VA_ARGS__)
                               ~~~  ^~~~~~~~~~~
   include/drm/drm_print.h:481:53: note: expanded from macro '_DRM_PRINTK'
           printk##once(KERN_##level "[" DRM_NAME "] " fmt, ##__VA_ARGS__)
                                                       ~~~  ^~~~~~~~~~~
>> drivers/gpu/drm/radeon/radeon_uvd.c:805:17: warning: variable 'i' is uninitialized when used here [-Wuninitialized]
           memset_io(&msg[i], 0x0, 1013 * sizeof(uint32_t));
                          ^
   drivers/gpu/drm/radeon/radeon_uvd.c:787:10: note: initialize the variable 'i' to silence this warning
           int r, i;
                  ^
                   = 0
   drivers/gpu/drm/radeon/radeon_uvd.c:833:17: warning: variable 'i' is uninitialized when used here [-Wuninitialized]
           memset_io(&msg[i], 0x0, 1020 * sizeof(uint32_t));
                          ^
   drivers/gpu/drm/radeon/radeon_uvd.c:822:10: note: initialize the variable 'i' to silence this warning
           int r, i;
                  ^
                   = 0
   5 warnings generated.
vim +/i +805 drivers/gpu/drm/radeon/radeon_uvd.c
   771	
   772	/*
   773	 * multiple fence commands without any stream commands in between can
   774	 * crash the vcpu so just try to emmit a dummy create/destroy msg to
   775	 * avoid this
   776	 */
   777	int radeon_uvd_get_create_msg(struct radeon_device *rdev, int ring,
   778				      uint32_t handle, struct radeon_fence **fence)
   779	{
   780		/* we use the last page of the vcpu bo for the UVD message */
   781		uint64_t offs = radeon_bo_size(rdev->uvd.vcpu_bo) -
   782			RADEON_GPU_PAGE_SIZE;
   783	
   784		uint32_t *msg = rdev->uvd.cpu_addr + offs;
   785		uint64_t addr = rdev->uvd.gpu_addr + offs;
   786	
   787		int r, i;
   788	
   789		r = radeon_bo_reserve(rdev->uvd.vcpu_bo, true);
   790		if (r)
   791			return r;
   792	
   793		/* stitch together an UVD create msg */
   794		msg[0] = cpu_to_le32(0x00000de4);
   795		msg[1] = cpu_to_le32(0x00000000);
   796		msg[2] = cpu_to_le32(handle);
   797		msg[3] = cpu_to_le32(0x00000000);
   798		msg[4] = cpu_to_le32(0x00000000);
   799		msg[5] = cpu_to_le32(0x00000000);
   800		msg[6] = cpu_to_le32(0x00000000);
   801		msg[7] = cpu_to_le32(0x00000780);
   802		msg[8] = cpu_to_le32(0x00000440);
   803		msg[9] = cpu_to_le32(0x00000000);
   804		msg[10] = cpu_to_le32(0x01b37000);
   805		memset_io(&msg[i], 0x0, 1013 * sizeof(uint32_t));
   806	
   807		r = radeon_uvd_send_msg(rdev, ring, addr, fence);
   808		radeon_bo_unreserve(rdev->uvd.vcpu_bo);
   809		return r;
   810	}
   811	