We need to wait for the GPUVM flush to complete. There was some confusion as to how this mechanism was supposed to work. The operation is not atomic. For GPU initiated invalidations you need to read back a VM register to introduce enough latency for the update to complete.
v2: drop gart changes
Reviewed-by: Christian König christian.koenig@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/ni.c | 10 ++++++++++ drivers/gpu/drm/radeon/ni_dma.c | 6 ++++++ drivers/gpu/drm/radeon/nid.h | 21 +++++++++++++++++++++ 3 files changed, 37 insertions(+)
diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index 360de9f..ba7c68b 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -2516,6 +2516,16 @@ void cayman_vm_flush(struct radeon_device *rdev, struct radeon_ring *ring, radeon_ring_write(ring, PACKET0(VM_INVALIDATE_REQUEST, 0)); radeon_ring_write(ring, 1 << vm_id);
+ /* wait for the invalidate to complete */ + radeon_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5)); + radeon_ring_write(ring, (WAIT_REG_MEM_FUNCTION(3) | /* == */ + WAIT_REG_MEM_ENGINE(0))); /* me */ + radeon_ring_write(ring, VM_INVALIDATE_REQUEST >> 2); + radeon_ring_write(ring, 0); + radeon_ring_write(ring, 0); /* ref */ + radeon_ring_write(ring, 1 << vm_id); /* mask */ + radeon_ring_write(ring, 0x20); /* poll interval */ + /* sync PFP to ME, otherwise we might get invalid PFP reads */ radeon_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0)); radeon_ring_write(ring, 0x0); diff --git a/drivers/gpu/drm/radeon/ni_dma.c b/drivers/gpu/drm/radeon/ni_dma.c index 50f8861..545e5f1 100644 --- a/drivers/gpu/drm/radeon/ni_dma.c +++ b/drivers/gpu/drm/radeon/ni_dma.c @@ -463,5 +463,11 @@ void cayman_dma_vm_flush(struct radeon_device *rdev, struct radeon_ring *ring, radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_SRBM_WRITE, 0, 0, 0)); radeon_ring_write(ring, (0xf << 16) | (VM_INVALIDATE_REQUEST >> 2)); radeon_ring_write(ring, 1 << vm_id); + + /* wait for invalidate to complete */ + radeon_ring_write(ring, DMA_SRBM_POLL_PACKET); + radeon_ring_write(ring, (0xff << 20) | (VM_INVALIDATE_REQUEST >> 2)); + radeon_ring_write(ring, 1 << vm_id); /* mask */ + radeon_ring_write(ring, 0); /* value */ }
diff --git a/drivers/gpu/drm/radeon/nid.h b/drivers/gpu/drm/radeon/nid.h index 2e12e4d..e2b8bbf 100644 --- a/drivers/gpu/drm/radeon/nid.h +++ b/drivers/gpu/drm/radeon/nid.h @@ -1133,6 +1133,23 @@ #define PACKET3_MEM_SEMAPHORE 0x39 #define PACKET3_MPEG_INDEX 0x3A #define PACKET3_WAIT_REG_MEM 0x3C +#define WAIT_REG_MEM_FUNCTION(x) ((x) << 0) + /* 0 - always + * 1 - < + * 2 - <= + * 3 - == + * 4 - != + * 5 - >= + * 6 - > + */ +#define WAIT_REG_MEM_MEM_SPACE(x) ((x) << 4) + /* 0 - reg + * 1 - mem + */ +#define WAIT_REG_MEM_ENGINE(x) ((x) << 8) + /* 0 - me + * 1 - pfp + */ #define PACKET3_MEM_WRITE 0x3D #define PACKET3_PFP_SYNC_ME 0x42 #define PACKET3_SURFACE_SYNC 0x43 @@ -1272,6 +1289,10 @@ (1 << 21) | \ (((n) & 0xFFFFF) << 0))
+#define DMA_SRBM_POLL_PACKET ((9 << 28) | \ + (1 << 27) | \ + (1 << 26)) + /* async DMA Packet types */ #define DMA_PACKET_WRITE 0x2 #define DMA_PACKET_COPY 0x3
We need to wait for the GPUVM flush to complete. There was some confusion as to how this mechanism was supposed to work. The operation is not atomic. For GPU initiated invalidations you need to read back a VM register to introduce enough latency for the update to complete.
v2: drop gart changes
Reviewed-by: Christian König christian.koenig@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/si.c | 10 ++++++++++ drivers/gpu/drm/radeon/si_dma.c | 8 ++++++++ drivers/gpu/drm/radeon/sid.h | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+)
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index 60df444..f1354ed 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -5057,6 +5057,16 @@ void si_vm_flush(struct radeon_device *rdev, struct radeon_ring *ring, radeon_ring_write(ring, 0); radeon_ring_write(ring, 1 << vm_id);
+ /* wait for the invalidate to complete */ + radeon_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5)); + radeon_ring_write(ring, (WAIT_REG_MEM_FUNCTION(3) | /* == */ + WAIT_REG_MEM_ENGINE(0))); /* me */ + radeon_ring_write(ring, VM_INVALIDATE_REQUEST >> 2); + radeon_ring_write(ring, 0); + radeon_ring_write(ring, 0); /* ref */ + radeon_ring_write(ring, 1 << vm_id); /* mask */ + radeon_ring_write(ring, 0x20); /* poll interval */ + /* sync PFP to ME, otherwise we might get invalid PFP reads */ radeon_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0)); radeon_ring_write(ring, 0x0); diff --git a/drivers/gpu/drm/radeon/si_dma.c b/drivers/gpu/drm/radeon/si_dma.c index f5cc777..0a37112 100644 --- a/drivers/gpu/drm/radeon/si_dma.c +++ b/drivers/gpu/drm/radeon/si_dma.c @@ -206,6 +206,14 @@ void si_dma_vm_flush(struct radeon_device *rdev, struct radeon_ring *ring, radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_SRBM_WRITE, 0, 0, 0, 0)); radeon_ring_write(ring, (0xf << 16) | (VM_INVALIDATE_REQUEST >> 2)); radeon_ring_write(ring, 1 << vm_id); + + /* wait for invalidate to complete */ + radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_POLL_REG_MEM, 0, 0, 0, 0)); + radeon_ring_write(ring, VM_INVALIDATE_REQUEST); + radeon_ring_write(ring, 0xff << 16); /* retry */ + radeon_ring_write(ring, 1 << vm_id); /* mask */ + radeon_ring_write(ring, 0); /* value */ + radeon_ring_write(ring, 0x20); /* poll interval */ }
/** diff --git a/drivers/gpu/drm/radeon/sid.h b/drivers/gpu/drm/radeon/sid.h index 4069be89..8499924 100644 --- a/drivers/gpu/drm/radeon/sid.h +++ b/drivers/gpu/drm/radeon/sid.h @@ -1632,6 +1632,23 @@ #define PACKET3_MPEG_INDEX 0x3A #define PACKET3_COPY_DW 0x3B #define PACKET3_WAIT_REG_MEM 0x3C +#define WAIT_REG_MEM_FUNCTION(x) ((x) << 0) + /* 0 - always + * 1 - < + * 2 - <= + * 3 - == + * 4 - != + * 5 - >= + * 6 - > + */ +#define WAIT_REG_MEM_MEM_SPACE(x) ((x) << 4) + /* 0 - reg + * 1 - mem + */ +#define WAIT_REG_MEM_ENGINE(x) ((x) << 8) + /* 0 - me + * 1 - pfp + */ #define PACKET3_MEM_WRITE 0x3D #define PACKET3_COPY_DATA 0x40 #define PACKET3_CP_DMA 0x41 @@ -1835,6 +1852,7 @@ #define DMA_PACKET_TRAP 0x7 #define DMA_PACKET_SRBM_WRITE 0x9 #define DMA_PACKET_CONSTANT_FILL 0xd +#define DMA_PACKET_POLL_REG_MEM 0xe #define DMA_PACKET_NOP 0xf
#define VCE_STATUS 0x20004
We need to wait for the GPUVM flush to complete. There was some confusion as to how this mechanism was supposed to work. The operation is not atomic. For GPU initiated invalidations you need to read back a VM register to introduce enough latency for the update to complete.
v2: drop gart changes
Reviewed-by: Christian König christian.koenig@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/cik.c | 11 +++++++++++ drivers/gpu/drm/radeon/cik_sdma.c | 10 ++++++++++ 2 files changed, 21 insertions(+)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 6dcde37..63adce3 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -6033,6 +6033,17 @@ void cik_vm_flush(struct radeon_device *rdev, struct radeon_ring *ring, radeon_ring_write(ring, 0); radeon_ring_write(ring, 1 << vm_id);
+ /* wait for the invalidate to complete */ + radeon_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5)); + radeon_ring_write(ring, (WAIT_REG_MEM_OPERATION(0) | /* wait */ + WAIT_REG_MEM_FUNCTION(3) | /* == */ + WAIT_REG_MEM_ENGINE(0))); /* me */ + radeon_ring_write(ring, VM_INVALIDATE_REQUEST >> 2); + radeon_ring_write(ring, 0); + radeon_ring_write(ring, 0); /* ref */ + radeon_ring_write(ring, 1 << vm_id); /* mask */ + radeon_ring_write(ring, 0x20); /* poll interval */ + /* compute doesn't have PFP */ if (usepfp) { /* sync PFP to ME, otherwise we might get invalid PFP reads */ diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c index dde5c7e..f1a5cac 100644 --- a/drivers/gpu/drm/radeon/cik_sdma.c +++ b/drivers/gpu/drm/radeon/cik_sdma.c @@ -903,6 +903,9 @@ void cik_sdma_vm_pad_ib(struct radeon_ib *ib) void cik_dma_vm_flush(struct radeon_device *rdev, struct radeon_ring *ring, unsigned vm_id, uint64_t pd_addr) { + u32 extra_bits = (SDMA_POLL_REG_MEM_EXTRA_OP(0) | + SDMA_POLL_REG_MEM_EXTRA_FUNC(3)); /* == */ + radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000)); if (vm_id < 8) { radeon_ring_write(ring, (VM_CONTEXT0_PAGE_TABLE_BASE_ADDR + (vm_id << 2)) >> 2); @@ -943,5 +946,12 @@ void cik_dma_vm_flush(struct radeon_device *rdev, struct radeon_ring *ring, radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_SRBM_WRITE, 0, 0xf000)); radeon_ring_write(ring, VM_INVALIDATE_REQUEST >> 2); radeon_ring_write(ring, 1 << vm_id); + + radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_POLL_REG_MEM, 0, extra_bits)); + radeon_ring_write(ring, VM_INVALIDATE_REQUEST >> 2); + radeon_ring_write(ring, 0); + radeon_ring_write(ring, 0); /* reference */ + radeon_ring_write(ring, 1 << vm_id); /* mask */ + radeon_ring_write(ring, (0xfff << 16) | 10); /* retry count, poll interval */ }
dri-devel@lists.freedesktop.org