This patchset converts the fence code to use a 64-bit sequence number and simplifies it (dropping the fence lock). I am still convinced that the best solution is to have the various helper code deal with fence cleanup/processing. The last patch shows an example of what can be done to improve the SA allocator by taking advantage of the u64 fence.
Cheers, Jerome
From: Jerome Glisse <jglisse@redhat.com>
This adds the number of adjacent scratch registers you want to allocate or free as a parameter to the scratch alloc/free functions.
Signed-off-by: Jerome Glisse <jglisse@redhat.com>
---
 drivers/gpu/drm/radeon/r100.c          |   12 ++++++------
 drivers/gpu/drm/radeon/r420.c          |    4 ++--
 drivers/gpu/drm/radeon/r600.c          |   12 ++++++------
 drivers/gpu/drm/radeon/radeon.h        |    4 ++--
 drivers/gpu/drm/radeon/radeon_device.c |   31 +++++++++++++++++++++++--------
 drivers/gpu/drm/radeon/radeon_fence.c  |    6 +++---
 6 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index d47ffd5..80b57c5 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -3636,7 +3636,7 @@ int r100_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	unsigned i;
 	int r;
 
-	r = radeon_scratch_get(rdev, &scratch);
+	r = radeon_scratch_get(rdev, &scratch, 1);
 	if (r) {
 		DRM_ERROR("radeon: cp failed to get scratch reg (%d).\n", r);
 		return r;
@@ -3645,7 +3645,7 @@ int r100_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	r = radeon_ring_lock(rdev, ring, 2);
 	if (r) {
 		DRM_ERROR("radeon: cp failed to lock ring (%d).\n", r);
-		radeon_scratch_free(rdev, scratch);
+		radeon_scratch_free(rdev, scratch, 1);
 		return r;
 	}
 	radeon_ring_write(ring, PACKET0(scratch, 0));
@@ -3665,7 +3665,7 @@ int r100_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
 			  scratch, tmp);
 		r = -EINVAL;
 	}
-	radeon_scratch_free(rdev, scratch);
+	radeon_scratch_free(rdev, scratch, 1);
 	return r;
 }
 
@@ -3686,7 +3686,7 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	unsigned i;
 	int r;
 
-	r = radeon_scratch_get(rdev, &scratch);
+	r = radeon_scratch_get(rdev, &scratch, 1);
 	if (r) {
 		DRM_ERROR("radeon: failed to get scratch reg (%d).\n", r);
 		return r;
@@ -3707,7 +3707,7 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	ib->length_dw = 8;
 	r = radeon_ib_schedule(rdev, ib);
 	if (r) {
-		radeon_scratch_free(rdev, scratch);
+		radeon_scratch_free(rdev, scratch, 1);
 		radeon_ib_free(rdev, &ib);
 		return r;
 	}
@@ -3729,7 +3729,7 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 			  scratch, tmp);
 		r = -EINVAL;
 	}
-	radeon_scratch_free(rdev, scratch);
+	radeon_scratch_free(rdev, scratch, 1);
 	radeon_ib_free(rdev, &ib);
 	return r;
 }
diff --git a/drivers/gpu/drm/radeon/r420.c b/drivers/gpu/drm/radeon/r420.c
index 99137be..5ba459b 100644
--- a/drivers/gpu/drm/radeon/r420.c
+++ b/drivers/gpu/drm/radeon/r420.c
@@ -207,7 +207,7 @@ static void r420_cp_errata_init(struct radeon_device *rdev)
 	 * The proper workaround is to queue a RESYNC at the beginning
 	 * of the CP init, apparently.
 	 */
-	radeon_scratch_get(rdev, &rdev->config.r300.resync_scratch);
+	radeon_scratch_get(rdev, &rdev->config.r300.resync_scratch, 1);
 	radeon_ring_lock(rdev, ring, 8);
 	radeon_ring_write(ring, PACKET0(R300_CP_RESYNC_ADDR, 1));
 	radeon_ring_write(ring, rdev->config.r300.resync_scratch);
@@ -226,7 +226,7 @@ static void r420_cp_errata_fini(struct radeon_device *rdev)
 	radeon_ring_write(ring, PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));
 	radeon_ring_write(ring, R300_RB3D_DC_FINISH);
 	radeon_ring_unlock_commit(rdev, ring);
-	radeon_scratch_free(rdev, rdev->config.r300.resync_scratch);
+	radeon_scratch_free(rdev, rdev->config.r300.resync_scratch, 1);
 }
 
 static int r420_startup(struct radeon_device *rdev)
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index 0cbcd3a..02abf32 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2261,7 +2261,7 @@ int r600_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	unsigned i, ridx = radeon_ring_index(rdev, ring);
 	int r;
 
-	r = radeon_scratch_get(rdev, &scratch);
+	r = radeon_scratch_get(rdev, &scratch, 1);
 	if (r) {
 		DRM_ERROR("radeon: cp failed to get scratch reg (%d).\n", r);
 		return r;
@@ -2270,7 +2270,7 @@ int r600_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	r = radeon_ring_lock(rdev, ring, 3);
 	if (r) {
 		DRM_ERROR("radeon: cp failed to lock ring %d (%d).\n", ridx, r);
-		radeon_scratch_free(rdev, scratch);
+		radeon_scratch_free(rdev, scratch, 1);
 		return r;
 	}
 	radeon_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
@@ -2290,7 +2290,7 @@ int r600_ring_test(struct radeon_device *rdev, struct radeon_ring *ring)
 			  ridx, scratch, tmp);
 		r = -EINVAL;
 	}
-	radeon_scratch_free(rdev, scratch);
+	radeon_scratch_free(rdev, scratch, 1);
 	return r;
 }
 
@@ -2693,7 +2693,7 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	int r;
 	int ring_index = radeon_ring_index(rdev, ring);
 
-	r = radeon_scratch_get(rdev, &scratch);
+	r = radeon_scratch_get(rdev, &scratch, 1);
 	if (r) {
 		DRM_ERROR("radeon: failed to get scratch reg (%d).\n", r);
 		return r;
@@ -2710,7 +2710,7 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 	ib->length_dw = 3;
 	r = radeon_ib_schedule(rdev, ib);
 	if (r) {
-		radeon_scratch_free(rdev, scratch);
+		radeon_scratch_free(rdev, scratch, 1);
 		radeon_ib_free(rdev, &ib);
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		return r;
@@ -2733,7 +2733,7 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 			  scratch, tmp);
 		r = -EINVAL;
 	}
-	radeon_scratch_free(rdev, scratch);
+	radeon_scratch_free(rdev, scratch, 1);
 	radeon_ib_free(rdev, &ib);
 	return r;
 }
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index b899cec..4f21b68 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -533,8 +533,8 @@
 struct radeon_scratch {
 	uint32_t reg[32];
 };
 
-int radeon_scratch_get(struct radeon_device *rdev, uint32_t *reg);
-void radeon_scratch_free(struct radeon_device *rdev, uint32_t reg);
+int radeon_scratch_get(struct radeon_device *rdev, uint32_t *reg, unsigned n);
+void radeon_scratch_free(struct radeon_device *rdev, uint32_t reg, unsigned n);
 
 /*
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 1dac27d..3880aad 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -136,13 +136,26 @@ void radeon_scratch_init(struct radeon_device *rdev)
 	}
 }
 
-int radeon_scratch_get(struct radeon_device *rdev, uint32_t *reg)
+int radeon_scratch_get(struct radeon_device *rdev, uint32_t *reg, unsigned n)
 {
-	int i;
+	unsigned i, j, c;
 
-	for (i = 0; i < rdev->scratch.num_reg; i++) {
-		if (rdev->scratch.free[i]) {
-			rdev->scratch.free[i] = false;
+	if (n >= rdev->scratch.num_reg) {
+		dump_stack();
+		dev_err(rdev->dev, "trying to allocate %d scratch reg out of %d\n",
+			n, rdev->scratch.num_reg);
+		return -EINVAL;
+	}
+	for (i = 0; i < rdev->scratch.num_reg - n; i++) {
+		for (j = 0, c = 0; j < n; j++) {
+			if (rdev->scratch.free[i+j]) {
+				c++;
+			}
+		}
+		if (c == n) {
+			for (j = 0; j < n; j++) {
+				rdev->scratch.free[i+j] = false;
+			}
 			*reg = rdev->scratch.reg[i];
 			return 0;
 		}
@@ -150,13 +163,15 @@ int radeon_scratch_get(struct radeon_device *rdev, uint32_t *reg)
 	return -EINVAL;
 }
 
-void radeon_scratch_free(struct radeon_device *rdev, uint32_t reg)
+void radeon_scratch_free(struct radeon_device *rdev, uint32_t reg, unsigned n)
 {
-	int i;
+	unsigned i, j;
 
 	for (i = 0; i < rdev->scratch.num_reg; i++) {
 		if (rdev->scratch.reg[i] == reg) {
-			rdev->scratch.free[i] = true;
+			for (j = 0; j < n; j++) {
+				rdev->scratch.free[i+j] = true;
+			}
 			return;
 		}
 	}
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 7d0c331..7733429 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -371,12 +371,12 @@ int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring)
 	int r;
 
 	write_lock_irqsave(&rdev->fence_lock, irq_flags);
-	radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
+	radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg, 1);
 	if (rdev->wb.use_event) {
 		rdev->fence_drv[ring].scratch_reg = 0;
 		index = R600_WB_EVENT_OFFSET + ring * 4;
 	} else {
-		r = radeon_scratch_get(rdev, &rdev->fence_drv[ring].scratch_reg);
+		r = radeon_scratch_get(rdev, &rdev->fence_drv[ring].scratch_reg, 1);
 		if (r) {
 			dev_err(rdev->dev, "fence failed to get scratch register\n");
 			write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
@@ -435,7 +435,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
 		radeon_fence_wait_empty(rdev, ring);
 		wake_up_all(&rdev->fence_drv[ring].queue);
 		write_lock_irqsave(&rdev->fence_lock, irq_flags);
-		radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
+		radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg, 1);
 		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 		rdev->fence_drv[ring].initialized = false;
 	}
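To make the new parameter concrete, here is a minimal usage sketch (illustrative only, not taken from the patch): asking for two adjacent 32-bit scratch registers, which is what the 64-bit fence patch later in this series needs, and freeing both afterwards.

	uint32_t scratch;
	int r;

	/* request two adjacent 32-bit scratch registers (room for one
	 * 64-bit fence value) */
	r = radeon_scratch_get(rdev, &scratch, 2);
	if (r)
		return r;	/* no two adjacent registers were free */
	/* the low 32 bits live at 'scratch', the high 32 bits at 'scratch + 4' */
	radeon_scratch_free(rdev, scratch, 2);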
From: Jerome Glisse <jglisse@redhat.com>
This converts the fence code to use a uint64_t sequence number. The intention is to use the fact that uint64_t is big enough that we don't need to care about wraparound.

Tested with and without writeback, using 0xFFFFF000 as the initial fence sequence, thus allowing the wraparound from 32 bits to 64 bits to be tested.
Signed-off-by: Jerome Glisse <jglisse@redhat.com>
---
 drivers/gpu/drm/radeon/ni.c           |    6 ++--
 drivers/gpu/drm/radeon/r100.c         |    5 ++-
 drivers/gpu/drm/radeon/r300.c         |    5 ++-
 drivers/gpu/drm/radeon/r600.c         |   11 ++++----
 drivers/gpu/drm/radeon/radeon.h       |   20 +++++++-------
 drivers/gpu/drm/radeon/radeon_fence.c |   43 +++++++++++++++++----------------
 drivers/gpu/drm/radeon/si.c           |    6 ++--
 7 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index 3160a74..416d7c2 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -1120,9 +1120,9 @@ void cayman_fence_ring_emit(struct radeon_device *rdev,
 	radeon_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
 	radeon_ring_write(ring, EVENT_TYPE(CACHE_FLUSH_AND_INV_EVENT_TS) | EVENT_INDEX(5));
 	radeon_ring_write(ring, addr & 0xffffffff);
-	radeon_ring_write(ring, (upper_32_bits(addr) & 0xff) | DATA_SEL(1) | INT_SEL(2));
-	radeon_ring_write(ring, fence->seq);
-	radeon_ring_write(ring, 0);
+	radeon_ring_write(ring, (upper_32_bits(addr) & 0xff) | DATA_SEL(2) | INT_SEL(2));
+	radeon_ring_write(ring, lower_32_bits(fence->seq));
+	radeon_ring_write(ring, upper_32_bits(fence->seq));
 }
 
 void cayman_ring_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib)
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 80b57c5..d6bd9ea 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -864,8 +864,9 @@ void r100_fence_ring_emit(struct radeon_device *rdev,
 	radeon_ring_write(ring, PACKET0(RADEON_HOST_PATH_CNTL, 0));
 	radeon_ring_write(ring, rdev->config.r100.hdp_cntl);
 	/* Emit fence sequence & fire IRQ */
-	radeon_ring_write(ring, PACKET0(rdev->fence_drv[fence->ring].scratch_reg, 0));
-	radeon_ring_write(ring, fence->seq);
+	radeon_ring_write(ring, PACKET0(rdev->fence_drv[fence->ring].scratch_reg, 1));
+	radeon_ring_write(ring, lower_32_bits(fence->seq));
+	radeon_ring_write(ring, upper_32_bits(fence->seq));
 	radeon_ring_write(ring, PACKET0(RADEON_GEN_INT_STATUS, 0));
 	radeon_ring_write(ring, RADEON_SW_INT_FIRE);
 }
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 6419a59..7c9e30d 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -200,8 +200,9 @@ void r300_fence_ring_emit(struct radeon_device *rdev,
 	radeon_ring_write(ring, PACKET0(RADEON_HOST_PATH_CNTL, 0));
 	radeon_ring_write(ring, rdev->config.r300.hdp_cntl);
 	/* Emit fence sequence & fire IRQ */
-	radeon_ring_write(ring, PACKET0(rdev->fence_drv[fence->ring].scratch_reg, 0));
-	radeon_ring_write(ring, fence->seq);
+	radeon_ring_write(ring, PACKET0(rdev->fence_drv[fence->ring].scratch_reg, 1));
+	radeon_ring_write(ring, lower_32_bits(fence->seq));
+	radeon_ring_write(ring, upper_32_bits(fence->seq));
 	radeon_ring_write(ring, PACKET0(RADEON_GEN_INT_STATUS, 0));
 	radeon_ring_write(ring, RADEON_SW_INT_FIRE);
 }
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index 02abf32..4fbc590 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -2313,9 +2313,9 @@ void r600_fence_ring_emit(struct radeon_device *rdev,
 		radeon_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
 		radeon_ring_write(ring, EVENT_TYPE(CACHE_FLUSH_AND_INV_EVENT_TS) | EVENT_INDEX(5));
 		radeon_ring_write(ring, addr & 0xffffffff);
-		radeon_ring_write(ring, (upper_32_bits(addr) & 0xff) | DATA_SEL(1) | INT_SEL(2));
-		radeon_ring_write(ring, fence->seq);
-		radeon_ring_write(ring, 0);
+		radeon_ring_write(ring, (upper_32_bits(addr) & 0xff) | DATA_SEL(2) | INT_SEL(2));
+		radeon_ring_write(ring, lower_32_bits(fence->seq));
+		radeon_ring_write(ring, upper_32_bits(fence->seq));
 	} else {
 		/* flush read cache over gart */
 		radeon_ring_write(ring, PACKET3(PACKET3_SURFACE_SYNC, 3));
@@ -2332,9 +2332,10 @@ void r600_fence_ring_emit(struct radeon_device *rdev,
 		radeon_ring_write(ring, (WAIT_UNTIL - PACKET3_SET_CONFIG_REG_OFFSET) >> 2);
 		radeon_ring_write(ring, WAIT_3D_IDLE_bit | WAIT_3D_IDLECLEAN_bit);
 		/* Emit fence sequence & fire IRQ */
-		radeon_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		radeon_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 2));
 		radeon_ring_write(ring, ((rdev->fence_drv[fence->ring].scratch_reg - PACKET3_SET_CONFIG_REG_OFFSET) >> 2));
-		radeon_ring_write(ring, fence->seq);
+		radeon_ring_write(ring, lower_32_bits(fence->seq));
+		radeon_ring_write(ring, upper_32_bits(fence->seq));
 		/* CP_INTERRUPT packet 3 no longer exists, use packet 0 */
 		radeon_ring_write(ring, PACKET0(CP_INT_STATUS, 0));
 		radeon_ring_write(ring, RB_INT_STAT);
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 4f21b68..4c5a667 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -281,9 +281,9 @@ void radeon_semaphore_free(struct radeon_device *rdev,
 struct radeon_fence_driver {
 	uint32_t scratch_reg;
 	uint64_t gpu_addr;
-	volatile uint32_t *cpu_addr;
-	atomic_t seq;
-	uint32_t last_seq;
+	volatile uint64_t *cpu_addr;
+	atomic64_t seq;
+	uint64_t last_seq;
 	unsigned long last_activity;
 	wait_queue_head_t queue;
 	struct list_head emitted;
@@ -296,7 +296,7 @@ struct radeon_fence {
 	struct kref kref;
 	struct list_head list;
 	/* protected by radeon_fence.lock */
-	uint32_t seq;
+	uint64_t seq;
 	bool emitted;
 	bool signaled;
 	/* RB, DMA, etc. */
@@ -884,12 +884,12 @@ struct radeon_wb {
 	bool use_event;
 };
 
-#define RADEON_WB_SCRATCH_OFFSET 0
-#define RADEON_WB_CP_RPTR_OFFSET 1024
-#define RADEON_WB_CP1_RPTR_OFFSET 1280
-#define RADEON_WB_CP2_RPTR_OFFSET 1536
-#define R600_WB_IH_WPTR_OFFSET 2048
-#define R600_WB_EVENT_OFFSET 3072
+#define RADEON_WB_SCRATCH_OFFSET     0
+#define RADEON_WB_CP_RPTR_OFFSET     1024
+#define RADEON_WB_CP1_RPTR_OFFSET    1280
+#define RADEON_WB_CP2_RPTR_OFFSET    1536
+#define R600_WB_IH_WPTR_OFFSET       2048
+#define R600_WB_EVENT_OFFSET         3072
 
 /**
  * struct radeon_pm - power management datas
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 7733429..6da1535 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -40,23 +40,25 @@
 #include "radeon.h"
 #include "radeon_trace.h"
 
-static void radeon_fence_write(struct radeon_device *rdev, u32 seq, int ring)
+static void radeon_fence_write(struct radeon_device *rdev, u64 seq, int ring)
 {
 	if (rdev->wb.enabled) {
-		*rdev->fence_drv[ring].cpu_addr = cpu_to_le32(seq);
+		*rdev->fence_drv[ring].cpu_addr = cpu_to_le64(seq);
 	} else {
-		WREG32(rdev->fence_drv[ring].scratch_reg, seq);
+		WREG32(rdev->fence_drv[ring].scratch_reg, lower_32_bits(seq));
+		WREG32(rdev->fence_drv[ring].scratch_reg + 4, upper_32_bits(seq));
 	}
 }
 
-static u32 radeon_fence_read(struct radeon_device *rdev, int ring)
+static u64 radeon_fence_read(struct radeon_device *rdev, int ring)
 {
-	u32 seq = 0;
+	u64 seq = 0;
 
 	if (rdev->wb.enabled) {
-		seq = le32_to_cpu(*rdev->fence_drv[ring].cpu_addr);
+		seq = le64_to_cpu(*rdev->fence_drv[ring].cpu_addr);
 	} else {
 		seq = RREG32(rdev->fence_drv[ring].scratch_reg);
+		seq |= ((u64)RREG32(rdev->fence_drv[ring].scratch_reg + 4)) << 32ULL;
 	}
 	return seq;
 }
@@ -70,7 +72,7 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence)
 		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 		return 0;
 	}
-	fence->seq = atomic_add_return(1, &rdev->fence_drv[fence->ring].seq);
+	fence->seq = atomic64_inc_return(&rdev->fence_drv[fence->ring].seq);
 	radeon_fence_ring_emit(rdev, fence->ring, fence);
 	trace_radeon_fence_emit(rdev->ddev, fence->seq);
 	fence->emitted = true;
@@ -87,7 +89,7 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring)
 {
 	struct radeon_fence *fence;
 	struct list_head *i, *n;
-	uint32_t seq;
+	uint64_t seq;
 	bool wake = false;
 
 	seq = radeon_fence_read(rdev, ring);
@@ -185,7 +187,7 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 {
 	struct radeon_device *rdev;
 	unsigned long irq_flags, timeout;
-	u32 seq;
+	u64 seq;
 	int i, r;
 	bool signaled;
 
@@ -252,8 +254,8 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 		if (radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
 
 			/* good news we believe it's a lockup */
-			printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
-			       fence->seq, seq);
+			dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx last fence id 0x%016llx)\n",
+				 fence->seq, seq);
 
 			/* mark the ring as not ready any more */
 			rdev->ring[fence->ring].ready = false;
@@ -371,12 +373,12 @@ int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring)
 	int r;
 
 	write_lock_irqsave(&rdev->fence_lock, irq_flags);
-	radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg, 1);
+	radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg, 2);
 	if (rdev->wb.use_event) {
 		rdev->fence_drv[ring].scratch_reg = 0;
-		index = R600_WB_EVENT_OFFSET + ring * 4;
+		index = R600_WB_EVENT_OFFSET + ring * 8;
 	} else {
-		r = radeon_scratch_get(rdev, &rdev->fence_drv[ring].scratch_reg, 1);
+		r = radeon_scratch_get(rdev, &rdev->fence_drv[ring].scratch_reg, 2);
 		if (r) {
 			dev_err(rdev->dev, "fence failed to get scratch register\n");
 			write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
@@ -386,9 +388,9 @@ int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring)
 			rdev->fence_drv[ring].scratch_reg -
 			rdev->scratch.reg_base;
 	}
-	rdev->fence_drv[ring].cpu_addr = &rdev->wb.wb[index/4];
+	rdev->fence_drv[ring].cpu_addr = (u64*)&rdev->wb.wb[index/4];
 	rdev->fence_drv[ring].gpu_addr = rdev->wb.gpu_addr + index;
-	radeon_fence_write(rdev, atomic_read(&rdev->fence_drv[ring].seq), ring);
+	radeon_fence_write(rdev, atomic64_read(&rdev->fence_drv[ring].seq), ring);
 	rdev->fence_drv[ring].initialized = true;
 	DRM_INFO("fence driver on ring %d use gpu addr 0x%08Lx and cpu addr 0x%p\n",
 		 ring, rdev->fence_drv[ring].gpu_addr, rdev->fence_drv[ring].cpu_addr);
@@ -401,7 +403,7 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
 	rdev->fence_drv[ring].scratch_reg = -1;
 	rdev->fence_drv[ring].cpu_addr = NULL;
 	rdev->fence_drv[ring].gpu_addr = 0;
-	atomic_set(&rdev->fence_drv[ring].seq, 0);
+	atomic64_set(&rdev->fence_drv[ring].seq, 0);
 	INIT_LIST_HEAD(&rdev->fence_drv[ring].emitted);
 	INIT_LIST_HEAD(&rdev->fence_drv[ring].signaled);
 	init_waitqueue_head(&rdev->fence_drv[ring].queue);
@@ -435,7 +437,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
 		radeon_fence_wait_empty(rdev, ring);
 		wake_up_all(&rdev->fence_drv[ring].queue);
 		write_lock_irqsave(&rdev->fence_lock, irq_flags);
-		radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg, 1);
+		radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg, 2);
 		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 		rdev->fence_drv[ring].initialized = false;
 	}
@@ -459,12 +461,11 @@ static int radeon_debugfs_fence_info(struct seq_file *m, void *data)
 			continue;
 
 		seq_printf(m, "--- ring %d ---\n", i);
-		seq_printf(m, "Last signaled fence 0x%08X\n",
-			   radeon_fence_read(rdev, i));
+		seq_printf(m, "Last signaled fence 0x%016llx\n", radeon_fence_read(rdev, i));
 		if (!list_empty(&rdev->fence_drv[i].emitted)) {
 			fence = list_entry(rdev->fence_drv[i].emitted.prev,
 					   struct radeon_fence, list);
-			seq_printf(m, "Last emitted fence %p with 0x%08X\n",
+			seq_printf(m, "Last emitted fence %p with 0x%016llx\n",
 				   fence, fence->seq);
 		}
 	}
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
index eb49483..6de9610 100644
--- a/drivers/gpu/drm/radeon/si.c
+++ b/drivers/gpu/drm/radeon/si.c
@@ -1905,9 +1905,9 @@ void si_fence_ring_emit(struct radeon_device *rdev,
 	radeon_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE_EOP, 4));
 	radeon_ring_write(ring, EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5));
 	radeon_ring_write(ring, addr & 0xffffffff);
-	radeon_ring_write(ring, (upper_32_bits(addr) & 0xff) | DATA_SEL(1) | INT_SEL(2));
-	radeon_ring_write(ring, fence->seq);
-	radeon_ring_write(ring, 0);
+	radeon_ring_write(ring, (upper_32_bits(addr) & 0xff) | DATA_SEL(2) | INT_SEL(2));
+	radeon_ring_write(ring, lower_32_bits(fence->seq));
+	radeon_ring_write(ring, upper_32_bits(fence->seq));
 }
 
 /*
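To spell out the wraparound point from the commit message above (a hedged sketch, not code from this series; both helper names are made up): with a 32-bit counter a plain comparison breaks once the counter wraps, so a signed-distance test limited to a 2^31 window is needed, while a 64-bit counter will not wrap in any realistic lifetime, so the obvious comparison is always correct.

#include <stdbool.h>
#include <stdint.h>

/* 32-bit: wrap-aware signed-distance test, valid only within a 2^31 window */
static bool seq32_signaled(uint32_t last_seq, uint32_t seq)
{
	return (int32_t)(last_seq - seq) >= 0;
}

/* 64-bit: the counter effectively never wraps, so a plain compare works */
static bool seq64_signaled(uint64_t last_seq, uint64_t seq)
{
	return last_seq >= seq;
}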
On Wed, 2012-05-02 at 16:20 -0400, j.glisse@gmail.com wrote:
From: Jerome Glisse <jglisse@redhat.com>

This converts the fence code to use a uint64_t sequence number. The intention is to use the fact that uint64_t is big enough that we don't need to care about wraparound.

Tested with and without writeback, using 0xFFFFF000 as the initial fence sequence, thus allowing the wraparound from 32 bits to 64 bits to be tested.

Signed-off-by: Jerome Glisse <jglisse@redhat.com>
[...]
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 7733429..6da1535 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -386,9 +388,9 @@ int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring)
 			rdev->fence_drv[ring].scratch_reg -
 			rdev->scratch.reg_base;
 	}
-	rdev->fence_drv[ring].cpu_addr = &rdev->wb.wb[index/4];
+	rdev->fence_drv[ring].cpu_addr = (u64*)&rdev->wb.wb[index/4];
Might want to ensure cpu_addr is 64 bit aligned, or there might be trouble on some architectures.
With this change, Cayman cards will already use six scratch registers for the rings. It won't be possible to extend this scheme for even one additional ring, will it?
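One possible way to address the alignment concern above (a sketch only, assuming the writeback page itself is at least 8-byte aligned; ALIGN() is the kernel macro from linux/kernel.h):

	/* round the writeback offset up to an 8-byte boundary before
	 * taking a u64 pointer into the wb page */
	index = ALIGN(index, 8);
	rdev->fence_drv[ring].cpu_addr = (u64 *)&rdev->wb.wb[index / 4];
	rdev->fence_drv[ring].gpu_addr = rdev->wb.gpu_addr + index;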
On 03.05.2012 09:21, Michel Dänzer wrote:
[...]
Might want to ensure cpu_addr is 64 bit aligned, or there might be trouble on some architectures.
With this change, Cayman cards will already use six scratch registers for the rings. It won't be possible to extend this scheme for even one additional ring, will it?
That won't work anyway, since not all rings can deal with 64-bit fences, so we still need to use 32-bit signaling and extend it to 64 bits while processing the fence value.
Already working on that.
Christian.
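For reference, one way to do such an extension (a sketch of the general technique, not Christian's actual code): keep the upper 32 bits from the last known 64-bit sequence and bump them whenever the 32-bit part moves backwards, i.e. whenever the hardware counter wrapped.

#include <stdint.h>

/* widen a 32-bit value written by the ring into the driver's
 * 64-bit sequence space */
static uint64_t seq32_to_seq64(uint64_t last_seq64, uint32_t hw_seq)
{
	uint64_t seq = (last_seq64 & 0xffffffff00000000ULL) | hw_seq;

	/* the 32-bit part going backwards means the hardware wrapped */
	if (seq < last_seq64)
		seq += 0x100000000ULL;
	return seq;
}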
On Thu, May 3, 2012 at 7:39 AM, Christian König deathsimple@vodafone.de wrote:
[...]
That won't work anyway, since not all rings can deal with 64-bit fences, so we still need to use 32-bit signaling and extend it to 64 bits while processing the fence value.
Already working on that.
Christian.
This patch is fine with rings that can't directly emit 64 bits; all you have to do is fix the emit_fence callback to handle it properly, and then fix radeon_fence_read, which can be moved to a ring-specific callback. Anyway, the point is that this patchset works and is fine on the current set of rings we have, and it can work just as easily for rings that can't easily emit a 64-bit value. So please explain further why these patches can't work, because as I just explained, I don't see why.

I have uploaded a v2 of this patchset that handles Cayman and newer possibly running out of scratch regs, and I also fixed the alignment to be 64-bit: http://people.freedesktop.org/~glisse/reset3/
Cheers, Jerome
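To make the ring-specific callback idea concrete (a hypothetical sketch; the ops structure and reader are made-up names, and it reuses the seq32_to_seq64() sketch above):

/* hypothetical per-ring hook for reading back the fence value */
struct radeon_fence_ops {
	uint64_t (*fence_read)(struct radeon_device *rdev, int ring);
};

/* reader for a ring that can only write 32 bits: widen the value
 * against the last processed 64-bit sequence */
static uint64_t fence_read_32(struct radeon_device *rdev, int ring)
{
	uint32_t hw = le32_to_cpu(*(volatile uint32_t *)rdev->fence_drv[ring].cpu_addr);

	return seq32_to_seq64(rdev->fence_drv[ring].last_seq, hw);
}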
On Thu, May 3, 2012 at 11:56 AM, Jerome Glisse j.glisse@gmail.com wrote:
[...]
This patch is fine with rings that can't directly emit 64 bits; all you have to do is fix the emit_fence callback to handle it properly, and then fix radeon_fence_read, which can be moved to a ring-specific callback. Anyway, the point is that this patchset works and is fine on the current set of rings we have, and it can work just as easily for rings that can't easily emit a 64-bit value. So please explain further why these patches can't work, because as I just explained, I don't see why.

I have uploaded a v2 of this patchset that handles Cayman and newer possibly running out of scratch regs, and I also fixed the alignment to be 64-bit: http://people.freedesktop.org/~glisse/reset3/
FWIW, we don't actually use scratch regs any more on r6xx+ (non-AGP at least); it's just memory writes, so we could make the scratch pool bigger.
Alex
On Thu, May 3, 2012 at 12:29 PM, Alex Deucher alexdeucher@gmail.com wrote:
[...]
FWIW, we don't actually use scratch regs any more on r6xx+ (non-AGP at least); it's just memory writes, so we could make the scratch pool bigger.
Alex
That's what my v2 does: just drop scratch regs for Cayman and newer.
Cheers, Jerome
On 03.05.2012 18:34, Jerome Glisse wrote:
[...]
FWIW, we don't actually use scratch regs any more on r6xx+ (non-AGP at least); it's just memory writes, so we could make the scratch pool bigger.
Alex
That's what my v2 does: just drop scratch regs for Cayman and newer.
Cheers, Jerome
I actually always wanted to change that so that scratch regs are only allocated if writeback is really disabled; I just never had time to do so (sigh).
Cheers, Christian.
On Thu, May 3, 2012 at 12:34 PM, Jerome Glisse j.glisse@gmail.com wrote:
[...]
Btw, I uploaded a v3 with some more things like WARN_ON_ONCE for extra safety, better comments, and an attempt to mitigate the race between the CPU reading and the GPU writing the fence. http://people.freedesktop.org/~glisse/reset3/
Cheers, Jerome
On Thu, May 3, 2012 at 4:46 PM, Jerome Glisse j.glisse@gmail.com wrote:
[...]
Btw, I uploaded a v3 with some more things like WARN_ON_ONCE for extra safety, better comments, and an attempt to mitigate the race between the CPU reading and the GPU writing the fence. http://people.freedesktop.org/~glisse/reset3/
In patch: http://people.freedesktop.org/~glisse/reset3/0003-drm-radeon-rework-fence-ha...
You can drop this hunk:

+	if (rdev->family >= CHIP_CAYMAN) {
+		/* because there are so many rings on newer GPUs we can't use
+		 * scratch regs, thus we need to use events; on those GPUs there
+		 * is no AGP version and writing to system RAM has always
+		 * been reliable so far
+		 */
+		rdev->wb.enabled = true;
+		rdev->wb.use_event = true;
+	}
The code right below it already does the exact same thing. It could probably be extended to APUs as well, since they will never have AGP support. I'll send out a patch.
Alex
From: Alex Deucher <alexander.deucher@amd.com>
Use the family check rather than a DCE check for clarity; also, always use writeback on APUs, as there will never be AGP variants.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/radeon/radeon_device.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 94f8561..8479617 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -241,8 +241,8 @@ int radeon_wb_init(struct radeon_device *rdev)
 			rdev->wb.use_event = true;
 		}
 	}
-	/* always use writeback/events on NI */
-	if (ASIC_IS_DCE5(rdev)) {
+	/* always use writeback/events on NI, APUs */
+	if (rdev->family >= CHIP_PALM) {
 		rdev->wb.enabled = true;
 		rdev->wb.use_event = true;
 	}
On Thu, 2012-05-03 at 17:06 -0400, alexdeucher@gmail.com wrote:
From: Alex Deucher <alexander.deucher@amd.com>

Use the family check rather than a DCE check for clarity; also, always use writeback on APUs, as there will never be AGP variants.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
On Thu, May 3, 2012 at 5:04 PM, Alex Deucher alexdeucher@gmail.com wrote:
[...]
In patch: http://people.freedesktop.org/~glisse/reset3/0003-drm-radeon-rework-fence-ha...
You can drop this hunk:
+	if (rdev->family >= CHIP_CAYMAN) {
+		/* because there are so many rings on newer GPUs we can't use
+		 * scratch regs, thus we need to use events; on those GPUs there
+		 * is no AGP version and writing to system RAM has always
+		 * been reliable so far
+		 */
+		rdev->wb.enabled = true;
+		rdev->wb.use_event = true;
+	}
The code right below it already does the exact same thing. It could probably be extended to APUs as well, since they will never have AGP support. I'll send out a patch.
Alex
Good catch, patch updated: http://people.freedesktop.org/~glisse/reset3/
Cheers, Jerome
From: Jerome Glisse <jglisse@redhat.com>
Using a 64-bit fence sequence, we can directly compare sequence numbers to know whether a fence is signaled or not. Thus the fence list becomes useless, and so does the fence lock that mainly protected it.

Things like ring.ready are no longer behind a lock; this should be OK, as ring.ready is initialized once and will only change when facing a lockup. The worst case is that we return -EBUSY just after a successful GPU reset, or that we go into the wait state instead of returning -EBUSY (thus delaying the -EBUSY report to the fence wait caller).
Signed-off-by: Jerome Glisse <jglisse@redhat.com>
---
 drivers/gpu/drm/radeon/radeon.h        |    6 +-
 drivers/gpu/drm/radeon/radeon_device.c |    1 -
 drivers/gpu/drm/radeon/radeon_fence.c  |  278 ++++++++++++--------------------
 3 files changed, 102 insertions(+), 183 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 4c5a667..141aee2 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -286,15 +286,12 @@ struct radeon_fence_driver {
 	uint64_t last_seq;
 	unsigned long last_activity;
 	wait_queue_head_t queue;
-	struct list_head emitted;
-	struct list_head signaled;
 	bool initialized;
 };
 
 struct radeon_fence {
 	struct radeon_device *rdev;
 	struct kref kref;
-	struct list_head list;
 	/* protected by radeon_fence.lock */
 	uint64_t seq;
 	bool emitted;
@@ -316,7 +313,7 @@ int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
 int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
 struct radeon_fence *radeon_fence_ref(struct radeon_fence *fence);
 void radeon_fence_unref(struct radeon_fence **fence);
-int radeon_fence_count_emitted(struct radeon_device *rdev, int ring);
+unsigned radeon_fence_count_emitted(struct radeon_device *rdev, int ring);
 
 /*
  * Tiling registers
@@ -1488,7 +1485,6 @@ struct radeon_device {
 	struct radeon_mode_info mode_info;
 	struct radeon_scratch scratch;
 	struct radeon_mman mman;
-	rwlock_t fence_lock;
 	struct radeon_fence_driver fence_drv[RADEON_NUM_RINGS];
 	struct radeon_ring ring[RADEON_NUM_RINGS];
 	bool ib_pool_ready;
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 3880aad..290b8d0 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -746,7 +746,6 @@ int radeon_device_init(struct radeon_device *rdev,
 	mutex_init(&rdev->gem.mutex);
 	mutex_init(&rdev->pm.mutex);
 	mutex_init(&rdev->vram_mutex);
-	rwlock_init(&rdev->fence_lock);
 	INIT_LIST_HEAD(&rdev->gem.objects);
 	init_waitqueue_head(&rdev->irq.vblank_queue);
 	init_waitqueue_head(&rdev->irq.idle_queue);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 6da1535..3f34f7b 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -65,32 +65,22 @@ static u64 radeon_fence_read(struct radeon_device *rdev, int ring)
 int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence)
 {
-	unsigned long irq_flags;
-
-	write_lock_irqsave(&rdev->fence_lock, irq_flags);
 	if (fence->emitted) {
-		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 		return 0;
 	}
 	fence->seq = atomic64_inc_return(&rdev->fence_drv[fence->ring].seq);
+	/* there is a small chance that we overwrite a bigger last_emitted
+	 * value, but in normal usage this shouldn't happen
+	 */
 	radeon_fence_ring_emit(rdev, fence->ring, fence);
 	trace_radeon_fence_emit(rdev->ddev, fence->seq);
 	fence->emitted = true;
-	/* are we the first fence on a previusly idle ring? */
-	if (list_empty(&rdev->fence_drv[fence->ring].emitted)) {
-		rdev->fence_drv[fence->ring].last_activity = jiffies;
-	}
-	list_move_tail(&fence->list, &rdev->fence_drv[fence->ring].emitted);
-	write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 	return 0;
 }
 
-static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring)
+static bool radeon_fence_poll(struct radeon_device *rdev, int ring)
 {
-	struct radeon_fence *fence;
-	struct list_head *i, *n;
 	uint64_t seq;
-	bool wake = false;
 
 	seq = radeon_fence_read(rdev, ring);
 	if (seq == rdev->fence_drv[ring].last_seq)
@@ -98,40 +88,15 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring)
 
 	rdev->fence_drv[ring].last_seq = seq;
 	rdev->fence_drv[ring].last_activity = jiffies;
-
-	n = NULL;
-	list_for_each(i, &rdev->fence_drv[ring].emitted) {
-		fence = list_entry(i, struct radeon_fence, list);
-		if (fence->seq == seq) {
-			n = i;
-			break;
-		}
-	}
-	/* all fence previous to this one are considered as signaled */
-	if (n) {
-		i = n;
-		do {
-			n = i->prev;
-			list_move_tail(i, &rdev->fence_drv[ring].signaled);
-			fence = list_entry(i, struct radeon_fence, list);
-			fence->signaled = true;
-			i = n;
-		} while (i != &rdev->fence_drv[ring].emitted);
-		wake = true;
-	}
-	return wake;
+	return true;
 }
 
 static void radeon_fence_destroy(struct kref *kref)
 {
-	unsigned long irq_flags;
-	struct radeon_fence *fence;
+	struct radeon_fence *fence;
 
 	fence = container_of(kref, struct radeon_fence, kref);
-	write_lock_irqsave(&fence->rdev->fence_lock, irq_flags);
-	list_del(&fence->list);
 	fence->emitted = false;
-	write_unlock_irqrestore(&fence->rdev->fence_lock, irq_flags);
 	if (fence->semaphore.sa_bo) {
 		radeon_semaphore_free(fence->rdev, fence);
 	}
@@ -153,78 +118,95 @@ int radeon_fence_create(struct radeon_device *rdev,
 	(*fence)->seq = 0;
 	(*fence)->ring = ring;
 	(*fence)->semaphore.sa_bo = NULL;
-	INIT_LIST_HEAD(&(*fence)->list);
 	return 0;
 }
 
 bool radeon_fence_signaled(struct radeon_fence *fence)
 {
-	unsigned long irq_flags;
-	bool signaled = false;
+	struct radeon_device *rdev;
 
-	if (!fence)
+	if (!fence) {
 		return true;
+	}
 
-	write_lock_irqsave(&fence->rdev->fence_lock, irq_flags);
-	signaled = fence->signaled;
+	rdev = fence->rdev;
+	if (!fence->emitted) {
+		dev_warn(rdev->dev, "Querying an unemitted fence : %p !\n", fence);
+		fence->signaled = true;
+		return true;
+	}
+	if (fence->signaled) {
+		return true;
+	}
+	if (rdev->fence_drv[fence->ring].last_seq >= fence->seq) {
+		fence->signaled = true;
+		return true;
+	}
 	/* if we are shuting down report all fence as signaled */
-	if (fence->rdev->shutdown) {
-		signaled = true;
+	if (rdev->shutdown) {
+		fence->signaled = true;
+		return true;
 	}
-	if (!fence->emitted) {
-		WARN(1, "Querying an unemitted fence : %p !\n", fence);
-		signaled = true;
+	if (!fence->signaled) {
+		radeon_fence_poll(fence->rdev, fence->ring);
+		if (rdev->fence_drv[fence->ring].last_seq >= fence->seq) {
+			fence->signaled = true;
+			return true;
+		}
 	}
-	if (!signaled) {
-		radeon_fence_poll_locked(fence->rdev, fence->ring);
-		signaled = fence->signaled;
+	return fence->signaled;
+}
+
+bool radeon_seq_signaled(struct radeon_device *rdev, u64 seq, unsigned ring)
+{
+	if (rdev->fence_drv[ring].last_seq >= seq) {
+		return true;
+	}
+	radeon_fence_poll(rdev, ring);
+	if (rdev->fence_drv[ring].last_seq >= seq) {
+		return true;
 	}
-	write_unlock_irqrestore(&fence->rdev->fence_lock, irq_flags);
-	return signaled;
+	return false;
 }
 
-int radeon_fence_wait(struct radeon_fence *fence, bool intr)
+static int radeon_wait_seq(struct radeon_device *rdev, u64 target_seq,
+			   unsigned ring, bool intr)
 {
-	struct radeon_device *rdev;
-	unsigned long irq_flags, timeout;
-	u64 seq;
-	int i, r;
+	unsigned long timeout;
+	uint64_t seq;
 	bool signaled;
+	int r;
 
-	if (fence == NULL) {
-		WARN(1, "Querying an invalid fence : %p !\n", fence);
-		return -EINVAL;
-	}
+	while (target_seq > rdev->fence_drv[ring].last_seq) {
+		if (!rdev->ring[ring].ready) {
+			return -EBUSY;
+		}
 
-	rdev = fence->rdev;
-	signaled = radeon_fence_signaled(fence);
-	while (!signaled) {
-		read_lock_irqsave(&rdev->fence_lock, irq_flags);
 		timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
-		if (time_after(rdev->fence_drv[fence->ring].last_activity, timeout)) {
+		if (time_after(rdev->fence_drv[ring].last_activity, timeout)) {
 			/* the normal case, timeout is somewhere before last_activity */
-			timeout = rdev->fence_drv[fence->ring].last_activity - timeout;
+			timeout = rdev->fence_drv[ring].last_activity - timeout;
 		} else {
 			/* either jiffies wrapped around, or no fence was signaled in the last 500ms
-			 * anyway we will just wait for the minimum amount and then check for a lockup */
+			 * anyway we will just wait for the minimum amount and then check for a lockup
+			 */
 			timeout = 1;
 		}
 		/* save current sequence value used to check for GPU lockups */
-		seq = rdev->fence_drv[fence->ring].last_seq;
-		read_unlock_irqrestore(&rdev->fence_lock, irq_flags);
+		seq = rdev->fence_drv[ring].last_seq;
 
 		trace_radeon_fence_wait_begin(rdev->ddev, seq);
-		radeon_irq_kms_sw_irq_get(rdev, fence->ring);
+		radeon_irq_kms_sw_irq_get(rdev, ring);
 		if (intr) {
-			r = wait_event_interruptible_timeout(
-				rdev->fence_drv[fence->ring].queue,
-				(signaled = radeon_fence_signaled(fence)), timeout);
+			r = wait_event_interruptible_timeout(rdev->fence_drv[ring].queue,
							     (signaled = radeon_seq_signaled(rdev, target_seq, ring)),
							     timeout);
 		} else {
-			r = wait_event_timeout(
-				rdev->fence_drv[fence->ring].queue,
-				(signaled = radeon_fence_signaled(fence)), timeout);
+			r = wait_event_timeout(rdev->fence_drv[ring].queue,
					       (signaled = radeon_seq_signaled(rdev, target_seq, ring)),
					       timeout);
 		}
-		radeon_irq_kms_sw_irq_put(rdev, fence->ring);
+		radeon_irq_kms_sw_irq_put(rdev, ring);
 		if (unlikely(r < 0)) {
 			return r;
 		}
@@ -237,28 +219,18 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 			continue;
 		}
 
-		write_lock_irqsave(&rdev->fence_lock, irq_flags);
 		/* check if sequence value has changed since last_activity */
-		if (seq != rdev->fence_drv[fence->ring].last_seq) {
-			write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
+		if (seq != rdev->fence_drv[ring].last_seq) {
 			continue;
 		}
 
-		/* change sequence value on all rings, so nobody else things there is a lockup */
-		for (i = 0; i < RADEON_NUM_RINGS; ++i)
-			rdev->fence_drv[i].last_seq -= 0x10000;
-
-		rdev->fence_drv[fence->ring].last_activity = jiffies;
-		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-
-		if (radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
-
+		if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
 			/* good news we believe it's a lockup */
 			dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx last fence id 0x%016llx)\n",
-				 fence->seq, seq);
+				 target_seq, seq);
 
 			/* mark the ring as not ready any more */
-			rdev->ring[fence->ring].ready = false;
+			rdev->ring[ring].ready = false;
 			return -EDEADLK;
 		}
 	}
@@ -266,52 +238,31 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 	return 0;
 }
 
-int radeon_fence_wait_next(struct radeon_device *rdev, int ring)
+int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 {
-	unsigned long irq_flags;
-	struct radeon_fence *fence;
 	int r;
 
-	write_lock_irqsave(&rdev->fence_lock, irq_flags);
-	if (!rdev->ring[ring].ready) {
-		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-		return -EBUSY;
+	if (fence == NULL) {
+		WARN(1, "Querying an invalid fence : %p !\n", fence);
+		return -EINVAL;
 	}
-	if (list_empty(&rdev->fence_drv[ring].emitted)) {
-		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-		return -ENOENT;
+
+	r = radeon_wait_seq(fence->rdev, fence->seq, fence->ring, intr);
+	if (r) {
+		return r;
 	}
-	fence = list_entry(rdev->fence_drv[ring].emitted.next,
-			   struct radeon_fence, list);
-	radeon_fence_ref(fence);
-	write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-	r = radeon_fence_wait(fence, false);
-	radeon_fence_unref(&fence);
-	return r;
+	fence->signaled = true;
+	return 0;
 }
 
-int radeon_fence_wait_empty(struct radeon_device *rdev, int ring)
+int radeon_fence_wait_next(struct radeon_device *rdev, int ring)
 {
-	unsigned long irq_flags;
-	struct radeon_fence *fence;
-	int r;
+	return radeon_wait_seq(rdev, rdev->fence_drv[ring].last_seq + 1ULL, ring, false);
+}
 
-	write_lock_irqsave(&rdev->fence_lock, irq_flags);
-	if (!rdev->ring[ring].ready) {
-		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-		return -EBUSY;
-	}
-	if (list_empty(&rdev->fence_drv[ring].emitted)) {
-		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-		return 0;
-	}
-	fence = list_entry(rdev->fence_drv[ring].emitted.prev,
-			   struct radeon_fence, list);
-	radeon_fence_ref(fence);
-	write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-	r = radeon_fence_wait(fence, false);
-	radeon_fence_unref(&fence);
-	return r;
+int radeon_fence_wait_empty(struct radeon_device *rdev, int ring)
+{
+	return radeon_wait_seq(rdev, atomic64_read(&rdev->fence_drv[ring].seq), ring, false);
 }
 
 struct radeon_fence *radeon_fence_ref(struct radeon_fence *fence)
@@ -332,47 +283,32 @@ void radeon_fence_unref(struct radeon_fence **fence)
 
 void radeon_fence_process(struct radeon_device *rdev, int ring)
 {
-	unsigned long irq_flags;
 	bool wake;
 
-	write_lock_irqsave(&rdev->fence_lock, irq_flags);
-	wake = radeon_fence_poll_locked(rdev, ring);
-	write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
+	wake = radeon_fence_poll(rdev, ring);
 	if (wake) {
 		wake_up_all(&rdev->fence_drv[ring].queue);
 	}
 }
 
-int radeon_fence_count_emitted(struct radeon_device *rdev, int ring)
+unsigned radeon_fence_count_emitted(struct radeon_device *rdev, int ring)
 {
-	unsigned long irq_flags;
-	int not_processed = 0;
-
-	read_lock_irqsave(&rdev->fence_lock, irq_flags);
-	if (!rdev->fence_drv[ring].initialized) {
-		read_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-		return 0;
-	}
+	uint64_t emitted;
 

-    if (!list_empty(&rdev->fence_drv[ring].emitted)) {
-        struct list_head *ptr;
-        list_for_each(ptr, &rdev->fence_drv[ring].emitted) {
-            /* count up to 3, that's enought info */
-            if (++not_processed >= 3)
-                break;
-        }
+    radeon_fence_poll(rdev, ring);
+    emitted = atomic64_read(&rdev->fence_drv[ring].seq) - rdev->fence_drv[ring].last_seq;
+    /* clamp the count to avoid a 32-bit wrap around */
+    if (emitted > 0x10000000) {
+        emitted = 0x10000000;
     }
-    read_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-    return not_processed;
+    return (unsigned)emitted;
 }

 int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring)
 {
-    unsigned long irq_flags;
     uint64_t index;
     int r;

-    write_lock_irqsave(&rdev->fence_lock, irq_flags);
     radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg, 2);
     if (rdev->wb.use_event) {
         rdev->fence_drv[ring].scratch_reg = 0;
@@ -381,7 +317,6 @@ int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring)
         r = radeon_scratch_get(rdev, &rdev->fence_drv[ring].scratch_reg, 2);
         if (r) {
             dev_err(rdev->dev, "fence failed to get scratch register\n");
-            write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
             return r;
         }
         index = RADEON_WB_SCRATCH_OFFSET +
@@ -392,9 +327,8 @@ int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring)
     rdev->fence_drv[ring].gpu_addr = rdev->wb.gpu_addr + index;
     radeon_fence_write(rdev, atomic64_read(&rdev->fence_drv[ring].seq), ring);
     rdev->fence_drv[ring].initialized = true;
-    DRM_INFO("fence driver on ring %d use gpu addr 0x%08Lx and cpu addr 0x%p\n",
+    dev_info(rdev->dev, "fence driver on ring %d use gpu addr 0x%08Lx and cpu addr 0x%p\n",
          ring, rdev->fence_drv[ring].gpu_addr, rdev->fence_drv[ring].cpu_addr);
-    write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
     return 0;
 }

@@ -403,23 +337,20 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
     rdev->fence_drv[ring].scratch_reg = -1;
     rdev->fence_drv[ring].cpu_addr = NULL;
     rdev->fence_drv[ring].gpu_addr = 0;
+    rdev->fence_drv[ring].last_activity = jiffies;
     atomic64_set(&rdev->fence_drv[ring].seq, 0);
-    INIT_LIST_HEAD(&rdev->fence_drv[ring].emitted);
-    INIT_LIST_HEAD(&rdev->fence_drv[ring].signaled);
+    rdev->fence_drv[ring].last_seq = 0;
     init_waitqueue_head(&rdev->fence_drv[ring].queue);
     rdev->fence_drv[ring].initialized = false;
 }

 int radeon_fence_driver_init(struct radeon_device *rdev)
 {
-    unsigned long irq_flags;
     int ring;

-    write_lock_irqsave(&rdev->fence_lock, irq_flags);
     for (ring = 0; ring < RADEON_NUM_RINGS; ring++) {
         radeon_fence_driver_init_ring(rdev, ring);
     }
-    write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
     if (radeon_debugfs_fence_init(rdev)) {
         dev_err(rdev->dev, "fence debugfs file creation failed\n");
     }
@@ -428,7 +359,6 @@ int radeon_fence_driver_init(struct radeon_device *rdev)

 void radeon_fence_driver_fini(struct radeon_device *rdev)
 {
-    unsigned long irq_flags;
     int ring;

     for (ring = 0; ring < RADEON_NUM_RINGS; ring++) {
@@ -436,9 +366,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
             continue;
         radeon_fence_wait_empty(rdev, ring);
         wake_up_all(&rdev->fence_drv[ring].queue);
-        write_lock_irqsave(&rdev->fence_lock, irq_flags);
         radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg, 2);
-        write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
         rdev->fence_drv[ring].initialized = false;
     }
 }

@@ -453,7 +381,6 @@ static int radeon_debugfs_fence_info(struct seq_file *m, void *data)
     struct drm_info_node *node = (struct drm_info_node *)m->private;
     struct drm_device *dev = node->minor->dev;
     struct radeon_device *rdev = dev->dev_private;
-    struct radeon_fence *fence;
     int i;

     for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -461,13 +388,10 @@ static int radeon_debugfs_fence_info(struct seq_file *m, void *data)
             continue;

         seq_printf(m, "--- ring %d ---\n", i);
-        seq_printf(m, "Last signaled fence 0x%016llx\n", radeon_fence_read(rdev, i));
-        if (!list_empty(&rdev->fence_drv[i].emitted)) {
-            fence = list_entry(rdev->fence_drv[i].emitted.prev,
-                       struct radeon_fence, list);
-            seq_printf(m, "Last emitted fence %p with 0x%016llx\n",
-                   fence, fence->seq);
-        }
+        seq_printf(m, "Last signaled 0x%016llx\n",
+               radeon_fence_read(rdev, i));
+        seq_printf(m, "Last emitted 0x%016llx\n",
+               (unsigned long long)atomic64_read(&rdev->fence_drv[i].seq));
     }
     return 0;
 }
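Side note on the wait path above: the wait conditions now test radeon_seq_signaled(rdev, target_seq, ring) instead of radeon_fence_signaled(fence). That helper is added by an earlier hunk of this patch which isn't quoted here; a minimal sketch of the intended semantics (hypothetical code, assuming the helper does nothing more than poll the ring and compare 64-bit sequence numbers):

static bool radeon_seq_signaled(struct radeon_device *rdev,
                                u64 target_seq, unsigned ring)
{
    /* fast path: the CPU side already saw a sequence >= target_seq */
    if (rdev->fence_drv[ring].last_seq >= target_seq)
        return true;
    /* otherwise pick up any fence value the GPU wrote since the last poll */
    radeon_fence_poll(rdev, ring);
    return rdev->fence_drv[ring].last_seq >= target_seq;
}

Because the sequence is 64 bits it can't realistically wrap, which is what lets radeon_fence_wait_next() simply wait on last_seq + 1 and radeon_fence_wait_empty() wait on the last emitted value, with no list walking at all.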
From: Jerome Glisse jglisse@redhat.com
With the fence rework it's now easier to aggressively free idle bos when there is no hole to satisfy the current allocation request. The hit of some cs ioctls having to go through the sa bo list and free entries is minimal: it happens only once in a while and avoids some fence waiting.
Signed-off-by: Jerome Glisse jglisse@redhat.com
---
 drivers/gpu/drm/radeon/radeon.h       |    1 +
 drivers/gpu/drm/radeon/radeon_fence.c |    2 +-
 drivers/gpu/drm/radeon/radeon_sa.c    |   51 +++++++++++++++++++++++++++++---
 3 files changed, 48 insertions(+), 6 deletions(-)
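For orientation before the diff, the allocation path this touches looks roughly like the following (a simplified sketch, not the patch code: radeon_sa_find_hole() and radeon_sa_install() are made-up names standing in for the real hole-scanning logic, and locking is left out):

retry:
    /* scan the managed buffer for a hole big enough for the request */
    if (radeon_sa_find_hole(sa_manager, size, &offset))
        return radeon_sa_install(sa_manager, offset, size);
    /* no hole: free bos whose fences already signaled, oldest first,
     * then rescan instead of immediately waiting on a fence
     */
    oldest = list_entry(sa_manager->sa_bo.next, struct radeon_sa_bo, list);
    if (radeon_sa_manager_try_free(rdev, sa_manager, oldest))
        goto retry;
    /* nothing idle to reclaim, fall back to waiting on the oldest fence */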
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 141aee2..5459722 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -314,6 +314,7 @@ int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
 struct radeon_fence *radeon_fence_ref(struct radeon_fence *fence);
 void radeon_fence_unref(struct radeon_fence **fence);
 unsigned radeon_fence_count_emitted(struct radeon_device *rdev, int ring);
+bool radeon_fence_poll(struct radeon_device *rdev, int ring);

 /*
  * Tiling registers
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 3f34f7b..043f431 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -78,7 +78,7 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence)
     return 0;
 }

-static bool radeon_fence_poll(struct radeon_device *rdev, int ring)
+bool radeon_fence_poll(struct radeon_device *rdev, int ring)
 {
     uint64_t seq;

diff --git a/drivers/gpu/drm/radeon/radeon_sa.c b/drivers/gpu/drm/radeon/radeon_sa.c
index e758aaa..2cbf5ba 100644
--- a/drivers/gpu/drm/radeon/radeon_sa.c
+++ b/drivers/gpu/drm/radeon/radeon_sa.c
@@ -48,6 +48,10 @@
 #include "drm.h"
 #include "radeon.h"

+static bool radeon_sa_manager_try_free(struct radeon_device *rdev,
+                                       struct radeon_sa_manager *sa_manager,
+                                       struct radeon_sa_bo *oldest);
+
 int radeon_sa_bo_manager_init(struct radeon_device *rdev,
                               struct radeon_sa_manager *sa_manager,
                               unsigned size, u32 domain)
@@ -77,7 +81,16 @@ void radeon_sa_bo_manager_fini(struct radeon_device *rdev,
     struct radeon_sa_bo *sa_bo, *tmp;

     if (!list_empty(&sa_manager->sa_bo)) {
-        dev_err(rdev->dev, "sa_manager is not empty, clearing anyway\n");
+        struct radeon_sa_bo *oldest;
+
+        /* try to free them */
+        oldest = list_entry(sa_manager->sa_bo.next, struct radeon_sa_bo, list);
+        radeon_sa_manager_try_free(rdev, sa_manager, oldest);
+
+        if (!list_empty(&sa_manager->sa_bo)) {
+            /* something went wrong */
+            dev_err(rdev->dev, "sa_manager is not empty, clearing anyway\n");
+        }
     }
     list_for_each_entry_safe(sa_bo, tmp, &sa_manager->sa_bo, list) {
         list_del_init(&sa_bo->list);
@@ -171,15 +184,43 @@ static void radeon_sa_bo_free_locked(struct radeon_device *rdev, struct radeon_s
 }

 static bool radeon_sa_manager_try_free(struct radeon_device *rdev,
+                                       struct radeon_sa_manager *sa_manager,
                                        struct radeon_sa_bo *oldest)
 {
-    if (oldest->fence && oldest->fence->emitted) {
-        if (radeon_fence_signaled(oldest->fence)) {
+    struct radeon_sa_bo *tmp, *sa_bo;
+    unsigned ring, free_count = 0;
+
+    if (oldest->fence == NULL || !oldest->fence->emitted) {
+        return false;
+    }
+    ring = oldest->fence->ring;
+    radeon_fence_poll(rdev, ring);
+    if (rdev->fence_drv[ring].last_seq < oldest->fence->seq) {
+        return false;
+    }
+    free_count++;
+    /* go over the remainder of the list and try to free as much
+     * as possible
+     */
+    sa_bo = oldest;
+    list_for_each_entry_safe_continue(sa_bo, tmp, &sa_manager->sa_bo, list) {
+        if (sa_bo->fence == NULL || !sa_bo->fence->emitted) {
+            radeon_sa_bo_free_locked(rdev, oldest);
+            return true;
+        }
+        if (ring != sa_bo->fence->ring) {
+            ring = sa_bo->fence->ring;
+            radeon_fence_poll(rdev, ring);
+        }
+        if (rdev->fence_drv[ring].last_seq < sa_bo->fence->seq) {
             radeon_sa_bo_free_locked(rdev, oldest);
             return true;
         }
+        radeon_sa_bo_free_locked(rdev, sa_bo);
+        free_count++;
     }
-    return false;
+    radeon_sa_bo_free_locked(rdev, oldest);
+    return true;
 }

 /*
@@ -260,7 +301,7 @@ retry:
         }
     }
     /* try to be optimist and free the oldest one */
-    if (radeon_sa_manager_try_free(rdev, oldest)) {
+    if (radeon_sa_manager_try_free(rdev, sa_manager, oldest)) {
         goto retry;
     }
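One helper worth knowing to read the free loop above: list_for_each_entry_safe_continue() resumes iteration at the entry after the cursor and keeps a lookahead pointer, so the current entry can be freed mid-walk. A stand-alone illustration (hypothetical example, not driver code; assumes the list is non-empty):

#include <linux/list.h>
#include <linux/slab.h>

struct item {
    struct list_head list;
    int id;
};

/* free every entry after the first one on 'head' */
static void drain_after_first(struct list_head *head)
{
    struct item *pos, *tmp;

    pos = list_first_entry(head, struct item, list);
    /* iteration starts at pos->list.next; 'tmp' always points one
     * entry ahead, so freeing 'pos' inside the body is safe
     */
    list_for_each_entry_safe_continue(pos, tmp, head, list) {
        list_del(&pos->list);
        kfree(pos);
    }
}

That is exactly the pattern radeon_sa_manager_try_free() relies on: oldest is handled separately, and the safe_continue walk covers everything queued after it.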