On 31.01.2013 22:37, alexdeucher@gmail.com wrote:
From: Alex Deucher <alexander.deucher@amd.com>
For very large page table updates, we can exceed the size of the ring. To avoid this, use an IB to perform the page table update.
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
We may want to do something similar for the BO move code as well, since we could potentially run out of ring space on a small ring like the DMA rings when moving very large buffers.
Alex
 drivers/gpu/drm/radeon/ni.c          | 30 +++++++++++--------
 drivers/gpu/drm/radeon/radeon.h      |  6 ++-
 drivers/gpu/drm/radeon/radeon_asic.h |  8 ++++-
 drivers/gpu/drm/radeon/radeon_gart.c | 45 +++++++++++++++++++++++------
 drivers/gpu/drm/radeon/si.c          | 52 ++++++++++++++++++---------------
 5 files changed, 90 insertions(+), 51 deletions(-)
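The mechanical change in every backend is the same: instead of reserving ring space and emitting each packet with radeon_ring_write(), the set_page callbacks append the same dwords to a caller-provided IB, which is then scheduled on the ring as a single small packet. A minimal before/after sketch of the pattern, distilled from the cayman CP path in the patch below (surrounding loop omitted):

        /* before: every dword goes straight to the ring, so a huge
         * page table update has to fit in the ring itself */
        radeon_ring_write(ring, PACKET3(PACKET3_ME_WRITE, ndw));
        radeon_ring_write(ring, pe);
        radeon_ring_write(ring, upper_32_bits(pe) & 0xff);

        /* after: the dwords accumulate in an indirect buffer sized up
         * front (ndw * 4 bytes); only the small IB-execute packet and
         * fence still have to fit in the ring */
        ib->ptr[ib->length_dw++] = PACKET3(PACKET3_ME_WRITE, ndw);
        ib->ptr[ib->length_dw++] = pe;
        ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;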
diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index 170bd03..7cead76 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -1946,19 +1946,21 @@ uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t flags)
 /**
  * cayman_vm_set_page - update the page tables using the CP
  *
  * @rdev: radeon_device pointer
+ * @ib: indirect buffer to fill with commands
  * @pe: addr of the page entry
  * @addr: dst addr to write into pe
  * @count: number of page entries to update
  * @incr: increase next addr by incr bytes
  * @flags: access flags
  *
- * Update the page tables using the CP (cayman-si).
+ * Update the page tables using the CP (cayman/TN).
  */
-void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
+void cayman_vm_set_page(struct radeon_device *rdev,
+                        struct radeon_ib *ib,
+                        uint64_t pe,
                         uint64_t addr, unsigned count,
                         uint32_t incr, uint32_t flags)
 {
-        struct radeon_ring *ring = &rdev->ring[rdev->asic->vm.pt_ring_index];
         uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
         uint64_t value;
         unsigned ndw;
@@ -1969,9 +1971,9 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                         if (ndw > 0x3FFF)
                                 ndw = 0x3FFF;
 
-                        radeon_ring_write(ring, PACKET3(PACKET3_ME_WRITE, ndw));
-                        radeon_ring_write(ring, pe);
-                        radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
+                        ib->ptr[ib->length_dw++] = PACKET3(PACKET3_ME_WRITE, ndw);
+                        ib->ptr[ib->length_dw++] = pe;
+                        ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
                         for (; ndw > 1; ndw -= 2, --count, pe += 8) {
                                 if (flags & RADEON_VM_PAGE_SYSTEM) {
                                         value = radeon_vm_map_gart(rdev, addr);
@@ -1983,8 +1985,8 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                                 }
                                 addr += incr;
                                 value |= r600_flags;
-                                radeon_ring_write(ring, value);
-                                radeon_ring_write(ring, upper_32_bits(value));
+                                ib->ptr[ib->length_dw++] = value;
+                                ib->ptr[ib->length_dw++] = upper_32_bits(value);
                         }
                 }
         } else {
@@ -1994,9 +1996,9 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                         if (ndw > 0xFFFFE)
                                 ndw = 0xFFFFE;
 
                         /* for non-physically contiguous pages (system) */
-                        radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw));
-                        radeon_ring_write(ring, pe);
-                        radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
+                        ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, ndw);
+                        ib->ptr[ib->length_dw++] = pe;
+                        ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
                         for (; ndw > 0; ndw -= 2, --count, pe += 8) {
                                 if (flags & RADEON_VM_PAGE_SYSTEM) {
                                         value = radeon_vm_map_gart(rdev, addr);
@@ -2008,10 +2010,12 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                                 }
                                 addr += incr;
                                 value |= r600_flags;
-                                radeon_ring_write(ring, value);
-                                radeon_ring_write(ring, upper_32_bits(value));
+                                ib->ptr[ib->length_dw++] = value;
+                                ib->ptr[ib->length_dw++] = upper_32_bits(value);
                         }
                 }
+                while (ib->length_dw & 0x7)
+                        ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0);
         }
 }
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 6539d6c..89b9645 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -1188,7 +1188,9 @@ struct radeon_asic {
                 void (*fini)(struct radeon_device *rdev);
 
                 u32 pt_ring_index;
-                void (*set_page)(struct radeon_device *rdev, uint64_t pe,
+                void (*set_page)(struct radeon_device *rdev,
+                                 struct radeon_ib *ib,
+                                 uint64_t pe,
                                  uint64_t addr, unsigned count,
                                  uint32_t incr, uint32_t flags);
         } vm;
@@ -1810,7 +1812,7 @@ void radeon_ring_write(struct radeon_ring *ring, uint32_t v);
 #define radeon_gart_set_page(rdev, i, p) (rdev)->asic->gart.set_page((rdev), (i), (p))
 #define radeon_asic_vm_init(rdev) (rdev)->asic->vm.init((rdev))
 #define radeon_asic_vm_fini(rdev) (rdev)->asic->vm.fini((rdev))
-#define radeon_asic_vm_set_page(rdev, pe, addr, count, incr, flags) ((rdev)->asic->vm.set_page((rdev), (pe), (addr), (count), (incr), (flags)))
+#define radeon_asic_vm_set_page(rdev, ib, pe, addr, count, incr, flags) ((rdev)->asic->vm.set_page((rdev), (ib), (pe), (addr), (count), (incr), (flags)))
 #define radeon_ring_start(rdev, r, cp) (rdev)->asic->ring[(r)].ring_start((rdev), (cp))
 #define radeon_ring_test(rdev, r, cp) (rdev)->asic->ring[(r)].ring_test((rdev), (cp))
 #define radeon_ib_test(rdev, r, cp) (rdev)->asic->ring[(r)].ib_test((rdev), (cp))
diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h
index e429e25..f4134a8 100644
--- a/drivers/gpu/drm/radeon/radeon_asic.h
+++ b/drivers/gpu/drm/radeon/radeon_asic.h
@@ -474,7 +474,9 @@ int cayman_vm_init(struct radeon_device *rdev);
 void cayman_vm_fini(struct radeon_device *rdev);
 void cayman_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
 uint32_t cayman_vm_page_flags(struct radeon_device *rdev, uint32_t flags);
-void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe,
+void cayman_vm_set_page(struct radeon_device *rdev,
+                        struct radeon_ib *ib,
+                        uint64_t pe,
                         uint64_t addr, unsigned count,
                         uint32_t incr, uint32_t flags);
 int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib);
@@ -506,7 +508,9 @@ int si_irq_set(struct radeon_device *rdev);
 int si_irq_process(struct radeon_device *rdev);
 int si_vm_init(struct radeon_device *rdev);
 void si_vm_fini(struct radeon_device *rdev);
-void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
+void si_vm_set_page(struct radeon_device *rdev,
+                    struct radeon_ib *ib,
+                    uint64_t pe,
                     uint64_t addr, unsigned count,
                     uint32_t incr, uint32_t flags);
 void si_vm_flush(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
index 6e24f84..d95c61d 100644
--- a/drivers/gpu/drm/radeon/radeon_gart.c
+++ b/drivers/gpu/drm/radeon/radeon_gart.c
@@ -929,6 +929,7 @@ uint64_t radeon_vm_map_gart(struct radeon_device *rdev, uint64_t addr)
  */
 static int radeon_vm_update_pdes(struct radeon_device *rdev,
                                  struct radeon_vm *vm,
+                                 struct radeon_ib *ib,
                                  uint64_t start, uint64_t end)
 {
         static const uint32_t incr = RADEON_VM_PTE_COUNT * 8;
@@ -971,7 +972,7 @@ retry:
                     ((last_pt + incr * count) != pt)) {
 
                         if (count) {
-                                radeon_asic_vm_set_page(rdev, last_pde,
+                                radeon_asic_vm_set_page(rdev, ib, last_pde,
                                                         last_pt, count, incr,
                                                         RADEON_VM_PAGE_VALID);
                         }
@@ -985,7 +986,7 @@ retry:
         }
 
         if (count) {
-                radeon_asic_vm_set_page(rdev, last_pde, last_pt, count,
+                radeon_asic_vm_set_page(rdev, ib, last_pde, last_pt, count,
                                         incr, RADEON_VM_PAGE_VALID);
         }
@@ -1009,6 +1010,7 @@ retry:
  */
 static void radeon_vm_update_ptes(struct radeon_device *rdev,
                                   struct radeon_vm *vm,
+                                  struct radeon_ib *ib,
                                   uint64_t start, uint64_t end,
                                   uint64_t dst, uint32_t flags)
 {
@@ -1038,7 +1040,7 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
                 if ((last_pte + 8 * count) != pte) {
 
                         if (count) {
-                                radeon_asic_vm_set_page(rdev, last_pte,
+                                radeon_asic_vm_set_page(rdev, ib, last_pte,
                                                         last_dst, count,
                                                         RADEON_GPU_PAGE_SIZE, flags);
@@ -1056,7 +1058,8 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev,
         }
 
         if (count) {
-                radeon_asic_vm_set_page(rdev, last_pte, last_dst, count,
+                radeon_asic_vm_set_page(rdev, ib, last_pte,
+                                        last_dst, count,
                                         RADEON_GPU_PAGE_SIZE, flags);
         }
 }
@@ -1082,6 +1085,7 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
         unsigned ridx = rdev->asic->vm.pt_ring_index;
         struct radeon_ring *ring = &rdev->ring[ridx];
         struct radeon_semaphore *sem = NULL;
+        struct radeon_ib ib;
         struct radeon_bo_va *bo_va;
         unsigned nptes, npdes, ndw;
         uint64_t addr;
@@ -1140,9 +1144,8 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
         /* assume two extra pdes in case the mapping overlaps the borders */
         npdes = (nptes >> RADEON_VM_BLOCK_SIZE) + 2;
 
-        /* estimate number of dw needed */
-        /* semaphore, fence and padding */
-        ndw = 32;
+        /* padding, etc. */
+        ndw = 64;
 
         if (RADEON_VM_BLOCK_SIZE > 11)
                 /* reserve space for one header for every 2k dwords */
@@ -1161,8 +1164,25 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
         /* reserve space for pde addresses */
         ndw += npdes * 2;
 
-        r = radeon_ring_lock(rdev, ring, ndw);
-        if (r)
-                return r;
+        /* update too big for an IB */
+        if (ndw > 0xfffff)
+                return -ENOMEM;
+
+        r = radeon_sa_bo_new(rdev, &rdev->ring_tmp_bo, &ib.sa_bo, ndw * 4, 256, true);
+        if (r) {
+                return r;
+        }
+
+        ib.ring = ridx;
+        ib.fence = NULL;
+        ib.ptr = radeon_sa_bo_cpu_addr(ib.sa_bo);
+        ib.vm = NULL;
+        ib.gpu_addr = radeon_sa_bo_gpu_addr(ib.sa_bo);
+        ib.is_const_ib = false;
+        ib.length_dw = 0;
Wouldn't it make sense to use the IB functions (ib_get, ib_schedule, ib_free) here instead of filling the IB structure manually?
Apart from that it looks quite good.
Christian.
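For illustration, a rough sketch of what that suggestion might look like with the existing helpers, assuming the current radeon_ib_get(rdev, ring, ib, vm, size), radeon_ib_schedule(rdev, ib, const_ib) and radeon_ib_free(rdev, ib) signatures (error handling abbreviated; this is not part of the patch):

        struct radeon_ib ib;
        int r;

        /* radeon_ib_get() allocates the SA bo and sets up
         * ptr/gpu_addr/ring for us, instead of filling the
         * struct by hand */
        r = radeon_ib_get(rdev, ridx, &ib, NULL, ndw * 4);
        if (r)
                return r;
        ib.length_dw = 0;

        /* ... fill ib.ptr[] via radeon_vm_update_pdes()/_ptes() ... */

        /* radeon_ib_schedule() locks the ring, executes the IB and
         * emits the fence, replacing the manual
         * radeon_ring_ib_execute()/radeon_fence_emit() pair;
         * vm->fence bookkeeping omitted here */
        r = radeon_ib_schedule(rdev, &ib, NULL);
        if (r) {
                radeon_ib_free(rdev, &ib);
                return r;
        }
        radeon_ib_free(rdev, &ib);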
+
+        r = radeon_ring_lock(rdev, ring, 64);
         if (r) {
+                radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
                 return r;
         }
@@ -1171,23 +1191,28 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
                 radeon_fence_note_sync(vm->fence, ridx);
         }
 
-        r = radeon_vm_update_pdes(rdev, vm, bo_va->soffset, bo_va->eoffset);
+        r = radeon_vm_update_pdes(rdev, vm, &ib, bo_va->soffset, bo_va->eoffset);
         if (r) {
+                radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
                 radeon_ring_unlock_undo(rdev, ring);
                 return r;
         }
 
-        radeon_vm_update_ptes(rdev, vm, bo_va->soffset, bo_va->eoffset,
+        radeon_vm_update_ptes(rdev, vm, &ib, bo_va->soffset, bo_va->eoffset,
                               addr, bo_va->flags);
 
+        radeon_ring_ib_execute(rdev, ib.ring, &ib);
+
         radeon_fence_unref(&vm->fence);
         r = radeon_fence_emit(rdev, &vm->fence, ridx);
         if (r) {
+                radeon_sa_bo_free(rdev, &ib.sa_bo, NULL);
                 radeon_ring_unlock_undo(rdev, ring);
                 return r;
         }
         radeon_ring_unlock_commit(rdev, ring);
         radeon_semaphore_free(rdev, &sem, vm->fence);
+        radeon_sa_bo_free(rdev, &ib.sa_bo, vm->fence);
         radeon_fence_unref(&vm->last_flush);
 
         return 0;
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
index cd83bc5..a910cb9 100644
--- a/drivers/gpu/drm/radeon/si.c
+++ b/drivers/gpu/drm/radeon/si.c
@@ -3043,19 +3043,21 @@ void si_vm_fini(struct radeon_device *rdev)
 /**
  * si_vm_set_page - update the page tables using the CP
  *
  * @rdev: radeon_device pointer
+ * @ib: indirect buffer to fill with commands
  * @pe: addr of the page entry
  * @addr: dst addr to write into pe
  * @count: number of page entries to update
  * @incr: increase next addr by incr bytes
  * @flags: access flags
  *
- * Update the page tables using the CP (cayman-si).
+ * Update the page tables using the CP (SI).
  */
-void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
+void si_vm_set_page(struct radeon_device *rdev,
+                    struct radeon_ib *ib,
+                    uint64_t pe,
                     uint64_t addr, unsigned count,
                     uint32_t incr, uint32_t flags)
 {
-        struct radeon_ring *ring = &rdev->ring[rdev->asic->vm.pt_ring_index];
         uint32_t r600_flags = cayman_vm_page_flags(rdev, flags);
         uint64_t value;
         unsigned ndw;
@@ -3066,11 +3068,11 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                         if (ndw > 0x3FFE)
                                 ndw = 0x3FFE;
 
-                        radeon_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, ndw));
-                        radeon_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-                                                 WRITE_DATA_DST_SEL(1)));
-                        radeon_ring_write(ring, pe);
-                        radeon_ring_write(ring, upper_32_bits(pe));
+                        ib->ptr[ib->length_dw++] = PACKET3(PACKET3_WRITE_DATA, ndw);
+                        ib->ptr[ib->length_dw++] = (WRITE_DATA_ENGINE_SEL(0) |
+                                                    WRITE_DATA_DST_SEL(1));
+                        ib->ptr[ib->length_dw++] = pe;
+                        ib->ptr[ib->length_dw++] = upper_32_bits(pe);
                         for (; ndw > 2; ndw -= 2, --count, pe += 8) {
                                 if (flags & RADEON_VM_PAGE_SYSTEM) {
                                         value = radeon_vm_map_gart(rdev, addr);
@@ -3082,8 +3084,8 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                                 }
                                 addr += incr;
                                 value |= r600_flags;
-                                radeon_ring_write(ring, value);
-                                radeon_ring_write(ring, upper_32_bits(value));
+                                ib->ptr[ib->length_dw++] = value;
+                                ib->ptr[ib->length_dw++] = upper_32_bits(value);
                         }
                 }
         } else {
@@ -3095,9 +3097,9 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                                 if (ndw > 0xFFFFE)
                                         ndw = 0xFFFFE;
 
                                 /* for non-physically contiguous pages (system) */
-                                radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw));
-                                radeon_ring_write(ring, pe);
-                                radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
+                                ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 0, ndw);
+                                ib->ptr[ib->length_dw++] = pe;
+                                ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
                                 for (; ndw > 0; ndw -= 2, --count, pe += 8) {
                                         if (flags & RADEON_VM_PAGE_SYSTEM) {
                                                 value = radeon_vm_map_gart(rdev, addr);
@@ -3109,8 +3111,8 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                                         }
                                         addr += incr;
                                         value |= r600_flags;
-                                        radeon_ring_write(ring, value);
-                                        radeon_ring_write(ring, upper_32_bits(value));
+                                        ib->ptr[ib->length_dw++] = value;
+                                        ib->ptr[ib->length_dw++] = upper_32_bits(value);
                                 }
                         }
                 } else {
@@ -3124,20 +3126,22 @@ void si_vm_set_page(struct radeon_device *rdev, uint64_t pe,
                                 else
                                         value = 0;
                                 /* for physically contiguous pages (vram) */
-                                radeon_ring_write(ring, DMA_PTE_PDE_PACKET(ndw));
-                                radeon_ring_write(ring, pe); /* dst addr */
-                                radeon_ring_write(ring, upper_32_bits(pe) & 0xff);
-                                radeon_ring_write(ring, r600_flags); /* mask */
-                                radeon_ring_write(ring, 0);
-                                radeon_ring_write(ring, value); /* value */
-                                radeon_ring_write(ring, upper_32_bits(value));
-                                radeon_ring_write(ring, incr); /* increment size */
-                                radeon_ring_write(ring, 0);
+                                ib->ptr[ib->length_dw++] = DMA_PTE_PDE_PACKET(ndw);
+                                ib->ptr[ib->length_dw++] = pe; /* dst addr */
+                                ib->ptr[ib->length_dw++] = upper_32_bits(pe) & 0xff;
+                                ib->ptr[ib->length_dw++] = r600_flags; /* mask */
+                                ib->ptr[ib->length_dw++] = 0;
+                                ib->ptr[ib->length_dw++] = value; /* value */
+                                ib->ptr[ib->length_dw++] = upper_32_bits(value);
+                                ib->ptr[ib->length_dw++] = incr; /* increment size */
+                                ib->ptr[ib->length_dw++] = 0;
                                 pe += ndw * 4;
                                 addr += (ndw / 2) * incr;
                                 count -= ndw / 2;
                         }
                 }
+                while (ib->length_dw & 0x7)
+                        ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_NOP, 0, 0, 0, 0);
         }
 }