Trying to resolve the remaining bugs today. Expect a v3 of the patch this evening or Monday morning.
Cheers, Christian.
On 04.10.2012 16:32, Dmitry Cherkasov wrote:
v2: set up and allocate a number of contiguous PTs if possible.
Warning: the Heaven benchmark /sometimes/ fails with this patch after 10 or 15 minutes of running, so any insight is greatly appreciated.
The code is a bit bloated because it is still an open question how a decent optimization should be done: via macros? using structs? etc.
The rationale for struct radeon_pt is that a BO may contain several contiguous PTs, so we need the u64 gpu_addr to point to the actual beginning of each PT (see the sketch below).
I've only tested it on a Cayman card; it should work on SI, but who knows? ;)
Please share your ideas.
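To make the struct radeon_pt rationale concrete, here is a minimal standalone sketch (not part of the patch; pt_entry, pt_base_gpu_addr and NUM_CONTIGUOUS_PTS are made-up names for illustration) of how one suballocated BO can back several contiguous page tables, with each table's gpu_addr derived from the base address of that BO:

#include <stdint.h>
#include <stdio.h>

#define RADEON_VM_BLOCK_SIZE 9
#define RADEON_VM_PTE_COUNT  (1 << RADEON_VM_BLOCK_SIZE)   /* 512 PTEs per PT */

/* mirrors struct radeon_pt from the patch: one entry per page table */
struct pt_entry {
        void     *bo;        /* only the first PT of a contiguous run owns the BO */
        uint64_t  gpu_addr;  /* GPU address of this PT inside the shared BO */
};

int main(void)
{
        /* assume the suballocator returned this base address for one BO
         * that is big enough for 4 contiguous page tables (made-up values) */
        uint64_t pt_base_gpu_addr = 0x100000;
        enum { NUM_CONTIGUOUS_PTS = 4 };
        struct pt_entry pts[NUM_CONTIGUOUS_PTS] = { { (void *)1, pt_base_gpu_addr } };
        int i;

        /* later PTs reuse the same BO; only gpu_addr advances,
         * by 8 bytes per PTE, i.e. RADEON_VM_PTE_COUNT * 8 per PT */
        for (i = 1; i < NUM_CONTIGUOUS_PTS; i++) {
                pts[i].bo = NULL;
                pts[i].gpu_addr = pt_base_gpu_addr +
                                  (uint64_t)i * RADEON_VM_PTE_COUNT * 8;
        }

        for (i = 0; i < NUM_CONTIGUOUS_PTS; i++)
                printf("PT %d -> gpu_addr 0x%llx\n", i,
                       (unsigned long long)pts[i].gpu_addr);
        return 0;
}

Only the first PT of each contiguous run owns the SA BO; the later entries just carry offsets into it, which is what radeon_suballoc_pts() in the patch sets up.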
 drivers/gpu/drm/radeon/radeon.h      |  12 ++
 drivers/gpu/drm/radeon/radeon_gart.c | 263 ++++++++++++++++++++++++++++++++--
 2 files changed, 260 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index b04c064..38d4eda 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -659,6 +659,15 @@ struct radeon_ring {
 /* number of entries in page table */
 #define RADEON_VM_PTE_COUNT (1 << RADEON_VM_BLOCK_SIZE)
 
+struct radeon_pt {
+        /* BO containing the page table */
+        /* radeon_sa_bo_gpu_addr(sa_bo); */
+        struct radeon_sa_bo     *bo;
+        /* GPU address of page table */
+        u64                     gpu_addr;
+};
+
 struct radeon_vm {
         struct list_head                list;
         struct list_head                va;
@@ -671,6 +680,9 @@ struct radeon_vm {
         struct radeon_fence             *fence;
         /* last flush or NULL if we still need to flush */
         struct radeon_fence             *last_flush;
+
+        /* page tables list */
+        struct radeon_pt                *vm_pts;
 };
 
 struct radeon_vm_manager {
diff --git a/drivers/gpu/drm/radeon/radeon_gart.c b/drivers/gpu/drm/radeon/radeon_gart.c
index 753b7ca..cea918d 100644
--- a/drivers/gpu/drm/radeon/radeon_gart.c
+++ b/drivers/gpu/drm/radeon/radeon_gart.c
@@ -500,6 +500,10 @@ static void radeon_vm_free_pt(struct radeon_device *rdev,
                               struct radeon_vm *vm)
 {
         struct radeon_bo_va *bo_va;
+        int i;
+        int driver_table_entries = (rdev->vm_manager.max_pfn >>
+                                    RADEON_VM_BLOCK_SIZE);
 
         if (!vm->sa_bo)
                 return;
@@ -510,6 +514,14 @@ static void radeon_vm_free_pt(struct radeon_device *rdev,
         list_for_each_entry(bo_va, &vm->va, vm_list) {
                 bo_va->valid = false;
         }
+
+        if (vm->vm_pts == NULL)
+                return;
+
+        for (i = 0; i < driver_table_entries; i++)
+                radeon_sa_bo_free(rdev, &vm->vm_pts[i].bo, vm->fence);
+
+        kfree(vm->vm_pts);
 }
 
 /**
@@ -563,6 +575,9 @@ int radeon_vm_alloc_pt(struct radeon_device *rdev, struct radeon_vm *vm)
         int r;
         u64 *pd_addr;
         int tables_size;
+        int driver_table_size = (rdev->vm_manager.max_pfn >>
+                                 RADEON_VM_BLOCK_SIZE) *
+                                sizeof(struct radeon_pt);
 
         if (vm == NULL) {
                 return -EINVAL;
@@ -570,7 +585,6 @@ int radeon_vm_alloc_pt(struct radeon_device *rdev, struct radeon_vm *vm)
 
         /* allocate enough to cover the current VM size */
         tables_size = RADEON_GPU_PAGE_ALIGN(radeon_vm_directory_size(rdev));
-        tables_size += RADEON_GPU_PAGE_ALIGN(vm->last_pfn * 8);
 
         if (vm->sa_bo != NULL) {
                 /* update lru */
@@ -600,6 +614,16 @@ retry:
         vm->pd_gpu_addr = radeon_sa_bo_gpu_addr(vm->sa_bo);
         memset(pd_addr, 0, tables_size);
 
+        vm->vm_pts = kmalloc(driver_table_size, GFP_KERNEL);
+        if (vm->vm_pts == NULL) {
+                DRM_ERROR("Cannot allocate space for driver page table\n");
+                radeon_sa_bo_free(rdev, &vm->sa_bo, vm->fence);
+                return -ENOMEM;
+        }
+
+        memset(vm->vm_pts, 0, driver_table_size);
+
         list_add_tail(&vm->list, &rdev->vm_manager.lru_vm);
         return radeon_vm_bo_update_pte(rdev, vm, rdev->ring_tmp_bo.bo,
                                        &rdev->ring_tmp_bo.bo->tbo.mem);
@@ -864,6 +888,69 @@ uint64_t radeon_vm_map_gart(struct radeon_device *rdev, uint64_t addr)
         return result;
 }
 
+/* setup @count pfns starting at @addr to PTEs starting at @pt_start and
+ * @pde_count pdes starting at @pd_start */
+static void radeon_vm_map_pfns(struct radeon_device *rdev,
+                        uint64_t pt_addr, uint64_t pt_offset,
+                        uint64_t addr, uint64_t pte_count,
+                        uint64_t pd_start, uint32_t pde_count, uint32_t flags)
+{
+        if (pde_count == 0 && pte_count == 0)
+                return;
+
+        radeon_asic_vm_set_page(rdev, pt_addr + pt_offset, addr,
+                                pte_count,
+                                RADEON_GPU_PAGE_SIZE, flags);
+
+        radeon_asic_vm_set_page(rdev, pd_start, pt_addr,
+                                pde_count,
+                                RADEON_VM_PTE_COUNT * 8, RADEON_VM_PAGE_VALID);
+}
+
+int radeon_suballoc_pts(struct radeon_device *rdev, struct radeon_vm *vm,
+                        uint64_t start_pt, uint32_t count)
+{
+        uint32_t i;
+        int r;
+        struct radeon_vm *vm_evict;
+        struct radeon_pt *pt = &vm->vm_pts[start_pt], *pti;
+
+retry:
+        r = radeon_sa_bo_new(rdev, &rdev->vm_manager.sa_manager,
+                             &pt->bo,
+                             RADEON_VM_PTE_COUNT * 8 * count,
+                             RADEON_GPU_PAGE_SIZE, false);
+
+        if (r == -ENOMEM) {
+                if (list_empty(&rdev->vm_manager.lru_vm)) {
+                        DRM_ERROR("cannot allocate driver page table"
+                                  "for vmid = %d", vm->id);
+                        return r;
+                }
+
+                vm_evict = list_first_entry(&rdev->vm_manager.lru_vm,
+                                            struct radeon_vm, list);
+                mutex_lock(&vm_evict->mutex);
+                radeon_vm_free_pt(rdev, vm_evict);
+                mutex_unlock(&vm_evict->mutex);
+
+                DRM_INFO("run out of SA memory for PT. Trying to free LRU vm id = %d\n",
+                         vm_evict->id);
+                goto retry;
+        }
+
+        pt->gpu_addr = radeon_sa_bo_gpu_addr(pt->bo);
+
+        for (i = 1; i < count; i++) {
+                pti = &vm->vm_pts[start_pt + i];
+                pti->bo = NULL;
+                pti->gpu_addr = pt->gpu_addr + i * RADEON_VM_PTE_COUNT * 8;
+        }
+
+        return 0;
+}
+
 /**
  * radeon_vm_bo_update_pte - map a bo into the vm page table
@@ -886,10 +973,18 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
         struct radeon_ring *ring = &rdev->ring[ridx];
         struct radeon_semaphore *sem = NULL;
         struct radeon_bo_va *bo_va;
-        unsigned nptes, npdes, ndw;
-        uint64_t pe, addr;
+        struct radeon_pt *pt;
+        unsigned nptes, npdes, ndw, count;
+        uint64_t addr;
         uint64_t pfn;
+        uint32_t pfns_to_pt_edge, pfns_to_end;
         int r;
+        uint64_t mem_pfn_offset;
+        uint64_t pfn_idx, last_pfn, pde_num, pte_num;
+        uint64_t pfn_map_start, pde_map_start, pte_map_start,
+                 pde_map_count, pte_map_count;
+        uint64_t prev_gpu_addr;
+        char need_alloc, need_map;
 
         /* nothing to do if vm isn't bound */
         if (vm->sa_bo == NULL)
@@ -971,22 +1066,159 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
                 radeon_fence_note_sync(vm->fence, ridx);
         }
 
-        /* update page table entries */
-        pe = vm->pd_gpu_addr;
-        pe += radeon_vm_directory_size(rdev);
-        pe += (bo_va->soffset / RADEON_GPU_PAGE_SIZE) * 8;
+        pfn_idx = pfn;
+        last_pfn = pfn_idx + nptes;
+
+        pfn_map_start = 0;
+        pde_map_start = pfn_idx / RADEON_VM_PTE_COUNT;
+        pte_map_start = pfn_idx % RADEON_VM_PTE_COUNT;
+        pde_map_count = 0;
+        pte_map_count = 0;
+
+        pte_num = pfn_idx % RADEON_VM_PTE_COUNT;
+        pde_num = pfn_idx / RADEON_VM_PTE_COUNT;
+        pt = &vm->vm_pts[pde_num];
+
+        prev_gpu_addr = 0;
+        need_alloc = 0;
+        need_map = 0;
+
+        for (mem_pfn_offset = 0; mem_pfn_offset < nptes;) {
+                pfns_to_end = last_pfn - pfn_idx;
+                pfns_to_pt_edge = RADEON_VM_PTE_COUNT -
+                        (pfn_idx % RADEON_VM_PTE_COUNT);
+
+                count = pfns_to_pt_edge < pfns_to_end ?
+                        pfns_to_pt_edge : pfns_to_end;
+
+                pde_num = pfn_idx / RADEON_VM_PTE_COUNT;
+                pte_num = pfn_idx % RADEON_VM_PTE_COUNT;
+                pt = &vm->vm_pts[pde_num];
+
+                if (pt->gpu_addr == 0 && prev_gpu_addr == -1) {
+                        /* case 1 */
+                        /* pt for current pfn_idx is unmapped */
+                        /* previous ptes are unmapped */
+                        need_alloc = 1;
+                        need_map = 1;
+                }
+                else if (pt->gpu_addr != 0 && prev_gpu_addr != -1) {
+                        /* case 4 */
+                        /* pt for current pfn_idx is mapped */
+                        /* previous ptes are mapped */
+                        if (pt->gpu_addr != prev_gpu_addr + RADEON_VM_PTE_COUNT * 8) {
+                                /* current pt is not contiguous with previous one */
+                                /* flush prior pts */
+                                radeon_vm_map_pfns(
+                                        rdev, vm->vm_pts[pde_map_start].gpu_addr,
+                                        pte_map_start * 8,
+                                        addr + pfn_map_start * RADEON_GPU_PAGE_SIZE,
+                                        pte_map_count,
+                                        vm->pd_gpu_addr + pde_map_start * 8,
+                                        pde_map_count,
+                                        bo_va->flags);
+
+                                pfn_map_start = mem_pfn_offset;
+                                pde_map_start = pfn_idx / RADEON_VM_PTE_COUNT;
+                                pte_map_start = pfn_idx % RADEON_VM_PTE_COUNT;
+
+                                pde_map_count = 0;
+                                pte_map_count = 0;
+                        }
+
+                        prev_gpu_addr = pt->gpu_addr;
+                        need_alloc = 0;
+                        need_map = 1;
+                }
+                else if (pt->gpu_addr == 0 && prev_gpu_addr != -1) {
+                        /* case 2 */
+                        /* pt for current pfn_idx is unmapped */
+                        /* previous ptes are mapped */
-        radeon_asic_vm_set_page(rdev, pe, addr, nptes,
-                                RADEON_GPU_PAGE_SIZE, bo_va->flags);
+                        /* setup prior pdes & ptes here */
-        /* update page directory entries */
-        addr = pe;
+                        radeon_vm_map_pfns(
+                                rdev, vm->vm_pts[pde_map_start].gpu_addr,
+                                pte_map_start * 8,
+                                addr + pfn_map_start * RADEON_GPU_PAGE_SIZE,
+                                pte_map_count,
+                                vm->pd_gpu_addr + pde_map_start * 8,
+                                pde_map_count,
+                                bo_va->flags);
-        pe = vm->pd_gpu_addr;
-        pe += ((bo_va->soffset / RADEON_GPU_PAGE_SIZE) >> RADEON_VM_BLOCK_SIZE) * 8;
+                        pfn_map_start = mem_pfn_offset;
+                        pde_map_start = pfn_idx / RADEON_VM_PTE_COUNT;
+                        pte_map_start = pfn_idx % RADEON_VM_PTE_COUNT;
-        radeon_asic_vm_set_page(rdev, pe, addr, npdes,
-                                RADEON_VM_PTE_COUNT * 8, RADEON_VM_PAGE_VALID);
+                        pde_map_count = 0;
+                        pte_map_count = 0;
+
+                        prev_gpu_addr = -1;
+                        need_alloc = 1;
+                        need_map = 1;
+                }
+                else if (pt->gpu_addr != 0 && prev_gpu_addr == -1) {
+                        /* case 3 */
+                        /* pt for current pfn_idx is mapped */
+                        /* previous ptes are unmapped */
+
+                        /* map prior pfns if there are any */
+                        if (pfn_map_start < mem_pfn_offset) {
+                                radeon_suballoc_pts(rdev, vm,
+                                                    pde_map_start, pde_map_count);
+                                radeon_vm_map_pfns(
+                                        rdev, vm->vm_pts[pde_map_start].gpu_addr,
+                                        pte_map_start * 8,
+                                        addr + pfn_map_start * RADEON_GPU_PAGE_SIZE,
+                                        pte_map_count,
+                                        vm->pd_gpu_addr + pde_map_start * 8,
+                                        pde_map_count,
+                                        bo_va->flags);
+                        }
+
+                        pfn_map_start = mem_pfn_offset;
+                        pde_map_start = pfn_idx / RADEON_VM_PTE_COUNT;
+                        pte_map_start = pfn_idx % RADEON_VM_PTE_COUNT;
+
+                        pde_map_count = 0;
+                        pte_map_count = 0;
+
+                        prev_gpu_addr = pt->gpu_addr;
+                        need_alloc = 0;
+                        need_map = 1;
+                }
+
+                pde_map_count++;
+                pte_map_count += count;
+
+                pfn_idx += count;
+                mem_pfn_offset += count;
+        }
+
+        if (need_alloc) {
+                radeon_suballoc_pts(rdev, vm, pde_map_start, pde_map_count);
+        }
+
+        if (need_map) {
+                if (vm->vm_pts[pde_map_start].gpu_addr == 0)
+                        DRM_ERROR("gpu_addr == 0. smth is wrong\n");
+
+                radeon_vm_map_pfns(
+                        rdev, vm->vm_pts[pde_map_start].gpu_addr,
+                        pte_map_start * 8,
+                        addr + pfn_map_start * RADEON_GPU_PAGE_SIZE,
+                        pte_map_count,
+                        vm->pd_gpu_addr + pde_map_start * 8,
+                        pde_map_count,
+                        bo_va->flags);
+        }
 
         radeon_fence_unref(&vm->fence);
         r = radeon_fence_emit(rdev, &vm->fence, ridx);
@@ -997,6 +1229,7 @@ int radeon_vm_bo_update_pte(struct radeon_device *rdev,
         radeon_ring_unlock_commit(rdev, ring);
         radeon_semaphore_free(rdev, &sem, vm->fence);
         radeon_fence_unref(&vm->last_flush);
+
         return 0;
 }
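For reference (again not part of the patch, and with made-up pfn numbers), a standalone sketch of the index arithmetic the update loop above relies on: with RADEON_VM_BLOCK_SIZE = 9 each page table holds 512 entries, so a pfn splits into a directory index (pde_num) and an in-table offset (pte_num), and each iteration maps at most up to the current PT boundary:

#include <stdint.h>
#include <stdio.h>

#define RADEON_VM_BLOCK_SIZE 9
#define RADEON_VM_PTE_COUNT  (1 << RADEON_VM_BLOCK_SIZE)  /* 512 */

int main(void)
{
        /* example mapping: 100 pages starting at pfn 1000 (made-up numbers) */
        uint64_t pfn_idx = 1000, last_pfn = pfn_idx + 100;

        while (pfn_idx < last_pfn) {
                uint64_t pfns_to_end = last_pfn - pfn_idx;
                uint64_t pfns_to_pt_edge = RADEON_VM_PTE_COUNT -
                                           (pfn_idx % RADEON_VM_PTE_COUNT);
                /* never map past the end of the current page table */
                uint64_t count = pfns_to_pt_edge < pfns_to_end ?
                                 pfns_to_pt_edge : pfns_to_end;

                uint64_t pde_num = pfn_idx / RADEON_VM_PTE_COUNT; /* which PT */
                uint64_t pte_num = pfn_idx % RADEON_VM_PTE_COUNT; /* offset in it */

                printf("pde %llu, pte %llu, count %llu\n",
                       (unsigned long long)pde_num,
                       (unsigned long long)pte_num,
                       (unsigned long long)count);

                pfn_idx += count;
        }
        return 0;
}

Running it shows the first chunk stopping after 24 pages, exactly at the 512-entry boundary, before the next PT takes over with the remaining 76 pages.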