From: Christian König christian.koenig@amd.com
This patch implements support for VRAM page table entry compression. PTE construction is enhanced to identify physically contiguous page ranges and mark them in the PTE fragment field. L1 TLB and L2 cache support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, significantly improving TLB utilization for VRAM allocations.
Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS on default settings at 1920x1200 resolution with vsync disabled.
See main comment in radeon_gart.c gives a technical description.
v2 (chk): rebased and simplified. v3 (chk): add missing hw setup v4 (chk): rebased on current drm-fixes-3.15
Signed-off-by: Jay Cornwall jay@jcornwall.me Signed-off-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/radeon/cik.c | 4 +- drivers/gpu/drm/radeon/ni.c | 2 + drivers/gpu/drm/radeon/radeon.h | 5 +++ drivers/gpu/drm/radeon/radeon_vm.c | 91 +++++++++++++++++++++++++++++++++++--- drivers/gpu/drm/radeon/si.c | 5 ++- 5 files changed, 98 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 38f3fcc..38da9f3 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -5395,6 +5395,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) WREG32(MC_VM_MX_L1_TLB_CNTL, (0xA << 7) | ENABLE_L1_TLB | + ENABLE_L1_FRAGMENT_PROCESSING | SYSTEM_ACCESS_MODE_NOT_IN_SYS | ENABLE_ADVANCED_DRIVER_MODEL | SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); @@ -5407,7 +5408,8 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) CONTEXT1_IDENTITY_ACCESS_MODE(1)); WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | - L2_CACHE_BIGK_FRAGMENT_SIZE(6)); + BANK_SELECT(4) | + L2_CACHE_BIGK_FRAGMENT_SIZE(4)); /* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index d246e04..5e8db9b 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -1228,12 +1228,14 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); /* Setup L2 cache */ WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | + ENABLE_L2_FRAGMENT_PROCESSING | ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | EFFECTIVE_L2_QUEUE_SIZE(7) | CONTEXT1_IDENTITY_ACCESS_MODE(1)); WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | + BANK_SELECT(6) | L2_CACHE_BIGK_FRAGMENT_SIZE(6)); /* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 6852861..e3d6be3 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -854,6 +854,11 @@ struct radeon_mec { #define R600_PTE_READABLE (1 << 5) #define R600_PTE_WRITEABLE (1 << 6)
+/* PTE (Page Table Entry) fragment field for different page sizes */ +#define R600_PTE_FRAG_4KB (0 << 7) +#define R600_PTE_FRAG_64KB (4 << 7) +#define R600_PTE_FRAG_256KB (6 << 7) + struct radeon_vm_pt { struct radeon_bo *bo; uint64_t addr; diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index 2aae6ce..6bf656e 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct radeon_device *rdev, }
/** + * radeon_vm_frag_ptes - add fragment information to PTEs + * + * @rdev: radeon_device pointer + * @ib: IB for the update + * @pe_start: first PTE to handle + * @pe_end: last PTE to handle + * @addr: addr those PTEs should point to + * @flags: hw mapping flags + * + * Global and local mutex must be locked! + */ +static void radeon_vm_frag_ptes(struct radeon_device *rdev, + struct radeon_ib *ib, + uint64_t pe_start, uint64_t pe_end, + uint64_t addr, uint32_t flags) +{ + /** + * The MC L1 TLB supports variable sized pages, based on a fragment + * field in the PTE. When this field is set to a non-zero value, page + * granularity is increased from 4KB to (1 << (12 + frag)). The PTE + * flags are considered valid for all PTEs within the fragment range + * and corresponding mappings are assumed to be physically contiguous. + * + * The L1 TLB can store a single PTE for the whole fragment, + * significantly increasing the space available for translation + * caching. This leads to large improvements in throughput when the + * TLB is under pressure. + * + * The L2 cache distributes small and large fragments into two + * asymmetric partitions. The large fragment cache is significantly + * larger. Thus, we try to use large fragments wherever possible. + * Userspace can support this by aligning virtual base address and + * allocation size to the fragment size. + */ + + /* NI is optimized for 256KB fragments, SI and newer for 64KB */ + uint64_t frag_flags = rdev->family == CHIP_CAYMAN ? + R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB; + uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80; + + uint64_t frag_start = ALIGN(pe_start, frag_align); + uint64_t frag_end = pe_end & ~(frag_align - 1); + + unsigned count; + + /* system pages are non continuously */ + if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) || + (frag_start >= frag_end)) { + + count = (pe_end - pe_start) / 8; + radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, + RADEON_GPU_PAGE_SIZE, flags); + return; + } + + /* handle the 4K area at the beginning */ + if (pe_start != frag_start) { + count = (frag_start - pe_start) / 8; + radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, + RADEON_GPU_PAGE_SIZE, flags); + addr += RADEON_GPU_PAGE_SIZE * count; + } + + /* handle the area in the middle */ + count = (frag_end - frag_start) / 8; + radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count, + RADEON_GPU_PAGE_SIZE, flags | frag_flags); + + /* handle the 4K area at the end */ + if (frag_end != pe_end) { + addr += RADEON_GPU_PAGE_SIZE * count; + count = (pe_end - frag_end) / 8; + radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count, + RADEON_GPU_PAGE_SIZE, flags); + } +} + +/** * radeon_vm_update_ptes - make sure that page tables are valid * * @rdev: radeon_device pointer @@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, if ((last_pte + 8 * count) != pte) {
if (count) { - radeon_asic_vm_set_page(rdev, ib, last_pte, - last_dst, count, - RADEON_GPU_PAGE_SIZE, - flags); + radeon_vm_frag_ptes(rdev, ib, last_pte, + last_pte + 8 * count, + last_dst, flags); }
count = nptes; @@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, }
if (count) { - radeon_asic_vm_set_page(rdev, ib, last_pte, - last_dst, count, - RADEON_GPU_PAGE_SIZE, flags); + radeon_vm_frag_ptes(rdev, ib, last_pte, + last_pte + 8 * count, + last_dst, flags); } }
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index 22a63c9..dece3be 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -4044,18 +4044,21 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) WREG32(MC_VM_MX_L1_TLB_CNTL, (0xA << 7) | ENABLE_L1_TLB | + ENABLE_L1_FRAGMENT_PROCESSING | SYSTEM_ACCESS_MODE_NOT_IN_SYS | ENABLE_ADVANCED_DRIVER_MODEL | SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); /* Setup L2 cache */ WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | + ENABLE_L2_FRAGMENT_PROCESSING | ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | EFFECTIVE_L2_QUEUE_SIZE(7) | CONTEXT1_IDENTITY_ACCESS_MODE(1)); WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | - L2_CACHE_BIGK_FRAGMENT_SIZE(0)); + BANK_SELECT(4) | + L2_CACHE_BIGK_FRAGMENT_SIZE(4)); /* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
From: Christian König christian.koenig@amd.com
This patch makes it possible to decide how many address bits are spend on the page directory vs the page tables.
Signed-off-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/radeon/cik.c | 1 + drivers/gpu/drm/radeon/cikd.h | 1 + drivers/gpu/drm/radeon/ni.c | 1 + drivers/gpu/drm/radeon/nid.h | 1 + drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_vm.c | 4 +++- drivers/gpu/drm/radeon/si.c | 1 + drivers/gpu/drm/radeon/sid.h | 1 + 8 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 38da9f3..f11a9ab 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -5445,6 +5445,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) | + PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT | diff --git a/drivers/gpu/drm/radeon/cikd.h b/drivers/gpu/drm/radeon/cikd.h index dd79263..ae88660 100644 --- a/drivers/gpu/drm/radeon/cikd.h +++ b/drivers/gpu/drm/radeon/cikd.h @@ -482,6 +482,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430 #define VM_CONTEXT1_CNTL2 0x1434 diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index 5e8db9b..1d3209f 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -1268,6 +1268,7 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) | + PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT | diff --git a/drivers/gpu/drm/radeon/nid.h b/drivers/gpu/drm/radeon/nid.h index d996033..2e12e4d 100644 --- a/drivers/gpu/drm/radeon/nid.h +++ b/drivers/gpu/drm/radeon/nid.h @@ -128,6 +128,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430 #define VM_CONTEXT1_CNTL2 0x1434 diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index e3d6be3..2d80cf2 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -838,7 +838,7 @@ struct radeon_mec { /* defines number of bits in page table versus page directory, * a page is 4KB so we have 12 bits offset, 9 bits in the page * table and the remaining 19 bits are in the page directory */ -#define RADEON_VM_BLOCK_SIZE 9 +#define RADEON_VM_BLOCK_SIZE 12
/* number of entries in page table */ #define RADEON_VM_PTE_COUNT (1 << RADEON_VM_BLOCK_SIZE) diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index 6bf656e..3900ecf 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -964,6 +964,8 @@ void radeon_vm_bo_invalidate(struct radeon_device *rdev, */ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) { + const unsigned align = min(RADEON_VM_PTB_ALIGN_SIZE, + RADEON_VM_PTE_COUNT * 8); unsigned pd_size, pd_entries, pts_size; int r;
@@ -985,7 +987,7 @@ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) return -ENOMEM; }
- r = radeon_bo_create(rdev, pd_size, RADEON_VM_PTB_ALIGN_SIZE, false, + r = radeon_bo_create(rdev, pd_size, align, false, RADEON_GEM_DOMAIN_VRAM, NULL, &vm->page_directory); if (r) diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index dece3be..8fa6be5 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -4095,6 +4095,7 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) | + PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT | diff --git a/drivers/gpu/drm/radeon/sid.h b/drivers/gpu/drm/radeon/sid.h index 683532f..da8f867 100644 --- a/drivers/gpu/drm/radeon/sid.h +++ b/drivers/gpu/drm/radeon/sid.h @@ -362,6 +362,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430 #define VM_CONTEXT1_CNTL2 0x1434
On Thu, May 01, 2014 at 06:52:33PM +0200, Christian König wrote:
From: Christian König christian.koenig@amd.com
This patch makes it possible to decide how many address bits are spend on the page directory vs the page tables.
Signed-off-by: Christian König christian.koenig@amd.com
NAK on the basis you are modifying default behavior and hardware programming in same patch see below.
drivers/gpu/drm/radeon/cik.c | 1 + drivers/gpu/drm/radeon/cikd.h | 1 + drivers/gpu/drm/radeon/ni.c | 1 + drivers/gpu/drm/radeon/nid.h | 1 + drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_vm.c | 4 +++- drivers/gpu/drm/radeon/si.c | 1 + drivers/gpu/drm/radeon/sid.h | 1 + 8 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 38da9f3..f11a9ab 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -5445,6 +5445,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) |
PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT |
diff --git a/drivers/gpu/drm/radeon/cikd.h b/drivers/gpu/drm/radeon/cikd.h index dd79263..ae88660 100644 --- a/drivers/gpu/drm/radeon/cikd.h +++ b/drivers/gpu/drm/radeon/cikd.h @@ -482,6 +482,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430 #define VM_CONTEXT1_CNTL2 0x1434 diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index 5e8db9b..1d3209f 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -1268,6 +1268,7 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) |
PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT |
diff --git a/drivers/gpu/drm/radeon/nid.h b/drivers/gpu/drm/radeon/nid.h index d996033..2e12e4d 100644 --- a/drivers/gpu/drm/radeon/nid.h +++ b/drivers/gpu/drm/radeon/nid.h @@ -128,6 +128,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430 #define VM_CONTEXT1_CNTL2 0x1434 diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index e3d6be3..2d80cf2 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -838,7 +838,7 @@ struct radeon_mec { /* defines number of bits in page table versus page directory,
- a page is 4KB so we have 12 bits offset, 9 bits in the page
- table and the remaining 19 bits are in the page directory */
-#define RADEON_VM_BLOCK_SIZE 9 +#define RADEON_VM_BLOCK_SIZE 12
I would first do a patch that add all the code to set this bit inside the vm context registers and then a patch that change the default current value explaining why.
This would allow bisection to either find that the register setting is guilty or that the default value change is the one introducing problem
But really i am being picky here.
Cheers, Jérôme
/* number of entries in page table */ #define RADEON_VM_PTE_COUNT (1 << RADEON_VM_BLOCK_SIZE) diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index 6bf656e..3900ecf 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -964,6 +964,8 @@ void radeon_vm_bo_invalidate(struct radeon_device *rdev, */ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) {
- const unsigned align = min(RADEON_VM_PTB_ALIGN_SIZE,
unsigned pd_size, pd_entries, pts_size; int r;RADEON_VM_PTE_COUNT * 8);
@@ -985,7 +987,7 @@ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) return -ENOMEM; }
- r = radeon_bo_create(rdev, pd_size, RADEON_VM_PTB_ALIGN_SIZE, false,
- r = radeon_bo_create(rdev, pd_size, align, false, RADEON_GEM_DOMAIN_VRAM, NULL, &vm->page_directory); if (r)
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index dece3be..8fa6be5 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -4095,6 +4095,7 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) |
PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT |
diff --git a/drivers/gpu/drm/radeon/sid.h b/drivers/gpu/drm/radeon/sid.h index 683532f..da8f867 100644 --- a/drivers/gpu/drm/radeon/sid.h +++ b/drivers/gpu/drm/radeon/sid.h @@ -362,6 +362,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430
#define VM_CONTEXT1_CNTL2 0x1434
1.9.1
dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
/* defines number of bits in page table versus page directory,
- a page is 4KB so we have 12 bits offset, 9 bits in the page
- table and the remaining 19 bits are in the page directory */
-#define RADEON_VM_BLOCK_SIZE 9 +#define RADEON_VM_BLOCK_SIZE 12 I would first do a patch that add all the code to set this bit inside the vm context registers and then a patch that change the default current value explaining why.
This would allow bisection to either find that the register setting is guilty or that the default value change is the one introducing problem
But really i am being picky here.
No, you're right. That change was unintentional, otherwise I would have updated the comment above as well.
Christian.
Am 01.05.2014 19:17, schrieb Jerome Glisse:
On Thu, May 01, 2014 at 06:52:33PM +0200, Christian König wrote:
From: Christian König christian.koenig@amd.com
This patch makes it possible to decide how many address bits are spend on the page directory vs the page tables.
Signed-off-by: Christian König christian.koenig@amd.com
NAK on the basis you are modifying default behavior and hardware programming in same patch see below.
drivers/gpu/drm/radeon/cik.c | 1 + drivers/gpu/drm/radeon/cikd.h | 1 + drivers/gpu/drm/radeon/ni.c | 1 + drivers/gpu/drm/radeon/nid.h | 1 + drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_vm.c | 4 +++- drivers/gpu/drm/radeon/si.c | 1 + drivers/gpu/drm/radeon/sid.h | 1 + 8 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 38da9f3..f11a9ab 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -5445,6 +5445,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) |
PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT |
diff --git a/drivers/gpu/drm/radeon/cikd.h b/drivers/gpu/drm/radeon/cikd.h index dd79263..ae88660 100644 --- a/drivers/gpu/drm/radeon/cikd.h +++ b/drivers/gpu/drm/radeon/cikd.h @@ -482,6 +482,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430 #define VM_CONTEXT1_CNTL2 0x1434 diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index 5e8db9b..1d3209f 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -1268,6 +1268,7 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) |
PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT |
diff --git a/drivers/gpu/drm/radeon/nid.h b/drivers/gpu/drm/radeon/nid.h index d996033..2e12e4d 100644 --- a/drivers/gpu/drm/radeon/nid.h +++ b/drivers/gpu/drm/radeon/nid.h @@ -128,6 +128,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430 #define VM_CONTEXT1_CNTL2 0x1434 diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index e3d6be3..2d80cf2 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -838,7 +838,7 @@ struct radeon_mec { /* defines number of bits in page table versus page directory,
- a page is 4KB so we have 12 bits offset, 9 bits in the page
- table and the remaining 19 bits are in the page directory */
-#define RADEON_VM_BLOCK_SIZE 9 +#define RADEON_VM_BLOCK_SIZE 12
I would first do a patch that add all the code to set this bit inside the vm context registers and then a patch that change the default current value explaining why.
This would allow bisection to either find that the register setting is guilty or that the default value change is the one introducing problem
But really i am being picky here.
Cheers, Jérôme
/* number of entries in page table */ #define RADEON_VM_PTE_COUNT (1 << RADEON_VM_BLOCK_SIZE) diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index 6bf656e..3900ecf 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -964,6 +964,8 @@ void radeon_vm_bo_invalidate(struct radeon_device *rdev, */ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) {
- const unsigned align = min(RADEON_VM_PTB_ALIGN_SIZE,
unsigned pd_size, pd_entries, pts_size; int r;RADEON_VM_PTE_COUNT * 8);
@@ -985,7 +987,7 @@ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) return -ENOMEM; }
- r = radeon_bo_create(rdev, pd_size, RADEON_VM_PTB_ALIGN_SIZE, false,
- r = radeon_bo_create(rdev, pd_size, align, false, RADEON_GEM_DOMAIN_VRAM, NULL, &vm->page_directory); if (r)
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index dece3be..8fa6be5 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -4095,6 +4095,7 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) (u32)(rdev->dummy_page.addr >> 12)); WREG32(VM_CONTEXT1_CNTL2, 4); WREG32(VM_CONTEXT1_CNTL, ENABLE_CONTEXT | PAGE_TABLE_DEPTH(1) |
PAGE_TABLE_BLOCK_SIZE(RADEON_VM_BLOCK_SIZE - 9) | RANGE_PROTECTION_FAULT_ENABLE_INTERRUPT | RANGE_PROTECTION_FAULT_ENABLE_DEFAULT | DUMMY_PAGE_PROTECTION_FAULT_ENABLE_INTERRUPT |
diff --git a/drivers/gpu/drm/radeon/sid.h b/drivers/gpu/drm/radeon/sid.h index 683532f..da8f867 100644 --- a/drivers/gpu/drm/radeon/sid.h +++ b/drivers/gpu/drm/radeon/sid.h @@ -362,6 +362,7 @@ #define READ_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 16) #define WRITE_PROTECTION_FAULT_ENABLE_INTERRUPT (1 << 18) #define WRITE_PROTECTION_FAULT_ENABLE_DEFAULT (1 << 19) +#define PAGE_TABLE_BLOCK_SIZE(x) (((x) & 0xF) << 24) #define VM_CONTEXT1_CNTL 0x1414 #define VM_CONTEXT0_CNTL2 0x1430
#define VM_CONTEXT1_CNTL2 0x1434
1.9.1
dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
On 2014-05-01 11:52, Christian König wrote:
Some minor comment fixes inline. I've been using v3 of this patch on SI for quite a while, with no visible failures.
Thanks for pushing this.
From: Christian König christian.koenig@amd.com
This patch implements support for VRAM page table entry compression. PTE construction is enhanced to identify physically contiguous page ranges and mark them in the PTE fragment field. L1 TLB and L2 cache
^^^^^^^^^^^^^^^^^^^ This should read L1/L2 TLB. HW spec refers to the L2 TLB as the VM L2 "cache", which confused the draft comments.
support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, significantly improving TLB utilization for VRAM allocations.
Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS on default settings at 1920x1200 resolution with vsync disabled.
See main comment in radeon_gart.c gives a technical description.
v2 (chk): rebased and simplified. v3 (chk): add missing hw setup v4 (chk): rebased on current drm-fixes-3.15
Signed-off-by: Jay Cornwall jay@jcornwall.me Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/radeon/cik.c | 4 +- drivers/gpu/drm/radeon/ni.c | 2 + drivers/gpu/drm/radeon/radeon.h | 5 +++ drivers/gpu/drm/radeon/radeon_vm.c | 91 +++++++++++++++++++++++++++++++++++--- drivers/gpu/drm/radeon/si.c | 5 ++- 5 files changed, 98 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 38f3fcc..38da9f3 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -5395,6 +5395,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) WREG32(MC_VM_MX_L1_TLB_CNTL, (0xA << 7) | ENABLE_L1_TLB |
ENABLE_L1_FRAGMENT_PROCESSING | SYSTEM_ACCESS_MODE_NOT_IN_SYS | ENABLE_ADVANCED_DRIVER_MODEL | SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
@@ -5407,7 +5408,8 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) CONTEXT1_IDENTITY_ACCESS_MODE(1)); WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
L2_CACHE_BIGK_FRAGMENT_SIZE(6));
BANK_SELECT(4) |
/* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);L2_CACHE_BIGK_FRAGMENT_SIZE(4));
diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index d246e04..5e8db9b 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -1228,12 +1228,14 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); /* Setup L2 cache */ WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |ENABLE_L2_FRAGMENT_PROCESSING | ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | EFFECTIVE_L2_QUEUE_SIZE(7) | CONTEXT1_IDENTITY_ACCESS_MODE(1));
/* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);BANK_SELECT(6) | L2_CACHE_BIGK_FRAGMENT_SIZE(6));
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 6852861..e3d6be3 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -854,6 +854,11 @@ struct radeon_mec { #define R600_PTE_READABLE (1 << 5) #define R600_PTE_WRITEABLE (1 << 6)
+/* PTE (Page Table Entry) fragment field for different page sizes */ +#define R600_PTE_FRAG_4KB (0 << 7) +#define R600_PTE_FRAG_64KB (4 << 7) +#define R600_PTE_FRAG_256KB (6 << 7)
struct radeon_vm_pt { struct radeon_bo *bo; uint64_t addr; diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index 2aae6ce..6bf656e 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct radeon_device *rdev, }
/**
- radeon_vm_frag_ptes - add fragment information to PTEs
- @rdev: radeon_device pointer
- @ib: IB for the update
- @pe_start: first PTE to handle
- @pe_end: last PTE to handle
- @addr: addr those PTEs should point to
- @flags: hw mapping flags
- Global and local mutex must be locked!
- */
+static void radeon_vm_frag_ptes(struct radeon_device *rdev,
struct radeon_ib *ib,
uint64_t pe_start, uint64_t pe_end,
uint64_t addr, uint32_t flags)
+{
- /**
* The MC L1 TLB supports variable sized pages, based on a fragment
* field in the PTE. When this field is set to a non-zero value, page
* granularity is increased from 4KB to (1 << (12 + frag)). The PTE
* flags are considered valid for all PTEs within the fragment range
* and corresponding mappings are assumed to be physically
contiguous.
*
* The L1 TLB can store a single PTE for the whole fragment,
* significantly increasing the space available for translation
* caching. This leads to large improvements in throughput when the
* TLB is under pressure.
*
* The L2 cache distributes small and large fragments into two
^^^^^ Again, L2 TLB.
* asymmetric partitions. The large fragment cache is significantly
* larger. Thus, we try to use large fragments wherever possible.
* Userspace can support this by aligning virtual base address and
* allocation size to the fragment size.
*/
- /* NI is optimized for 256KB fragments, SI and newer for 64KB */
- uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
- uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
- uint64_t frag_start = ALIGN(pe_start, frag_align);
- uint64_t frag_end = pe_end & ~(frag_align - 1);
- unsigned count;
- /* system pages are non continuously */
- if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
(frag_start >= frag_end)) {
count = (pe_end - pe_start) / 8;
radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
RADEON_GPU_PAGE_SIZE, flags);
return;
- }
- /* handle the 4K area at the beginning */
- if (pe_start != frag_start) {
count = (frag_start - pe_start) / 8;
radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
RADEON_GPU_PAGE_SIZE, flags);
addr += RADEON_GPU_PAGE_SIZE * count;
- }
- /* handle the area in the middle */
- count = (frag_end - frag_start) / 8;
- radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
RADEON_GPU_PAGE_SIZE, flags | frag_flags);
- /* handle the 4K area at the end */
- if (frag_end != pe_end) {
addr += RADEON_GPU_PAGE_SIZE * count;
count = (pe_end - frag_end) / 8;
radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
RADEON_GPU_PAGE_SIZE, flags);
- }
+}
+/**
- radeon_vm_update_ptes - make sure that page tables are valid
- @rdev: radeon_device pointer
@@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, if ((last_pte + 8 * count) != pte) {
if (count) {
radeon_asic_vm_set_page(rdev, ib, last_pte,
last_dst, count,
RADEON_GPU_PAGE_SIZE,
flags);
radeon_vm_frag_ptes(rdev, ib, last_pte,
last_pte + 8 * count,
last_dst, flags); } count = nptes;
@@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, }
if (count) {
radeon_asic_vm_set_page(rdev, ib, last_pte,
last_dst, count,
RADEON_GPU_PAGE_SIZE, flags);
radeon_vm_frag_ptes(rdev, ib, last_pte,
last_pte + 8 * count,
}last_dst, flags);
}
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index 22a63c9..dece3be 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -4044,18 +4044,21 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) WREG32(MC_VM_MX_L1_TLB_CNTL, (0xA << 7) | ENABLE_L1_TLB |
/* Setup L2 cache */ WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |ENABLE_L1_FRAGMENT_PROCESSING | SYSTEM_ACCESS_MODE_NOT_IN_SYS | ENABLE_ADVANCED_DRIVER_MODEL | SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |ENABLE_L2_FRAGMENT_PROCESSING | ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | EFFECTIVE_L2_QUEUE_SIZE(7) | CONTEXT1_IDENTITY_ACCESS_MODE(1));
L2_CACHE_BIGK_FRAGMENT_SIZE(0));
BANK_SELECT(4) |
/* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);L2_CACHE_BIGK_FRAGMENT_SIZE(4));
Am 01.05.2014 19:29, schrieb Jay Cornwall:
On 2014-05-01 11:52, Christian König wrote:
Some minor comment fixes inline. I've been using v3 of this patch on SI for quite a while, with no visible failures.
Thanks for the notes. I've added them to my v5 of the patch and also updated the file name in the commit message (we have moved that stuff to radeon_vm.c in the meantime).
Christian.
Thanks for pushing this.
From: Christian König christian.koenig@amd.com
This patch implements support for VRAM page table entry compression. PTE construction is enhanced to identify physically contiguous page ranges and mark them in the PTE fragment field. L1 TLB and L2 cache
^^^^^^^^^^^^^^^^^^^ This should read L1/L2 TLB. HW spec refers to the L2 TLB as the VM L2 "cache", which confused the draft comments.
support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, significantly improving TLB utilization for VRAM allocations.
Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS on default settings at 1920x1200 resolution with vsync disabled.
See main comment in radeon_gart.c gives a technical description.
v2 (chk): rebased and simplified. v3 (chk): add missing hw setup v4 (chk): rebased on current drm-fixes-3.15
Signed-off-by: Jay Cornwall jay@jcornwall.me Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/radeon/cik.c | 4 +- drivers/gpu/drm/radeon/ni.c | 2 + drivers/gpu/drm/radeon/radeon.h | 5 +++ drivers/gpu/drm/radeon/radeon_vm.c | 91 +++++++++++++++++++++++++++++++++++--- drivers/gpu/drm/radeon/si.c | 5 ++- 5 files changed, 98 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c index 38f3fcc..38da9f3 100644 --- a/drivers/gpu/drm/radeon/cik.c +++ b/drivers/gpu/drm/radeon/cik.c @@ -5395,6 +5395,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) WREG32(MC_VM_MX_L1_TLB_CNTL, (0xA << 7) | ENABLE_L1_TLB |
ENABLE_L1_FRAGMENT_PROCESSING | SYSTEM_ACCESS_MODE_NOT_IN_SYS | ENABLE_ADVANCED_DRIVER_MODEL | SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
@@ -5407,7 +5408,8 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) CONTEXT1_IDENTITY_ACCESS_MODE(1)); WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
L2_CACHE_BIGK_FRAGMENT_SIZE(6));
BANK_SELECT(4) |
/* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >>L2_CACHE_BIGK_FRAGMENT_SIZE(4));
12); WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index d246e04..5e8db9b 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -1228,12 +1228,14 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); /* Setup L2 cache */ WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |ENABLE_L2_FRAGMENT_PROCESSING | ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | EFFECTIVE_L2_QUEUE_SIZE(7) | CONTEXT1_IDENTITY_ACCESS_MODE(1));
/* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >>BANK_SELECT(6) | L2_CACHE_BIGK_FRAGMENT_SIZE(6));
12); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 6852861..e3d6be3 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -854,6 +854,11 @@ struct radeon_mec { #define R600_PTE_READABLE (1 << 5) #define R600_PTE_WRITEABLE (1 << 6)
+/* PTE (Page Table Entry) fragment field for different page sizes */ +#define R600_PTE_FRAG_4KB (0 << 7) +#define R600_PTE_FRAG_64KB (4 << 7) +#define R600_PTE_FRAG_256KB (6 << 7)
struct radeon_vm_pt { struct radeon_bo *bo; uint64_t addr; diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index 2aae6ce..6bf656e 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct radeon_device *rdev, }
/**
- radeon_vm_frag_ptes - add fragment information to PTEs
- @rdev: radeon_device pointer
- @ib: IB for the update
- @pe_start: first PTE to handle
- @pe_end: last PTE to handle
- @addr: addr those PTEs should point to
- @flags: hw mapping flags
- Global and local mutex must be locked!
- */
+static void radeon_vm_frag_ptes(struct radeon_device *rdev,
struct radeon_ib *ib,
uint64_t pe_start, uint64_t pe_end,
uint64_t addr, uint32_t flags)
+{
- /**
* The MC L1 TLB supports variable sized pages, based on a fragment
* field in the PTE. When this field is set to a non-zero value,
page
* granularity is increased from 4KB to (1 << (12 + frag)). The PTE
* flags are considered valid for all PTEs within the fragment
range
* and corresponding mappings are assumed to be physically
contiguous.
*
* The L1 TLB can store a single PTE for the whole fragment,
* significantly increasing the space available for translation
* caching. This leads to large improvements in throughput when the
* TLB is under pressure.
*
* The L2 cache distributes small and large fragments into two
^^^^^
Again, L2 TLB.
* asymmetric partitions. The large fragment cache is significantly
* larger. Thus, we try to use large fragments wherever possible.
* Userspace can support this by aligning virtual base address and
* allocation size to the fragment size.
*/
- /* NI is optimized for 256KB fragments, SI and newer for 64KB */
- uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
- uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
- uint64_t frag_start = ALIGN(pe_start, frag_align);
- uint64_t frag_end = pe_end & ~(frag_align - 1);
- unsigned count;
- /* system pages are non continuously */
- if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
(frag_start >= frag_end)) {
count = (pe_end - pe_start) / 8;
radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
RADEON_GPU_PAGE_SIZE, flags);
return;
- }
- /* handle the 4K area at the beginning */
- if (pe_start != frag_start) {
count = (frag_start - pe_start) / 8;
radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
RADEON_GPU_PAGE_SIZE, flags);
addr += RADEON_GPU_PAGE_SIZE * count;
- }
- /* handle the area in the middle */
- count = (frag_end - frag_start) / 8;
- radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
RADEON_GPU_PAGE_SIZE, flags | frag_flags);
- /* handle the 4K area at the end */
- if (frag_end != pe_end) {
addr += RADEON_GPU_PAGE_SIZE * count;
count = (pe_end - frag_end) / 8;
radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
RADEON_GPU_PAGE_SIZE, flags);
- }
+}
+/**
- radeon_vm_update_ptes - make sure that page tables are valid
- @rdev: radeon_device pointer
@@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, if ((last_pte + 8 * count) != pte) {
if (count) {
radeon_asic_vm_set_page(rdev, ib, last_pte,
last_dst, count,
RADEON_GPU_PAGE_SIZE,
flags);
radeon_vm_frag_ptes(rdev, ib, last_pte,
last_pte + 8 * count,
last_dst, flags); } count = nptes;
@@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, }
if (count) {
radeon_asic_vm_set_page(rdev, ib, last_pte,
last_dst, count,
RADEON_GPU_PAGE_SIZE, flags);
radeon_vm_frag_ptes(rdev, ib, last_pte,
last_pte + 8 * count,
}last_dst, flags);
}
diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c index 22a63c9..dece3be 100644 --- a/drivers/gpu/drm/radeon/si.c +++ b/drivers/gpu/drm/radeon/si.c @@ -4044,18 +4044,21 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) WREG32(MC_VM_MX_L1_TLB_CNTL, (0xA << 7) | ENABLE_L1_TLB |
/* Setup L2 cache */ WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |ENABLE_L1_FRAGMENT_PROCESSING | SYSTEM_ACCESS_MODE_NOT_IN_SYS | ENABLE_ADVANCED_DRIVER_MODEL | SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |ENABLE_L2_FRAGMENT_PROCESSING | ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | EFFECTIVE_L2_QUEUE_SIZE(7) | CONTEXT1_IDENTITY_ACCESS_MODE(1));
L2_CACHE_BIGK_FRAGMENT_SIZE(0));
BANK_SELECT(4) |
/* setup context0 */ WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >>L2_CACHE_BIGK_FRAGMENT_SIZE(4));
12); WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
dri-devel@lists.freedesktop.org