While updating "[PATCH 5/6] drm/radeon: validate relocations in the order determined by userspace" based on feedback, which should have been a harmless change, I discovered that the performance dropped. The problem was that list_add/move from one list to another reversed the list of relocations and the only thing which restored performance was to sort the relocation list with a stable sort. (preserve ordering of buffers whose priorities are equal). The only explanation I've come up with is that the buffers which appear sooner in the list are likely to be used more often by an IB than buffers which appear later in the list. For example, a texture used in all draw commands will most probably be somewhere at the beginning. However, a texture which appears at the end is probably only used in a few last draw commands.
Also, the stats are changed to atomic64_t as discusssed before.
Marek
From: Marek Olšák marek.olsak@amd.com
When passing buffers between processes, the receiving process needs to know the original buffer domain, so that it doesn't accidentally move the buffer.
v2: reserve the buffer
Signed-off-by: Marek Olšák marek.olsak@amd.com --- drivers/gpu/drm/radeon/radeon.h | 3 +++ drivers/gpu/drm/radeon/radeon_drv.c | 3 ++- drivers/gpu/drm/radeon/radeon_gem.c | 36 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_kms.c | 1 + drivers/gpu/drm/radeon/radeon_object.c | 3 +++ include/uapi/drm/radeon_drm.h | 12 ++++++++++++ 6 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index a415f8e..3f10782 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -454,6 +454,7 @@ struct radeon_bo { /* Protected by gem.mutex */ struct list_head list; /* Protected by tbo.reserved */ + u32 initial_domain; u32 placements[3]; struct ttm_placement placement; struct ttm_buffer_object tbo; @@ -2114,6 +2115,8 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); int radeon_gem_va_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int radeon_gem_op_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp); int radeon_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index 84a1bbb7..4392b7c 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -79,9 +79,10 @@ * 2.35.0 - Add CIK macrotile mode array query * 2.36.0 - Fix CIK DCE tiling setup * 2.37.0 - allow GS ring setup on r6xx/r7xx + * 2.38.0 - RADEON_GEM_OP (GET_INITIAL_DOMAIN, SET_INITIAL_DOMAIN) */ #define KMS_DRIVER_MAJOR 2 -#define KMS_DRIVER_MINOR 37 +#define KMS_DRIVER_MINOR 38 #define KMS_DRIVER_PATCHLEVEL 0 int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags); int radeon_driver_unload_kms(struct drm_device *dev); diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index b96c819..9863ca7 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -533,6 +533,42 @@ out: return r; }
+int radeon_gem_op_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + struct drm_radeon_gem_op *args = data; + struct drm_gem_object *gobj; + struct radeon_bo *robj; + int r; + + gobj = drm_gem_object_lookup(dev, filp, args->handle); + if (gobj == NULL) { + return -ENOENT; + } + robj = gem_to_radeon_bo(gobj); + r = radeon_bo_reserve(robj, false); + if (unlikely(r)) + goto out; + + switch (args->op) { + case RADEON_GEM_OP_GET_INITIAL_DOMAIN: + args->value = robj->initial_domain; + break; + case RADEON_GEM_OP_SET_INITIAL_DOMAIN: + robj->initial_domain = args->value & (RADEON_GEM_DOMAIN_VRAM | + RADEON_GEM_DOMAIN_GTT | + RADEON_GEM_DOMAIN_CPU); + break; + default: + r = -EINVAL; + } + + radeon_bo_unreserve(robj); +out: + drm_gem_object_unreference_unlocked(gobj); + return r; +} + int radeon_mode_dumb_create(struct drm_file *file_priv, struct drm_device *dev, struct drm_mode_create_dumb *args) diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index baff98b..0b631eb 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -814,5 +814,6 @@ const struct drm_ioctl_desc radeon_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(RADEON_GEM_GET_TILING, radeon_gem_get_tiling_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(RADEON_GEM_OP, radeon_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), }; int radeon_max_kms_ioctl = DRM_ARRAY_SIZE(radeon_ioctls_kms); diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 08595cf..dd12bb4 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -145,6 +145,9 @@ int radeon_bo_create(struct radeon_device *rdev, bo->surface_reg = -1; INIT_LIST_HEAD(&bo->list); INIT_LIST_HEAD(&bo->va); + bo->initial_domain = domain & (RADEON_GEM_DOMAIN_VRAM | + RADEON_GEM_DOMAIN_GTT | + RADEON_GEM_DOMAIN_CPU); radeon_ttm_placement_from_domain(bo, domain); /* Kernel allocation are uninterruptible */ down_read(&rdev->pm.mclk_lock); diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 1cf18b4..cb5c93a 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -510,6 +510,7 @@ typedef struct { #define DRM_RADEON_GEM_GET_TILING 0x29 #define DRM_RADEON_GEM_BUSY 0x2a #define DRM_RADEON_GEM_VA 0x2b +#define DRM_RADEON_GEM_OP 0x2c
#define DRM_IOCTL_RADEON_CP_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t) #define DRM_IOCTL_RADEON_CP_START DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_CP_START) @@ -552,6 +553,7 @@ typedef struct { #define DRM_IOCTL_RADEON_GEM_GET_TILING DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_GET_TILING, struct drm_radeon_gem_get_tiling) #define DRM_IOCTL_RADEON_GEM_BUSY DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy) #define DRM_IOCTL_RADEON_GEM_VA DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va) +#define DRM_IOCTL_RADEON_GEM_OP DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_OP, struct drm_radeon_gem_op)
typedef struct drm_radeon_init { enum { @@ -884,6 +886,16 @@ struct drm_radeon_gem_pwrite { uint64_t data_ptr; };
+/* Sets or returns a value associated with a buffer. */ +struct drm_radeon_gem_op { + uint32_t handle; /* buffer */ + uint32_t op; /* RADEON_GEM_OP_* */ + uint64_t value; /* input or return value */ +}; + +#define RADEON_GEM_OP_GET_INITIAL_DOMAIN 0 +#define RADEON_GEM_OP_SET_INITIAL_DOMAIN 1 + #define RADEON_VA_MAP 1 #define RADEON_VA_UNMAP 2
From: Marek Olšák marek.olsak@amd.com
The statistics are: - VRAM usage in bytes - GTT usage in bytes - number of bytes moved by TTM
The last one is actually a counter, so you need to sample it before and after command submission and take the difference.
This is useful for finding performance bottlenecks. Userspace queries are also added.
v2: use atomic64_t
Signed-off-by: Marek Olšák marek.olsak@amd.com --- drivers/gpu/drm/radeon/radeon.h | 4 ++++ drivers/gpu/drm/radeon/radeon_kms.c | 15 ++++++++++++++ drivers/gpu/drm/radeon/radeon_object.c | 36 +++++++++++++++++++++++++++++++++- drivers/gpu/drm/radeon/radeon_object.h | 2 +- drivers/gpu/drm/radeon/radeon_ttm.c | 8 +++++++- include/uapi/drm/radeon_drm.h | 3 +++ 6 files changed, 65 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 3f10782..df28613 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -2307,6 +2307,10 @@ struct radeon_device { /* virtual memory */ struct radeon_vm_manager vm_manager; struct mutex gpu_clock_mutex; + /* memory stats */ + atomic64_t vram_usage; + atomic64_t gtt_usage; + atomic64_t num_bytes_moved; /* ACPI interface */ struct radeon_atif atif; struct radeon_atcs atcs; diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 0b631eb..806506c 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -486,6 +486,21 @@ static int radeon_info_ioctl(struct drm_device *dev, void *data, struct drm_file case RADEON_INFO_VCE_FB_VERSION: *value = rdev->vce.fb_version; break; + case RADEON_INFO_NUM_BYTES_MOVED: + value = (uint32_t*)&value64; + value_size = sizeof(uint64_t); + value64 = atomic64_read(&rdev->num_bytes_moved); + break; + case RADEON_INFO_VRAM_USAGE: + value = (uint32_t*)&value64; + value_size = sizeof(uint64_t); + value64 = atomic64_read(&rdev->vram_usage); + break; + case RADEON_INFO_GTT_USAGE: + value = (uint32_t*)&value64; + value_size = sizeof(uint64_t); + value64 = atomic64_read(&rdev->gtt_usage); + break; default: DRM_DEBUG_KMS("Invalid request %d\n", info->request); return -EINVAL; diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index dd12bb4..282d6a2 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -56,11 +56,36 @@ static void radeon_bo_clear_va(struct radeon_bo *bo) } }
+static void radeon_update_memory_usage(struct radeon_bo *bo, + unsigned mem_type, int sign) +{ + struct radeon_device *rdev = bo->rdev; + u64 size = (u64)bo->tbo.num_pages << PAGE_SHIFT; + + switch (mem_type) { + case TTM_PL_TT: + if (sign > 0) + atomic64_add(size, &rdev->gtt_usage); + else + atomic64_sub(size, &rdev->gtt_usage); + break; + case TTM_PL_VRAM: + if (sign > 0) + atomic64_add(size, &rdev->vram_usage); + else + atomic64_sub(size, &rdev->vram_usage); + break; + } +} + static void radeon_ttm_bo_destroy(struct ttm_buffer_object *tbo) { struct radeon_bo *bo;
bo = container_of(tbo, struct radeon_bo, tbo); + + radeon_update_memory_usage(bo, bo->tbo.mem.mem_type, -1); + mutex_lock(&bo->rdev->gem.mutex); list_del_init(&bo->list); mutex_unlock(&bo->rdev->gem.mutex); @@ -567,14 +592,23 @@ int radeon_bo_check_tiling(struct radeon_bo *bo, bool has_moved, }
void radeon_bo_move_notify(struct ttm_buffer_object *bo, - struct ttm_mem_reg *mem) + struct ttm_mem_reg *new_mem) { struct radeon_bo *rbo; + if (!radeon_ttm_bo_is_radeon_bo(bo)) return; + rbo = container_of(bo, struct radeon_bo, tbo); radeon_bo_check_tiling(rbo, 0, 1); radeon_vm_bo_invalidate(rbo->rdev, rbo); + + /* update statistics */ + if (!new_mem) + return; + + radeon_update_memory_usage(rbo, bo->mem.mem_type, -1); + radeon_update_memory_usage(rbo, new_mem->mem_type, 1); }
int radeon_bo_fault_reserve_notify(struct ttm_buffer_object *bo) diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h index 209b111..a9a8c11 100644 --- a/drivers/gpu/drm/radeon/radeon_object.h +++ b/drivers/gpu/drm/radeon/radeon_object.h @@ -151,7 +151,7 @@ extern void radeon_bo_get_tiling_flags(struct radeon_bo *bo, extern int radeon_bo_check_tiling(struct radeon_bo *bo, bool has_moved, bool force_drop); extern void radeon_bo_move_notify(struct ttm_buffer_object *bo, - struct ttm_mem_reg *mem); + struct ttm_mem_reg *new_mem); extern int radeon_bo_fault_reserve_notify(struct ttm_buffer_object *bo); extern int radeon_bo_get_surface_reg(struct radeon_bo *bo);
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 77f5b0c..60dfce8 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -406,8 +406,14 @@ static int radeon_bo_move(struct ttm_buffer_object *bo, if (r) { memcpy: r = ttm_bo_move_memcpy(bo, evict, no_wait_gpu, new_mem); + if (r) { + return r; + } } - return r; + + /* update statistics */ + atomic64_add((u64)bo->num_pages << PAGE_SHIFT, &rdev->num_bytes_moved); + return 0; }
static int radeon_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *mem) diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index cb5c93a..aefa2f6 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -1004,6 +1004,9 @@ struct drm_radeon_cs { #define RADEON_INFO_VCE_FW_VERSION 0x1b /* version of VCE feedback */ #define RADEON_INFO_VCE_FB_VERSION 0x1c +#define RADEON_INFO_NUM_BYTES_MOVED 0x1d +#define RADEON_INFO_VRAM_USAGE 0x1e +#define RADEON_INFO_GTT_USAGE 0x1f
struct drm_radeon_info {
From: Marek Olšák marek.olsak@amd.com
Signed-off-by: Marek Olšák marek.olsak@amd.com Reviewed-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/radeon/radeon_gem.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 9863ca7..d09650c 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -344,18 +344,7 @@ int radeon_gem_busy_ioctl(struct drm_device *dev, void *data, } robj = gem_to_radeon_bo(gobj); r = radeon_bo_wait(robj, &cur_placement, true); - switch (cur_placement) { - case TTM_PL_VRAM: - args->domain = RADEON_GEM_DOMAIN_VRAM; - break; - case TTM_PL_TT: - args->domain = RADEON_GEM_DOMAIN_GTT; - break; - case TTM_PL_SYSTEM: - args->domain = RADEON_GEM_DOMAIN_CPU; - default: - break; - } + args->domain = radeon_mem_type_to_domain(cur_placement); drm_gem_object_unreference_unlocked(gobj); r = radeon_gem_handle_lockup(rdev, r); return r;
From: Marek Olšák marek.olsak@amd.com
Signed-off-by: Marek Olšák marek.olsak@amd.com Reviewed-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/radeon/radeon_cs.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+)
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index f28a8d8..d49a3f7 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -24,6 +24,7 @@ * Authors: * Jerome Glisse glisse@freedesktop.org */ +#include <linux/list_sort.h> #include <drm/drmP.h> #include <drm/radeon_drm.h> #include "radeon_reg.h" @@ -290,6 +291,16 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data) return 0; }
+static int cmp_size_smaller_first(void *priv, struct list_head *a, + struct list_head *b) +{ + struct radeon_bo_list *la = list_entry(a, struct radeon_bo_list, tv.head); + struct radeon_bo_list *lb = list_entry(b, struct radeon_bo_list, tv.head); + + /* Sort A before B if A is smaller. */ + return (int)la->bo->tbo.num_pages - (int)lb->bo->tbo.num_pages; +} + /** * cs_parser_fini() - clean parser states * @parser: parser structure holding parsing context. @@ -303,6 +314,18 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo unsigned i;
if (!error) { + /* Sort the buffer list from the smallest to largest buffer, + * which affects the order of buffers in the LRU list. + * This assures that the smallest buffers are added first + * to the LRU list, so they are likely to be later evicted + * first, instead of large buffers whose eviction is more + * expensive. + * + * This slightly lowers the number of bytes moved by TTM + * per frame under memory pressure. + */ + list_sort(NULL, &parser->validated, cmp_size_smaller_first); + ttm_eu_fence_buffer_objects(&parser->ticket, &parser->validated, parser->ib.fence);
From: Marek Olšák marek.olsak@amd.com
Userspace should set the first 4 bits of drm_radeon_cs_reloc::flags to a number from 0 to 15. The higher the number, the higher the priority, which means a buffer with a higher number will be validated sooner.
The old behavior is preserved: Buffers used for write are prioritized over read-only buffers if the userspace doesn't set the number.
v2: add buffers to buckets directly, then concatenate them v3: use a stable sort
Signed-off-by: Marek Olšák marek.olsak@amd.com --- drivers/gpu/drm/radeon/radeon.h | 1 - drivers/gpu/drm/radeon/radeon_cs.c | 64 ++++++++++++++++++++++++++++++++-- drivers/gpu/drm/radeon/radeon_object.c | 10 ------ drivers/gpu/drm/radeon/radeon_object.h | 2 -- 4 files changed, 61 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index df28613..ec719e8 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -481,7 +481,6 @@ struct radeon_bo_list { struct ttm_validate_buffer tv; struct radeon_bo *bo; uint64_t gpu_offset; - bool written; unsigned domain; unsigned alt_domain; u32 tiling_flags; diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index d49a3f7..07e1651 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -31,10 +31,52 @@ #include "radeon.h" #include "radeon_trace.h"
+#define RADEON_CS_MAX_PRIORITY 32u +#define RADEON_CS_NUM_BUCKETS (RADEON_CS_MAX_PRIORITY + 1) + +/* This is based on the bucket sort with O(n) time complexity. + * An item with priority "i" is added to bucket[i]. The lists are then + * concatenated in descending order. + */ +struct radeon_cs_buckets { + struct list_head bucket[RADEON_CS_NUM_BUCKETS]; +}; + +static void radeon_cs_buckets_init(struct radeon_cs_buckets *b) +{ + unsigned i; + + for (i = 0; i < RADEON_CS_NUM_BUCKETS; i++) + INIT_LIST_HEAD(&b->bucket[i]); +} + +static void radeon_cs_buckets_add(struct radeon_cs_buckets *b, + struct list_head *item, unsigned priority) +{ + /* Since buffers which appear sooner in the relocation list are + * likely to be used more often than buffers which appear later + * in the list, the sort mustn't change the ordering of buffers + * with the same priority, i.e. it must be stable. + */ + list_add_tail(item, &b->bucket[min(priority, RADEON_CS_MAX_PRIORITY)]); +} + +static void radeon_cs_buckets_get_list(struct radeon_cs_buckets *b, + struct list_head *out_list) +{ + unsigned i; + + /* Connect the sorted buckets in the output list. */ + for (i = 0; i < RADEON_CS_NUM_BUCKETS; i++) { + list_splice(&b->bucket[i], out_list); + } +} + static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) { struct drm_device *ddev = p->rdev->ddev; struct radeon_cs_chunk *chunk; + struct radeon_cs_buckets buckets; unsigned i, j; bool duplicate;
@@ -53,8 +95,12 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) if (p->relocs == NULL) { return -ENOMEM; } + + radeon_cs_buckets_init(&buckets); + for (i = 0; i < p->nrelocs; i++) { struct drm_radeon_cs_reloc *r; + unsigned priority;
duplicate = false; r = (struct drm_radeon_cs_reloc *)&chunk->kdata[i*4]; @@ -80,7 +126,14 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) p->relocs_ptr[i] = &p->relocs[i]; p->relocs[i].robj = gem_to_radeon_bo(p->relocs[i].gobj); p->relocs[i].lobj.bo = p->relocs[i].robj; - p->relocs[i].lobj.written = !!r->write_domain; + + /* The userspace buffer priorities are from 0 to 15. A higher + * number means the buffer is more important. + * Also, the buffers used for write have a higher priority than + * the buffers used for read only, which doubles the range + * to 0 to 31. 32 is reserved for the kernel driver. + */ + priority = (r->flags & 0xf) * 2 + !!r->write_domain;
/* the first reloc of an UVD job is the msg and that must be in VRAM, also but everything into VRAM on AGP cards to avoid @@ -94,6 +147,8 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) p->relocs[i].lobj.alt_domain = RADEON_GEM_DOMAIN_VRAM;
+ /* prioritize this over any other relocation */ + priority = RADEON_CS_MAX_PRIORITY; } else { uint32_t domain = r->write_domain ? r->write_domain : r->read_domains; @@ -107,9 +162,12 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) p->relocs[i].lobj.tv.bo = &p->relocs[i].robj->tbo; p->relocs[i].handle = r->handle;
- radeon_bo_list_add_object(&p->relocs[i].lobj, - &p->validated); + radeon_cs_buckets_add(&buckets, &p->relocs[i].lobj.tv.head, + priority); } + + radeon_cs_buckets_get_list(&buckets, &p->validated); + return radeon_bo_list_validate(&p->ticket, &p->validated, p->ring); }
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 282d6a2..8399fe0 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -366,16 +366,6 @@ void radeon_bo_fini(struct radeon_device *rdev) arch_phys_wc_del(rdev->mc.vram_mtrr); }
-void radeon_bo_list_add_object(struct radeon_bo_list *lobj, - struct list_head *head) -{ - if (lobj->written) { - list_add(&lobj->tv.head, head); - } else { - list_add_tail(&lobj->tv.head, head); - } -} - int radeon_bo_list_validate(struct ww_acquire_ctx *ticket, struct list_head *head, int ring) { diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h index a9a8c11..6c3ca9e 100644 --- a/drivers/gpu/drm/radeon/radeon_object.h +++ b/drivers/gpu/drm/radeon/radeon_object.h @@ -138,8 +138,6 @@ extern int radeon_bo_evict_vram(struct radeon_device *rdev); extern void radeon_bo_force_delete(struct radeon_device *rdev); extern int radeon_bo_init(struct radeon_device *rdev); extern void radeon_bo_fini(struct radeon_device *rdev); -extern void radeon_bo_list_add_object(struct radeon_bo_list *lobj, - struct list_head *head); extern int radeon_bo_list_validate(struct ww_acquire_ctx *ticket, struct list_head *head, int ring); extern int radeon_bo_fbdev_mmap(struct radeon_bo *bo,
From: Marek Olšák marek.olsak@amd.com
Signed-off-by: Marek Olšák marek.olsak@amd.com Reviewed-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/radeon/radeon_cs.c | 2 +- drivers/gpu/drm/radeon/radeon_object.c | 88 +++++++++++++++++++++++++++++++--- drivers/gpu/drm/radeon/radeon_object.h | 3 +- 3 files changed, 85 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 07e1651..5abae40 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -168,7 +168,7 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
radeon_cs_buckets_get_list(&buckets, &p->validated);
- return radeon_bo_list_validate(&p->ticket, &p->validated, p->ring); + return radeon_bo_list_validate(p->rdev, &p->ticket, &p->validated, p->ring); }
static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority) diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 8399fe0..ed03f2d 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -366,29 +366,105 @@ void radeon_bo_fini(struct radeon_device *rdev) arch_phys_wc_del(rdev->mc.vram_mtrr); }
-int radeon_bo_list_validate(struct ww_acquire_ctx *ticket, +/* Returns how many bytes TTM can move per IB. + */ +static u64 radeon_bo_get_threshold_for_moves(struct radeon_device *rdev) +{ + u64 real_vram_size = rdev->mc.real_vram_size; + u64 vram_usage = atomic64_read(&rdev->vram_usage); + + /* This function is based on the current VRAM usage. + * + * - If all of VRAM is free, allow relocating the number of bytes that + * is equal to 1/4 of the size of VRAM for this IB. + + * - If more than one half of VRAM is occupied, only allow relocating + * 1 MB of data for this IB. + * + * - From 0 to one half of used VRAM, the threshold decreases + * linearly. + * __________________ + * 1/4 of -|\ | + * VRAM | \ | + * | \ | + * | \ | + * | \ | + * | \ | + * | \ | + * | ________|1 MB + * |----------------| + * VRAM 0 % 100 % + * used used + * + * Note: It's a threshold, not a limit. The threshold must be crossed + * for buffer relocations to stop, so any buffer of an arbitrary size + * can be moved as long as the threshold isn't crossed before + * the relocation takes place. We don't want to disable buffer + * relocations completely. + * + * The idea is that buffers should be placed in VRAM at creation time + * and TTM should only do a minimum number of relocations during + * command submission. In practice, you need to submit at least + * a dozen IBs to move all buffers to VRAM if they are in GTT. + * + * Also, things can get pretty crazy under memory pressure and actual + * VRAM usage can change a lot, so playing safe even at 50% does + * consistently increase performance. + */ + + u64 half_vram = real_vram_size >> 1; + u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage; + u64 bytes_moved_threshold = half_free_vram >> 1; + return max(bytes_moved_threshold, 1024*1024ull); +} + +int radeon_bo_list_validate(struct radeon_device *rdev, + struct ww_acquire_ctx *ticket, struct list_head *head, int ring) { struct radeon_bo_list *lobj; struct radeon_bo *bo; - u32 domain; int r; + u64 bytes_moved = 0, initial_bytes_moved; + u64 bytes_moved_threshold = radeon_bo_get_threshold_for_moves(rdev);
r = ttm_eu_reserve_buffers(ticket, head); if (unlikely(r != 0)) { return r; } + list_for_each_entry(lobj, head, tv.head) { bo = lobj->bo; if (!bo->pin_count) { - domain = lobj->domain; - + u32 domain = lobj->domain; + u32 current_domain = + radeon_mem_type_to_domain(bo->tbo.mem.mem_type); + + /* Check if this buffer will be moved and don't move it + * if we have moved too many buffers for this IB already. + * + * Note that this allows moving at least one buffer of + * any size, because it doesn't take the current "bo" + * into account. We don't want to disallow buffer moves + * completely. + */ + if (current_domain != RADEON_GEM_DOMAIN_CPU && + (domain & current_domain) == 0 && /* will be moved */ + bytes_moved > bytes_moved_threshold) { + /* don't move it */ + domain = current_domain; + } + retry: radeon_ttm_placement_from_domain(bo, domain); if (ring == R600_RING_TYPE_UVD_INDEX) radeon_uvd_force_into_uvd_segment(bo); - r = ttm_bo_validate(&bo->tbo, &bo->placement, - true, false); + + initial_bytes_moved = atomic64_read(&rdev->num_bytes_moved); + r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false); + bytes_moved += atomic64_read(&rdev->num_bytes_moved) - + initial_bytes_moved; + if (unlikely(r)) { if (r != -ERESTARTSYS && domain != lobj->alt_domain) { domain = lobj->alt_domain; diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h index 6c3ca9e..7dff64d 100644 --- a/drivers/gpu/drm/radeon/radeon_object.h +++ b/drivers/gpu/drm/radeon/radeon_object.h @@ -138,7 +138,8 @@ extern int radeon_bo_evict_vram(struct radeon_device *rdev); extern void radeon_bo_force_delete(struct radeon_device *rdev); extern int radeon_bo_init(struct radeon_device *rdev); extern void radeon_bo_fini(struct radeon_device *rdev); -extern int radeon_bo_list_validate(struct ww_acquire_ctx *ticket, +extern int radeon_bo_list_validate(struct radeon_device *rdev, + struct ww_acquire_ctx *ticket, struct list_head *head, int ring); extern int radeon_bo_fbdev_mmap(struct radeon_bo *bo, struct vm_area_struct *vma);
dri-devel@lists.freedesktop.org