48-bit virtual address range will be enabled in i915 soon, but some objects must be referenced by 32-bit offsets. These patches use a new kernel flag to specify if this restriction applies or not.
I'm sending these patches to comply with the i915 merge process. Once the kernel patch is merged, I'll make a new libdrm release and address the mesa build dependency.
Michel Thierry (2): intel: Add EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag intel: add new function name to symbol-check test
include/drm/i915_drm.h | 3 ++- intel/intel-symbol-check | 1 + intel/intel_bufmgr.c | 16 ++++++++++++++ intel/intel_bufmgr.h | 6 +++++- intel/intel_bufmgr_gem.c | 54 +++++++++++++++++++++++++++++++++++++++++++---- intel/intel_bufmgr_priv.h | 11 ++++++++++ 6 files changed, 85 insertions(+), 6 deletions(-)
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap (GSH) or Instruction State Heap (ISH) must be in a 32-bit range, because the General State Offset and Instruction State Offset are limited to 32-bits.
The i915 driver has been modified to provide a flag to set when the 4GB limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS). 48-bit range will only be used when explicitly requested.
Calls to the new drm_intel_bo_emit_reloc_48bit function will have this flag set automatically, while calls to drm_intel_bo_emit_reloc will clear it.
v2: Make set/clear functions nops on pre-gen8 platforms, and use them internally in emit_reloc functions (Ben) s/48BADDRESS/48B_ADDRESS/ (Dave) v3: Keep set/clear functions internal, no-one needs to use them directly.
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Dave Gordon david.s.gordon@intel.com Signed-off-by: Michel Thierry michel.thierry@intel.com --- include/drm/i915_drm.h | 3 ++- intel/intel_bufmgr.c | 16 ++++++++++++++ intel/intel_bufmgr.h | 6 +++++- intel/intel_bufmgr_gem.c | 54 +++++++++++++++++++++++++++++++++++++++++++---- intel/intel_bufmgr_priv.h | 11 ++++++++++ 5 files changed, 84 insertions(+), 6 deletions(-)
diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h index ded43b1..426b25c 100644 --- a/include/drm/i915_drm.h +++ b/include/drm/i915_drm.h @@ -680,7 +680,8 @@ struct drm_i915_gem_exec_object2 { #define EXEC_OBJECT_NEEDS_FENCE (1<<0) #define EXEC_OBJECT_NEEDS_GTT (1<<1) #define EXEC_OBJECT_WRITE (1<<2) -#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1) +#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3) +#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1) __u64 flags;
__u64 rsvd1; diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c index 14ea9f9..0bd5191 100644 --- a/intel/intel_bufmgr.c +++ b/intel/intel_bufmgr.c @@ -202,6 +202,22 @@ drm_intel_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, uint32_t read_domains, uint32_t write_domain) { + if (bo->bufmgr->bo_clear_supports_48b_address) + bo->bufmgr->bo_clear_supports_48b_address(target_bo); + + return bo->bufmgr->bo_emit_reloc(bo, offset, + target_bo, target_offset, + read_domains, write_domain); +} + +int +drm_intel_bo_emit_reloc_48bit(drm_intel_bo *bo, uint32_t offset, + drm_intel_bo *target_bo, uint32_t target_offset, + uint32_t read_domains, uint32_t write_domain) +{ + if (bo->bufmgr->bo_set_supports_48b_address) + bo->bufmgr->bo_set_supports_48b_address(target_bo); + return bo->bufmgr->bo_emit_reloc(bo, offset, target_bo, target_offset, read_domains, write_domain); diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h index 285919e..8f91ffe 100644 --- a/intel/intel_bufmgr.h +++ b/intel/intel_bufmgr.h @@ -87,7 +87,8 @@ struct _drm_intel_bo { /** * Last seen card virtual address (offset from the beginning of the * aperture) for the object. This should be used to fill relocation - * entries when calling drm_intel_bo_emit_reloc() + * entries when calling drm_intel_bo_emit_reloc() or + * drm_intel_bo_emit_reloc_48bit() */ uint64_t offset64; }; @@ -147,6 +148,9 @@ int drm_intel_bufmgr_check_aperture_space(drm_intel_bo ** bo_array, int count); int drm_intel_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, uint32_t read_domains, uint32_t write_domain); +int drm_intel_bo_emit_reloc_48bit(drm_intel_bo *bo, uint32_t offset, + drm_intel_bo *target_bo, uint32_t target_offset, + uint32_t read_domains, uint32_t write_domain); int drm_intel_bo_emit_reloc_fence(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c index 41de396..713ed4e 100644 --- a/intel/intel_bufmgr_gem.c +++ b/intel/intel_bufmgr_gem.c @@ -140,6 +140,7 @@ typedef struct _drm_intel_bufmgr_gem { } drm_intel_bufmgr_gem;
#define DRM_INTEL_RELOC_FENCE (1<<0) +#define DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS (2<<0)
typedef struct _drm_intel_reloc_target_info { drm_intel_bo *bo; @@ -237,6 +238,14 @@ struct _drm_intel_bo_gem { bool is_userptr;
/** + * Boolean of whether this buffer can be in the whole 48-bit address. + * + * By default, buffers will be keep in a 32-bit range, unless this + * flag is explicitly set. + */ + bool supports_48b_address; + + /** * Size in bytes of this buffer and its relocation descendents. * * Used to avoid costly tree walking in @@ -463,13 +472,18 @@ drm_intel_add_validate_buffer(drm_intel_bo *bo) }
static void -drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence) +drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence, + int supports_48b_address) { drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bo->bufmgr; drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo; int index;
if (bo_gem->validate_index != -1) { + if (supports_48b_address) { + bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= + EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + } if (need_fence) bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= EXEC_OBJECT_NEEDS_FENCE; @@ -508,6 +522,10 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence) bufmgr_gem->exec2_objects[index].flags |= EXEC_OBJECT_NEEDS_FENCE; } + if (supports_48b_address) { + bufmgr_gem->exec2_objects[index].flags |= + EXEC_OBJECT_SUPPORTS_48B_ADDRESS; + } bufmgr_gem->exec_count++; }
@@ -1925,11 +1943,33 @@ do_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, else bo_gem->reloc_target_info[bo_gem->reloc_count].flags = 0;
+ if (target_bo_gem->supports_48b_address) + bo_gem->reloc_target_info[bo_gem->reloc_count].flags |= + DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS; + bo_gem->reloc_count++;
return 0; }
+static void drm_intel_gem_bo_set_supports_48b_address(drm_intel_bo *bo) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; + + if (bufmgr_gem->gen >= 8) + bo_gem->supports_48b_address = 1; +} + +static void drm_intel_gem_bo_clear_supports_48b_address(drm_intel_bo *bo) +{ + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; + + if (bufmgr_gem->gen >= 8) + bo_gem->supports_48b_address = 0; +} + static int drm_intel_gem_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, @@ -2043,7 +2083,7 @@ drm_intel_gem_bo_process_reloc2(drm_intel_bo *bo)
for (i = 0; i < bo_gem->reloc_count; i++) { drm_intel_bo *target_bo = bo_gem->reloc_target_info[i].bo; - int need_fence; + int need_fence, supports_48b_addr;
if (target_bo == bo) continue; @@ -2056,8 +2096,12 @@ drm_intel_gem_bo_process_reloc2(drm_intel_bo *bo) need_fence = (bo_gem->reloc_target_info[i].flags & DRM_INTEL_RELOC_FENCE);
+ supports_48b_addr = (bo_gem->reloc_target_info[i].flags & + DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS); + /* Add the target to the validate list */ - drm_intel_add_validate_buffer2(target_bo, need_fence); + drm_intel_add_validate_buffer2(target_bo, need_fence, + supports_48b_addr); } }
@@ -2217,7 +2261,7 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx, /* Add the batch buffer to the validation list. There are no relocations * pointing to it. */ - drm_intel_add_validate_buffer2(bo, 0); + drm_intel_add_validate_buffer2(bo, 0, 0);
memclear(execbuf); execbuf.buffers_ptr = (uintptr_t)bufmgr_gem->exec2_objects; @@ -3299,6 +3343,8 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size) bufmgr_gem->bufmgr.bo_wait_rendering = drm_intel_gem_bo_wait_rendering; bufmgr_gem->bufmgr.bo_emit_reloc = drm_intel_gem_bo_emit_reloc; bufmgr_gem->bufmgr.bo_emit_reloc_fence = drm_intel_gem_bo_emit_reloc_fence; + bufmgr_gem->bufmgr.bo_set_supports_48b_address = drm_intel_gem_bo_set_supports_48b_address; + bufmgr_gem->bufmgr.bo_clear_supports_48b_address = drm_intel_gem_bo_clear_supports_48b_address; bufmgr_gem->bufmgr.bo_pin = drm_intel_gem_bo_pin; bufmgr_gem->bufmgr.bo_unpin = drm_intel_gem_bo_unpin; bufmgr_gem->bufmgr.bo_get_tiling = drm_intel_gem_bo_get_tiling; diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h index 59ebd18..774fa02 100644 --- a/intel/intel_bufmgr_priv.h +++ b/intel/intel_bufmgr_priv.h @@ -152,6 +152,17 @@ struct _drm_intel_bufmgr { void (*destroy) (drm_intel_bufmgr *bufmgr);
/** + * Set/Clear 48-bit address support flag in a given bo. + * + * Any resource used with flat/heapless (0x00000000-0xfffff000) + * General State Heap (GSH) or Intructions State Heap (ISH) must + * be in a 32-bit range. 48-bit range will only be used when explicitly + * requested. + */ + void (*bo_set_supports_48b_address) (drm_intel_bo *bo); + void (*bo_clear_supports_48b_address) (drm_intel_bo *bo); + + /** * Add relocation entry in reloc_buf, which will be updated with the * target buffer's real offset on on command submission. *
On Fri, Aug 07, 2015 at 10:45:21AM +0100, Michel Thierry wrote:
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap (GSH) or Instruction State Heap (ISH) must be in a 32-bit range, because the General State Offset and Instruction State Offset are limited to 32-bits.
The i915 driver has been modified to provide a flag to set when the 4GB limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS). 48-bit range will only be used when explicitly requested.
Calls to the new drm_intel_bo_emit_reloc_48bit function will have this flag set automatically, while calls to drm_intel_bo_emit_reloc will clear it.
v2: Make set/clear functions nops on pre-gen8 platforms, and use them internally in emit_reloc functions (Ben) s/48BADDRESS/48B_ADDRESS/ (Dave) v3: Keep set/clear functions internal, no-one needs to use them directly.
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Dave Gordon david.s.gordon@intel.com Signed-off-by: Michel Thierry michel.thierry@intel.com
include/drm/i915_drm.h | 3 ++- intel/intel_bufmgr.c | 16 ++++++++++++++ intel/intel_bufmgr.h | 6 +++++- intel/intel_bufmgr_gem.c | 54 +++++++++++++++++++++++++++++++++++++++++++---- intel/intel_bufmgr_priv.h | 11 ++++++++++ 5 files changed, 84 insertions(+), 6 deletions(-)
diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h index ded43b1..426b25c 100644 --- a/include/drm/i915_drm.h +++ b/include/drm/i915_drm.h @@ -680,7 +680,8 @@ struct drm_i915_gem_exec_object2 { #define EXEC_OBJECT_NEEDS_FENCE (1<<0) #define EXEC_OBJECT_NEEDS_GTT (1<<1) #define EXEC_OBJECT_WRITE (1<<2) -#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1) +#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3) +#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1) __u64 flags;
__u64 rsvd1; diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c index 14ea9f9..0bd5191 100644 --- a/intel/intel_bufmgr.c +++ b/intel/intel_bufmgr.c @@ -202,6 +202,22 @@ drm_intel_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, uint32_t read_domains, uint32_t write_domain) {
- if (bo->bufmgr->bo_clear_supports_48b_address)
bo->bufmgr->bo_clear_supports_48b_address(target_bo);
This is always true - you assign func to this func pointer unconditionally. Also - why not return some meaningful value if the user does not have enable_ppgtt=3 set? You can get that from I915_PARAM_HAS_ALIASING_PPGTT right now. Check for gen >= 8 seems rather inadequate, and even then you're not returning anything useful to the caller.
- return bo->bufmgr->bo_emit_reloc(bo, offset,
target_bo, target_offset,
read_domains, write_domain);
+}
Using emit_reloc to set a BO flag seems confusing and can be error prone, emit_reloc can be called many times and caller needs to be careful and call the right function for each reloc.
+int +drm_intel_bo_emit_reloc_48bit(drm_intel_bo *bo, uint32_t offset,
drm_intel_bo *target_bo, uint32_t target_offset,
uint32_t read_domains, uint32_t write_domain)
+{
- if (bo->bufmgr->bo_set_supports_48b_address)
bo->bufmgr->bo_set_supports_48b_address(target_bo);
Same situation as with clear_supports_48b_address.
- return bo->bufmgr->bo_emit_reloc(bo, offset, target_bo, target_offset, read_domains, write_domain);
diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h index 285919e..8f91ffe 100644 --- a/intel/intel_bufmgr.h +++ b/intel/intel_bufmgr.h @@ -87,7 +87,8 @@ struct _drm_intel_bo { /** * Last seen card virtual address (offset from the beginning of the * aperture) for the object. This should be used to fill relocation
* entries when calling drm_intel_bo_emit_reloc()
* entries when calling drm_intel_bo_emit_reloc() or
*/ uint64_t offset64;* drm_intel_bo_emit_reloc_48bit()
}; @@ -147,6 +148,9 @@ int drm_intel_bufmgr_check_aperture_space(drm_intel_bo ** bo_array, int count); int drm_intel_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, uint32_t read_domains, uint32_t write_domain); +int drm_intel_bo_emit_reloc_48bit(drm_intel_bo *bo, uint32_t offset,
drm_intel_bo *target_bo, uint32_t target_offset,
uint32_t read_domains, uint32_t write_domain);
int drm_intel_bo_emit_reloc_fence(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c index 41de396..713ed4e 100644 --- a/intel/intel_bufmgr_gem.c +++ b/intel/intel_bufmgr_gem.c @@ -140,6 +140,7 @@ typedef struct _drm_intel_bufmgr_gem { } drm_intel_bufmgr_gem;
#define DRM_INTEL_RELOC_FENCE (1<<0) +#define DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS (2<<0)
Why are you duplicating information about support for 48B in both emit reloc function argument and struct bo_gem?
typedef struct _drm_intel_reloc_target_info { drm_intel_bo *bo; @@ -237,6 +238,14 @@ struct _drm_intel_bo_gem { bool is_userptr;
/**
* Boolean of whether this buffer can be in the whole 48-bit address.
*
* By default, buffers will be keep in a 32-bit range, unless this
* flag is explicitly set.
*/
- bool supports_48b_address;
- /**
- Size in bytes of this buffer and its relocation descendents.
- Used to avoid costly tree walking in
@@ -463,13 +472,18 @@ drm_intel_add_validate_buffer(drm_intel_bo *bo) }
static void -drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence) +drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence,
int supports_48b_address)
{ drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bo->bufmgr; drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo; int index;
if (bo_gem->validate_index != -1) {
if (supports_48b_address) {
bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |=
EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
if (need_fence) bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= EXEC_OBJECT_NEEDS_FENCE;}
@@ -508,6 +522,10 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence) bufmgr_gem->exec2_objects[index].flags |= EXEC_OBJECT_NEEDS_FENCE; }
- if (supports_48b_address) {
bufmgr_gem->exec2_objects[index].flags |=
EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
- } bufmgr_gem->exec_count++;
}
@@ -1925,11 +1943,33 @@ do_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, else bo_gem->reloc_target_info[bo_gem->reloc_count].flags = 0;
if (target_bo_gem->supports_48b_address)
bo_gem->reloc_target_info[bo_gem->reloc_count].flags |=
DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS;
bo_gem->reloc_count++;
return 0;
}
+static void drm_intel_gem_bo_set_supports_48b_address(drm_intel_bo *bo) +{
- drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
- drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
- if (bufmgr_gem->gen >= 8)
bo_gem->supports_48b_address = 1;
+}
+static void drm_intel_gem_bo_clear_supports_48b_address(drm_intel_bo *bo) +{
- drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
- drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
- if (bufmgr_gem->gen >= 8)
bo_gem->supports_48b_address = 0;
+}
static int drm_intel_gem_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, @@ -2043,7 +2083,7 @@ drm_intel_gem_bo_process_reloc2(drm_intel_bo *bo)
for (i = 0; i < bo_gem->reloc_count; i++) { drm_intel_bo *target_bo = bo_gem->reloc_target_info[i].bo;
int need_fence;
int need_fence, supports_48b_addr;
if (target_bo == bo) continue;
@@ -2056,8 +2096,12 @@ drm_intel_gem_bo_process_reloc2(drm_intel_bo *bo) need_fence = (bo_gem->reloc_target_info[i].flags & DRM_INTEL_RELOC_FENCE);
supports_48b_addr = (bo_gem->reloc_target_info[i].flags &
DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS);
- /* Add the target to the validate list */
drm_intel_add_validate_buffer2(target_bo, need_fence);
drm_intel_add_validate_buffer2(target_bo, need_fence,
}supports_48b_addr);
}
@@ -2217,7 +2261,7 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx, /* Add the batch buffer to the validation list. There are no relocations * pointing to it. */
- drm_intel_add_validate_buffer2(bo, 0);
drm_intel_add_validate_buffer2(bo, 0, 0);
memclear(execbuf); execbuf.buffers_ptr = (uintptr_t)bufmgr_gem->exec2_objects;
@@ -3299,6 +3343,8 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size) bufmgr_gem->bufmgr.bo_wait_rendering = drm_intel_gem_bo_wait_rendering; bufmgr_gem->bufmgr.bo_emit_reloc = drm_intel_gem_bo_emit_reloc; bufmgr_gem->bufmgr.bo_emit_reloc_fence = drm_intel_gem_bo_emit_reloc_fence;
- bufmgr_gem->bufmgr.bo_set_supports_48b_address = drm_intel_gem_bo_set_supports_48b_address;
- bufmgr_gem->bufmgr.bo_clear_supports_48b_address = drm_intel_gem_bo_clear_supports_48b_address; bufmgr_gem->bufmgr.bo_pin = drm_intel_gem_bo_pin; bufmgr_gem->bufmgr.bo_unpin = drm_intel_gem_bo_unpin; bufmgr_gem->bufmgr.bo_get_tiling = drm_intel_gem_bo_get_tiling;
diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h index 59ebd18..774fa02 100644 --- a/intel/intel_bufmgr_priv.h +++ b/intel/intel_bufmgr_priv.h @@ -152,6 +152,17 @@ struct _drm_intel_bufmgr { void (*destroy) (drm_intel_bufmgr *bufmgr);
/**
* Set/Clear 48-bit address support flag in a given bo.
*
* Any resource used with flat/heapless (0x00000000-0xfffff000)
* General State Heap (GSH) or Intructions State Heap (ISH) must
* be in a 32-bit range. 48-bit range will only be used when explicitly
* requested.
*/
- void (*bo_set_supports_48b_address) (drm_intel_bo *bo);
- void (*bo_clear_supports_48b_address) (drm_intel_bo *bo);
- /**
- Add relocation entry in reloc_buf, which will be updated with the
- target buffer's real offset on on command submission.
-- 2.5.0
You also haven't updated the dump_validation_list with proper format specifier (addresses are truncated otherwise).
I really don't like hiding set_supports_48b_address/clear_supports_48b_address in emit reloc - making this explicit would be a better approach. Of course this is just my opinion, but let me post a bit different version of this API - just as an RFC so you can see how it would look with explicit set on bo and without adding another emit_reloc variant.
--- include/drm/i915_drm.h | 3 ++- intel/intel_bufmgr.c | 11 +++++++++++ intel/intel_bufmgr.h | 1 + intel/intel_bufmgr_gem.c | 43 +++++++++++++++++++++++++++++++++---------- intel/intel_bufmgr_priv.h | 8 ++++++++ 5 files changed, 55 insertions(+), 11 deletions(-)
diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h index ded43b1..426b25c 100644 --- a/include/drm/i915_drm.h +++ b/include/drm/i915_drm.h @@ -680,7 +680,8 @@ struct drm_i915_gem_exec_object2 { #define EXEC_OBJECT_NEEDS_FENCE (1<<0) #define EXEC_OBJECT_NEEDS_GTT (1<<1) #define EXEC_OBJECT_WRITE (1<<2) -#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1) +#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3) +#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1) __u64 flags;
__u64 rsvd1; diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c index 14ea9f9..97ea6ec 100644 --- a/intel/intel_bufmgr.c +++ b/intel/intel_bufmgr.c @@ -261,6 +261,17 @@ drm_intel_bo_get_tiling(drm_intel_bo *bo, uint32_t * tiling_mode, }
int +drm_intel_bo_use_full_range(drm_intel_bo *bo, uint32_t enable) +{ + if (bo->bufmgr->bo_use_full_range) { + bo->bufmgr->bo_use_full_range(bo, enable); + return 0; + } + + return -ENODEV; +} + +int drm_intel_bo_disable_reuse(drm_intel_bo *bo) { if (bo->bufmgr->bo_disable_reuse) diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h index 285919e..2635fa4 100644 --- a/intel/intel_bufmgr.h +++ b/intel/intel_bufmgr.h @@ -160,6 +160,7 @@ int drm_intel_bo_get_tiling(drm_intel_bo *bo, uint32_t * tiling_mode, int drm_intel_bo_flink(drm_intel_bo *bo, uint32_t * name); int drm_intel_bo_busy(drm_intel_bo *bo); int drm_intel_bo_madvise(drm_intel_bo *bo, int madv); +int drm_intel_bo_use_full_range(drm_intel_bo *bo, uint32_t enable);
int drm_intel_bo_disable_reuse(drm_intel_bo *bo); int drm_intel_bo_is_reusable(drm_intel_bo *bo); diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c index 41de396..ef71686 100644 --- a/intel/intel_bufmgr_gem.c +++ b/intel/intel_bufmgr_gem.c @@ -237,6 +237,11 @@ struct _drm_intel_bo_gem { bool is_userptr;
/** + * Whether this buffer can be placed in full (2^48) ppgtt range on gen8+ + */ + bool uses_full_range; + + /** * Size in bytes of this buffer and its relocation descendents. * * Used to avoid costly tree walking in @@ -395,8 +400,8 @@ drm_intel_gem_dump_validation_list(drm_intel_bufmgr_gem *bufmgr_gem) drm_intel_bo_gem *target_gem = (drm_intel_bo_gem *) target_bo;
- DBG("%2d: %d (%s)@0x%08llx -> " - "%d (%s)@0x%08lx + 0x%08x\n", + DBG("%2d: %d (%s)@0x%016llx -> " + "%d (%s)@0x%016lx + 0x%08x\n", i, bo_gem->gem_handle, bo_gem->name, (unsigned long long)bo_gem->relocs[j].offset, @@ -468,11 +473,15 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence) drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bo->bufmgr; drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo; int index; + int flags = 0; + + if (need_fence) + flags |= EXEC_OBJECT_NEEDS_FENCE; + if (bo_gem->uses_full_range) + flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
if (bo_gem->validate_index != -1) { - if (need_fence) - bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= - EXEC_OBJECT_NEEDS_FENCE; + bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= flags; return; }
@@ -501,13 +510,9 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence) bufmgr_gem->exec2_objects[index].alignment = bo->align; bufmgr_gem->exec2_objects[index].offset = 0; bufmgr_gem->exec_bos[index] = bo; - bufmgr_gem->exec2_objects[index].flags = 0; + bufmgr_gem->exec2_objects[index].flags = flags; bufmgr_gem->exec2_objects[index].rsvd1 = 0; bufmgr_gem->exec2_objects[index].rsvd2 = 0; - if (need_fence) { - bufmgr_gem->exec2_objects[index].flags |= - EXEC_OBJECT_NEEDS_FENCE; - } bufmgr_gem->exec_count++; }
@@ -780,6 +785,7 @@ retry: bo_gem->used_as_reloc_target = false; bo_gem->has_error = false; bo_gem->reusable = true; + bo_gem->uses_full_range = false;
drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, alignment);
@@ -926,6 +932,7 @@ drm_intel_gem_bo_alloc_userptr(drm_intel_bufmgr *bufmgr, bo_gem->used_as_reloc_target = false; bo_gem->has_error = false; bo_gem->reusable = false; + bo_gem->uses_full_range = false;
drm_intel_bo_gem_set_in_aperture_size(bufmgr_gem, bo_gem, 0);
@@ -1081,6 +1088,7 @@ drm_intel_bo_gem_create_from_name(drm_intel_bufmgr *bufmgr, bo_gem->bo.handle = open_arg.handle; bo_gem->global_name = handle; bo_gem->reusable = false; + bo_gem->uses_full_range = false;
memclear(get_tiling); get_tiling.handle = bo_gem->gem_handle; @@ -2418,6 +2426,13 @@ drm_intel_gem_bo_get_tiling(drm_intel_bo *bo, uint32_t * tiling_mode, return 0; }
+static void +drm_intel_gem_bo_use_full_range(drm_intel_bo *bo, uint32_t enable) +{ + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; + bo_gem->uses_full_range = enable; +} + drm_intel_bo * drm_intel_bo_gem_create_from_prime(drm_intel_bufmgr *bufmgr, int prime_fd, int size) { @@ -2482,6 +2497,7 @@ drm_intel_bo_gem_create_from_prime(drm_intel_bufmgr *bufmgr, int prime_fd, int s bo_gem->used_as_reloc_target = false; bo_gem->has_error = false; bo_gem->reusable = false; + bo_gem->uses_full_range = false;
DRMINITLISTHEAD(&bo_gem->vma_list); DRMLISTADDTAIL(&bo_gem->name_list, &bufmgr_gem->named); @@ -3278,6 +3294,13 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size) } }
+ if (bufmgr_gem->gen >= 8) { + gp.param = I915_PARAM_HAS_ALIASING_PPGTT; + ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GETPARAM, &gp); + if (ret == 0 && *gp.value == 3) + bufmgr_gem->bufmgr.bo_use_full_range = drm_intel_gem_bo_use_full_range; + } + /* Let's go with one relocation per every 2 dwords (but round down a bit * since a power of two will mean an extra page allocation for the reloc * buffer). diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h index 59ebd18..3461d31 100644 --- a/intel/intel_bufmgr_priv.h +++ b/intel/intel_bufmgr_priv.h @@ -227,6 +227,14 @@ struct _drm_intel_bufmgr { uint32_t * swizzle_mode);
/** + * Indicate that the buffer can be placed in full ppgtt range (2^48) + * without restrictions. + * \param bo Buffer to set the full_range flag for + * \param enable Flag value + */ + void (*bo_use_full_range) (drm_intel_bo *bo, uint32_t enable); + + /** * Create a visible name for a buffer which can be used by other apps * * \param buf Buffer to create a name for
On 8/7/2015 11:56 AM, Michał Winiarski wrote:
On Fri, Aug 07, 2015 at 10:45:21AM +0100, Michel Thierry wrote:
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap (GSH) or Instruction State Heap (ISH) must be in a 32-bit range, because the General State Offset and Instruction State Offset are limited to 32-bits.
The i915 driver has been modified to provide a flag to set when the 4GB limit is not necessary in a given bo (EXEC_OBJECT_SUPPORTS_48B_ADDRESS). 48-bit range will only be used when explicitly requested.
Calls to the new drm_intel_bo_emit_reloc_48bit function will have this flag set automatically, while calls to drm_intel_bo_emit_reloc will clear it.
v2: Make set/clear functions nops on pre-gen8 platforms, and use them internally in emit_reloc functions (Ben) s/48BADDRESS/48B_ADDRESS/ (Dave) v3: Keep set/clear functions internal, no-one needs to use them directly.
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Dave Gordon david.s.gordon@intel.com Signed-off-by: Michel Thierry michel.thierry@intel.com
include/drm/i915_drm.h | 3 ++- intel/intel_bufmgr.c | 16 ++++++++++++++ intel/intel_bufmgr.h | 6 +++++- intel/intel_bufmgr_gem.c | 54 +++++++++++++++++++++++++++++++++++++++++++---- intel/intel_bufmgr_priv.h | 11 ++++++++++ 5 files changed, 84 insertions(+), 6 deletions(-)
diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h index ded43b1..426b25c 100644 --- a/include/drm/i915_drm.h +++ b/include/drm/i915_drm.h @@ -680,7 +680,8 @@ struct drm_i915_gem_exec_object2 { #define EXEC_OBJECT_NEEDS_FENCE (1<<0) #define EXEC_OBJECT_NEEDS_GTT (1<<1) #define EXEC_OBJECT_WRITE (1<<2) -#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_WRITE<<1) +#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3) +#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_SUPPORTS_48B_ADDRESS<<1) __u64 flags;
__u64 rsvd1; diff --git a/intel/intel_bufmgr.c b/intel/intel_bufmgr.c index 14ea9f9..0bd5191 100644 --- a/intel/intel_bufmgr.c +++ b/intel/intel_bufmgr.c @@ -202,6 +202,22 @@ drm_intel_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, uint32_t read_domains, uint32_t write_domain) {
- if (bo->bufmgr->bo_clear_supports_48b_address)
bo->bufmgr->bo_clear_supports_48b_address(target_bo);
This is always true - you assign func to this func pointer unconditionally. Also - why not return some meaningful value if the user does not have enable_ppgtt=3 set? You can get that from I915_PARAM_HAS_ALIASING_PPGTT right now. Check for gen >= 8 seems rather inadequate, and even then you're not returning anything useful to the caller.
- return bo->bufmgr->bo_emit_reloc(bo, offset,
target_bo, target_offset,
read_domains, write_domain);
+}
Using emit_reloc to set a BO flag seems confusing and can be error prone, emit_reloc can be called many times and caller needs to be careful and call the right function for each reloc.
+int +drm_intel_bo_emit_reloc_48bit(drm_intel_bo *bo, uint32_t offset,
drm_intel_bo *target_bo, uint32_t target_offset,
uint32_t read_domains, uint32_t write_domain)
+{
- if (bo->bufmgr->bo_set_supports_48b_address)
bo->bufmgr->bo_set_supports_48b_address(target_bo);
Same situation as with clear_supports_48b_address.
- return bo->bufmgr->bo_emit_reloc(bo, offset, target_bo, target_offset, read_domains, write_domain);
diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h index 285919e..8f91ffe 100644 --- a/intel/intel_bufmgr.h +++ b/intel/intel_bufmgr.h @@ -87,7 +87,8 @@ struct _drm_intel_bo { /** * Last seen card virtual address (offset from the beginning of the * aperture) for the object. This should be used to fill relocation
* entries when calling drm_intel_bo_emit_reloc()
* entries when calling drm_intel_bo_emit_reloc() or
*/ uint64_t offset64; };* drm_intel_bo_emit_reloc_48bit()
@@ -147,6 +148,9 @@ int drm_intel_bufmgr_check_aperture_space(drm_intel_bo ** bo_array, int count); int drm_intel_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset, uint32_t read_domains, uint32_t write_domain); +int drm_intel_bo_emit_reloc_48bit(drm_intel_bo *bo, uint32_t offset,
drm_intel_bo *target_bo, uint32_t target_offset,
int drm_intel_bo_emit_reloc_fence(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset,uint32_t read_domains, uint32_t write_domain);
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c index 41de396..713ed4e 100644 --- a/intel/intel_bufmgr_gem.c +++ b/intel/intel_bufmgr_gem.c @@ -140,6 +140,7 @@ typedef struct _drm_intel_bufmgr_gem { } drm_intel_bufmgr_gem;
#define DRM_INTEL_RELOC_FENCE (1<<0) +#define DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS (2<<0)
Why are you duplicating information about support for 48B in both emit reloc function argument and struct bo_gem?
typedef struct _drm_intel_reloc_target_info { drm_intel_bo *bo; @@ -237,6 +238,14 @@ struct _drm_intel_bo_gem { bool is_userptr;
/**
* Boolean of whether this buffer can be in the whole 48-bit address.
*
* By default, buffers will be keep in a 32-bit range, unless this
* flag is explicitly set.
*/
- bool supports_48b_address;
- /**
- Size in bytes of this buffer and its relocation descendents.
- Used to avoid costly tree walking in
@@ -463,13 +472,18 @@ drm_intel_add_validate_buffer(drm_intel_bo *bo) }
static void -drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence) +drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence,
int supports_48b_address)
{ drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bo->bufmgr; drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo; int index;
if (bo_gem->validate_index != -1) {
if (supports_48b_address) {
bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |=
EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
}
if (need_fence) bufmgr_gem->exec2_objects[bo_gem->validate_index].flags |= EXEC_OBJECT_NEEDS_FENCE;
@@ -508,6 +522,10 @@ drm_intel_add_validate_buffer2(drm_intel_bo *bo, int need_fence) bufmgr_gem->exec2_objects[index].flags |= EXEC_OBJECT_NEEDS_FENCE; }
- if (supports_48b_address) {
bufmgr_gem->exec2_objects[index].flags |=
EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
- } bufmgr_gem->exec_count++; }
@@ -1925,11 +1943,33 @@ do_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, else bo_gem->reloc_target_info[bo_gem->reloc_count].flags = 0;
if (target_bo_gem->supports_48b_address)
bo_gem->reloc_target_info[bo_gem->reloc_count].flags |=
DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS;
bo_gem->reloc_count++;
return 0; }
+static void drm_intel_gem_bo_set_supports_48b_address(drm_intel_bo *bo) +{
- drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
- drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
- if (bufmgr_gem->gen >= 8)
bo_gem->supports_48b_address = 1;
+}
+static void drm_intel_gem_bo_clear_supports_48b_address(drm_intel_bo *bo) +{
- drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
- drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
- if (bufmgr_gem->gen >= 8)
bo_gem->supports_48b_address = 0;
+}
- static int drm_intel_gem_bo_emit_reloc(drm_intel_bo *bo, uint32_t offset, drm_intel_bo *target_bo, uint32_t target_offset,
@@ -2043,7 +2083,7 @@ drm_intel_gem_bo_process_reloc2(drm_intel_bo *bo)
for (i = 0; i < bo_gem->reloc_count; i++) { drm_intel_bo *target_bo = bo_gem->reloc_target_info[i].bo;
int need_fence;
int need_fence, supports_48b_addr;
if (target_bo == bo) continue;
@@ -2056,8 +2096,12 @@ drm_intel_gem_bo_process_reloc2(drm_intel_bo *bo) need_fence = (bo_gem->reloc_target_info[i].flags & DRM_INTEL_RELOC_FENCE);
supports_48b_addr = (bo_gem->reloc_target_info[i].flags &
DRM_INTEL_RELOC_SUPPORTS_48B_ADDRESS);
- /* Add the target to the validate list */
drm_intel_add_validate_buffer2(target_bo, need_fence);
drm_intel_add_validate_buffer2(target_bo, need_fence,
} }supports_48b_addr);
@@ -2217,7 +2261,7 @@ do_exec2(drm_intel_bo *bo, int used, drm_intel_context *ctx, /* Add the batch buffer to the validation list. There are no relocations * pointing to it. */
- drm_intel_add_validate_buffer2(bo, 0);
drm_intel_add_validate_buffer2(bo, 0, 0);
memclear(execbuf); execbuf.buffers_ptr = (uintptr_t)bufmgr_gem->exec2_objects;
@@ -3299,6 +3343,8 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size) bufmgr_gem->bufmgr.bo_wait_rendering = drm_intel_gem_bo_wait_rendering; bufmgr_gem->bufmgr.bo_emit_reloc = drm_intel_gem_bo_emit_reloc; bufmgr_gem->bufmgr.bo_emit_reloc_fence = drm_intel_gem_bo_emit_reloc_fence;
- bufmgr_gem->bufmgr.bo_set_supports_48b_address = drm_intel_gem_bo_set_supports_48b_address;
- bufmgr_gem->bufmgr.bo_clear_supports_48b_address = drm_intel_gem_bo_clear_supports_48b_address; bufmgr_gem->bufmgr.bo_pin = drm_intel_gem_bo_pin; bufmgr_gem->bufmgr.bo_unpin = drm_intel_gem_bo_unpin; bufmgr_gem->bufmgr.bo_get_tiling = drm_intel_gem_bo_get_tiling;
diff --git a/intel/intel_bufmgr_priv.h b/intel/intel_bufmgr_priv.h index 59ebd18..774fa02 100644 --- a/intel/intel_bufmgr_priv.h +++ b/intel/intel_bufmgr_priv.h @@ -152,6 +152,17 @@ struct _drm_intel_bufmgr { void (*destroy) (drm_intel_bufmgr *bufmgr);
/**
* Set/Clear 48-bit address support flag in a given bo.
*
* Any resource used with flat/heapless (0x00000000-0xfffff000)
* General State Heap (GSH) or Intructions State Heap (ISH) must
* be in a 32-bit range. 48-bit range will only be used when explicitly
* requested.
*/
- void (*bo_set_supports_48b_address) (drm_intel_bo *bo);
- void (*bo_clear_supports_48b_address) (drm_intel_bo *bo);
- /**
- Add relocation entry in reloc_buf, which will be updated with the
- target buffer's real offset on on command submission.
-- 2.5.0
You also haven't updated the dump_validation_list with proper format specifier (addresses are truncated otherwise).
I really don't like hiding set_supports_48b_address/clear_supports_48b_address in emit reloc - making this explicit would be a better approach. Of course this is just my opinion, but let me post a bit different version of this API - just as an RFC so you can see how it would look with explicit set on bo and without adding another emit_reloc variant.
Hi Michał,
Ben suggested having the set/clear in emit reloc as this is the only place mesa cares about these wa.
But I see your point, this will be used not only by mesa, so we should have something that is good for all the other projects.
-Michel
On Fri, 2015-08-07 at 12:36 +0100, Michel Thierry wrote:
Hi Michał,
Ben suggested having the set/clear in emit reloc as this is the only place mesa cares about these wa.
But I see your point, this will be used not only by mesa, so we should have something that is good for all the other projects.
-Michel
If there are no comments from anyone else then I'm fine with public API from last version. But comments about the error handling and implementation details could still be addressed.
-Michał
Signed-off-by: Michel Thierry michel.thierry@intel.com --- intel/intel-symbol-check | 1 + 1 file changed, 1 insertion(+)
diff --git a/intel/intel-symbol-check b/intel/intel-symbol-check index c555e6d..6f8450b 100755 --- a/intel/intel-symbol-check +++ b/intel/intel-symbol-check @@ -18,6 +18,7 @@ drm_intel_bo_busy drm_intel_bo_disable_reuse drm_intel_bo_emit_reloc drm_intel_bo_emit_reloc_fence +drm_intel_bo_emit_reloc_48bit drm_intel_bo_exec drm_intel_bo_fake_alloc_static drm_intel_bo_fake_disable_backing_store
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap or Intruction State Heap must be in a 32-bit range (GSH / ISH), because the General State Offset and Instruction State Offset are limited to 32-bits.
Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and the bo can be in the full address space.
This commit introduces a dependency of libdrm 2.4.63, which introduces the drm_intel_bo_emit_reloc_48bit function.
v2: s/48baddress/48b_address/, Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset is needed (Ben) v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation needs the 32-bit workaround (Chris)
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Chris Wilson chris@chris-wilson.co.uk Signed-off-by: Michel Thierry michel.thierry@intel.com --- configure.ac | 2 +- src/mesa/drivers/dri/i965/gen8_misc_state.c | 19 +++++++++++-------- src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++---- src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++-- 4 files changed, 36 insertions(+), 15 deletions(-)
diff --git a/configure.ac b/configure.ac index af61aa2..c92ca44 100644 --- a/configure.ac +++ b/configure.ac @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION]) dnl Versions for external dependencies LIBDRM_REQUIRED=2.4.38 LIBDRM_RADEON_REQUIRED=2.4.56 -LIBDRM_INTEL_REQUIRED=2.4.60 +LIBDRM_INTEL_REQUIRED=2.4.63 LIBDRM_NVVIEUX_REQUIRED=2.4.33 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41" LIBDRM_FREEDRENO_REQUIRED=2.4.57 diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c index b20038e..73eba06 100644 --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c @@ -28,6 +28,10 @@
/** * Define the base addresses which some state is referenced from. + * + * Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State + * Offset and Instruction State Offset are limited to 32-bits by hardware, + * and must be located in the first 4GBs (32-bit offset). */ void gen8_upload_state_base_address(struct brw_context *brw) { @@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct brw_context *brw) OUT_BATCH(0); OUT_BATCH(mocs_wb << 16); /* Surface state base address: */ - OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, - mocs_wb << 4 | 1); + OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, + mocs_wb << 4 | 1); /* Dynamic state base address: */ - OUT_RELOC64(brw->batch.bo, - I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0, - mocs_wb << 4 | 1); + OUT_RELOC64_INSIDE_4G(brw->batch.bo, + I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0, + mocs_wb << 4 | 1); /* Indirect object base address: MEDIA_OBJECT data */ OUT_BATCH(mocs_wb << 4 | 1); OUT_BATCH(0); /* Instruction base address: shader kernels (incl. SIP) */ - OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, - mocs_wb << 4 | 1); - + OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, + mocs_wb << 4 | 1); /* General state buffer size */ OUT_BATCH(0xfffff001); /* Dynamic state buffer size */ diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index 54081a1..ca90784 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -409,11 +409,23 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain, - uint32_t delta) + uint32_t delta, + bool support_48bit_offset) { - int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used, - buffer, delta, - read_domains, write_domain); + int ret; + + /* Not all buffers can be allocated outside the first 4GB, and + * offset must be limited to 32-bits. + */ + if (support_48bit_offset) + drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used, + buffer, delta, + read_domains, write_domain); + else + drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used, + buffer, delta, + read_domains, write_domain); + assert(ret == 0); (void) ret;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h index ef8a6ff..de5023b 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain, - uint32_t offset); + uint32_t offset, + bool support_48bit_offset); static inline uint32_t float_as_int(float f) { union { @@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
/* Handle 48-bit address relocations for Gen8+ */ #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \ - intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta); \ + intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 1); \ +} while (0) + +/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */ +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \ + intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 0); \ } while (0)
#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
On Fri, Aug 7, 2015 at 2:45 AM, Michel Thierry michel.thierry@intel.com wrote:
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index 54081a1..ca90784 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -409,11 +409,23 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
This patch needs to be rebased on commit 09348c12f (committed more than 3 weeks ago).
On Fri, Aug 7, 2015 at 5:45 AM, Michel Thierry michel.thierry@intel.com wrote:
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap or Intruction State Heap must be in a 32-bit range (GSH / ISH), because the General State Offset and Instruction State Offset are limited to 32-bits.
Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and the bo can be in the full address space.
This commit introduces a dependency of libdrm 2.4.63, which introduces the drm_intel_bo_emit_reloc_48bit function.
v2: s/48baddress/48b_address/, Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset is needed (Ben) v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation needs the 32-bit workaround (Chris)
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Chris Wilson chris@chris-wilson.co.uk Signed-off-by: Michel Thierry michel.thierry@intel.com
configure.ac | 2 +- src/mesa/drivers/dri/i965/gen8_misc_state.c | 19 +++++++++++-------- src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++---- src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++-- 4 files changed, 36 insertions(+), 15 deletions(-)
diff --git a/configure.ac b/configure.ac index af61aa2..c92ca44 100644 --- a/configure.ac +++ b/configure.ac @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION]) dnl Versions for external dependencies LIBDRM_REQUIRED=2.4.38 LIBDRM_RADEON_REQUIRED=2.4.56 -LIBDRM_INTEL_REQUIRED=2.4.60 +LIBDRM_INTEL_REQUIRED=2.4.63
There is no such version. I think you need a release before you can commit this. Otherwise you'll cause pain for a whole lot of people.
LIBDRM_NVVIEUX_REQUIRED=2.4.33 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41" LIBDRM_FREEDRENO_REQUIRED=2.4.57 diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c index b20038e..73eba06 100644 --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c @@ -28,6 +28,10 @@
/**
- Define the base addresses which some state is referenced from.
- Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
- Offset and Instruction State Offset are limited to 32-bits by hardware,
*/
- and must be located in the first 4GBs (32-bit offset).
void gen8_upload_state_base_address(struct brw_context *brw) { @@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct brw_context *brw) OUT_BATCH(0); OUT_BATCH(mocs_wb << 16); /* Surface state base address: */
- OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
/* Dynamic state base address: */mocs_wb << 4 | 1);
- OUT_RELOC64(brw->batch.bo,
I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->batch.bo,
I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
/* Indirect object base address: MEDIA_OBJECT data */ OUT_BATCH(mocs_wb << 4 | 1); OUT_BATCH(0); /* Instruction base address: shader kernels (incl. SIP) */mocs_wb << 4 | 1);
- OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
/* General state buffer size */ OUT_BATCH(0xfffff001); /* Dynamic state buffer size */mocs_wb << 4 | 1);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index 54081a1..ca90784 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -409,11 +409,23 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain,
uint32_t delta)
uint32_t delta,
bool support_48bit_offset)
{
- int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- int ret;
- /* Not all buffers can be allocated outside the first 4GB, and
- offset must be limited to 32-bits.
- */
- if (support_48bit_offset)
drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- else
drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- assert(ret == 0); (void) ret;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h index ef8a6ff..de5023b 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain,
uint32_t offset);
uint32_t offset,
bool support_48bit_offset);
static inline uint32_t float_as_int(float f) { union { @@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
/* Handle 48-bit address relocations for Gen8+ */ #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta); \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 1); \
+} while (0)
+/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */ +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 0); \
} while (0)
#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
2.5.0
dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
On 7 August 2015 at 20:57, Ilia Mirkin imirkin@alum.mit.edu wrote:
On Fri, Aug 7, 2015 at 5:45 AM, Michel Thierry michel.thierry@intel.com wrote:
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap or Intruction State Heap must be in a 32-bit range (GSH / ISH), because the General State Offset and Instruction State Offset are limited to 32-bits.
Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and the bo can be in the full address space.
This commit introduces a dependency of libdrm 2.4.63, which introduces the drm_intel_bo_emit_reloc_48bit function.
v2: s/48baddress/48b_address/, Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset is needed (Ben) v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation needs the 32-bit workaround (Chris)
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Chris Wilson chris@chris-wilson.co.uk Signed-off-by: Michel Thierry michel.thierry@intel.com
configure.ac | 2 +- src/mesa/drivers/dri/i965/gen8_misc_state.c | 19 +++++++++++-------- src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++---- src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++-- 4 files changed, 36 insertions(+), 15 deletions(-)
diff --git a/configure.ac b/configure.ac index af61aa2..c92ca44 100644 --- a/configure.ac +++ b/configure.ac @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION]) dnl Versions for external dependencies LIBDRM_REQUIRED=2.4.38 LIBDRM_RADEON_REQUIRED=2.4.56 -LIBDRM_INTEL_REQUIRED=2.4.60 +LIBDRM_INTEL_REQUIRED=2.4.63
There is no such version. I think you need a release before you can commit this. Otherwise you'll cause pain for a whole lot of people.
Afaics Michel explicitly covered the way things will/should be merged in the cover letter.
-Emil
On Fri, Aug 7, 2015 at 2:45 AM, Michel Thierry michel.thierry@intel.com wrote:
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap or Intruction State Heap must be in a 32-bit range (GSH / ISH), because the General State Offset and Instruction State Offset are limited to 32-bits.
This doesn't look right. From the workaround text, it doesn't sound like this is a HW problem, it only affects GMM. In mesa, we don't use "heapless" (which I assume means setting the base to 0 and max range) for instructions, dynamic state or surface state. Surface state and dynamic state is always in our batch bo which is limited to 8k. Provided STATE_BASE_ADDRESS works correctly in the HW, the batch bo can be places anywhere in the aperture. Offsets to dynamic or surface state is relative to the beginning of the batch bo and will always be less than 4g. Instructions are in their own bo (brw->cache.bo), but again, we point instruction state base to it and all our shader stage setup code references the instructions relative to the beginning of the instruction bo.
Kristian
Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and the bo can be in the full address space.
This commit introduces a dependency of libdrm 2.4.63, which introduces the drm_intel_bo_emit_reloc_48bit function.
v2: s/48baddress/48b_address/, Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset is needed (Ben) v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation needs the 32-bit workaround (Chris)
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Chris Wilson chris@chris-wilson.co.uk Signed-off-by: Michel Thierry michel.thierry@intel.com
configure.ac | 2 +- src/mesa/drivers/dri/i965/gen8_misc_state.c | 19 +++++++++++-------- src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++---- src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++-- 4 files changed, 36 insertions(+), 15 deletions(-)
diff --git a/configure.ac b/configure.ac index af61aa2..c92ca44 100644 --- a/configure.ac +++ b/configure.ac @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION]) dnl Versions for external dependencies LIBDRM_REQUIRED=2.4.38 LIBDRM_RADEON_REQUIRED=2.4.56 -LIBDRM_INTEL_REQUIRED=2.4.60 +LIBDRM_INTEL_REQUIRED=2.4.63 LIBDRM_NVVIEUX_REQUIRED=2.4.33 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41" LIBDRM_FREEDRENO_REQUIRED=2.4.57 diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c index b20038e..73eba06 100644 --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c @@ -28,6 +28,10 @@
/**
- Define the base addresses which some state is referenced from.
- Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
- Offset and Instruction State Offset are limited to 32-bits by hardware,
*/
- and must be located in the first 4GBs (32-bit offset).
void gen8_upload_state_base_address(struct brw_context *brw) { @@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct brw_context *brw) OUT_BATCH(0); OUT_BATCH(mocs_wb << 16); /* Surface state base address: */
- OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
/* Dynamic state base address: */mocs_wb << 4 | 1);
- OUT_RELOC64(brw->batch.bo,
I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->batch.bo,
I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
/* Indirect object base address: MEDIA_OBJECT data */ OUT_BATCH(mocs_wb << 4 | 1); OUT_BATCH(0); /* Instruction base address: shader kernels (incl. SIP) */mocs_wb << 4 | 1);
- OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
/* General state buffer size */ OUT_BATCH(0xfffff001); /* Dynamic state buffer size */mocs_wb << 4 | 1);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index 54081a1..ca90784 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -409,11 +409,23 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain,
uint32_t delta)
uint32_t delta,
bool support_48bit_offset)
{
- int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- int ret;
- /* Not all buffers can be allocated outside the first 4GB, and
- offset must be limited to 32-bits.
- */
- if (support_48bit_offset)
drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- else
drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- assert(ret == 0); (void) ret;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h index ef8a6ff..de5023b 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain,
uint32_t offset);
uint32_t offset,
bool support_48bit_offset);
static inline uint32_t float_as_int(float f) { union { @@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
/* Handle 48-bit address relocations for Gen8+ */ #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta); \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 1); \
+} while (0)
+/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */ +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 0); \
} while (0)
#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
2.5.0
Intel-gfx mailing list Intel-gfx@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Hi,
Thanks for the comments,
On 8/7/2015 11:46 PM, Kristian Høgsberg wrote:
On Fri, Aug 7, 2015 at 2:45 AM, Michel Thierry michel.thierry@intel.com wrote:
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap or Intruction State Heap must be in a 32-bit range (GSH / ISH), because the General State Offset and Instruction State Offset are limited to 32-bits.
This doesn't look right. From the workaround text, it doesn't sound like this is a HW problem, it only affects GMM. In mesa, we don't use "heapless" (which I assume means setting the base to 0 and max range)
It is a hw problem, specifically in the state base address command. General State Base Address and Instruction Base Address shouldn't be > 4GB.
for instructions, dynamic state or surface state. Surface state and dynamic state is always in our batch bo which is limited to 8k. Provided STATE_BASE_ADDRESS works correctly in the HW, the batch bo can be places anywhere in the aperture. Offsets to dynamic or surface state is relative to the beginning of the batch bo and will always be less than 4g. Instructions are in their own bo (brw->cache.bo), but again, we point instruction state base to it and all our shader stage setup code references the instructions relative to the beginning of the instruction bo.
I've seen the issue when the driver allocates mapped objects from bottom to top and the other bo's from top to bottom (gpu hangs). So I say this wa is needed.
-Michel
Kristian
Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and the bo can be in the full address space.
This commit introduces a dependency of libdrm 2.4.63, which introduces the drm_intel_bo_emit_reloc_48bit function.
v2: s/48baddress/48b_address/, Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset is needed (Ben) v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation needs the 32-bit workaround (Chris)
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Chris Wilson chris@chris-wilson.co.uk Signed-off-by: Michel Thierry michel.thierry@intel.com
configure.ac | 2 +- src/mesa/drivers/dri/i965/gen8_misc_state.c | 19 +++++++++++-------- src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++---- src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++-- 4 files changed, 36 insertions(+), 15 deletions(-)
diff --git a/configure.ac b/configure.ac index af61aa2..c92ca44 100644 --- a/configure.ac +++ b/configure.ac @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION]) dnl Versions for external dependencies LIBDRM_REQUIRED=2.4.38 LIBDRM_RADEON_REQUIRED=2.4.56 -LIBDRM_INTEL_REQUIRED=2.4.60 +LIBDRM_INTEL_REQUIRED=2.4.63 LIBDRM_NVVIEUX_REQUIRED=2.4.33 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41" LIBDRM_FREEDRENO_REQUIRED=2.4.57 diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c index b20038e..73eba06 100644 --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c @@ -28,6 +28,10 @@
/**
- Define the base addresses which some state is referenced from.
- Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
- Offset and Instruction State Offset are limited to 32-bits by hardware,
*/ void gen8_upload_state_base_address(struct brw_context *brw) {
- and must be located in the first 4GBs (32-bit offset).
@@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct brw_context *brw) OUT_BATCH(0); OUT_BATCH(mocs_wb << 16); /* Surface state base address: */
- OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
/* Dynamic state base address: */mocs_wb << 4 | 1);
- OUT_RELOC64(brw->batch.bo,
I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->batch.bo,
I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
/* Indirect object base address: MEDIA_OBJECT data */ OUT_BATCH(mocs_wb << 4 | 1); OUT_BATCH(0); /* Instruction base address: shader kernels (incl. SIP) */mocs_wb << 4 | 1);
- OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
/* General state buffer size */ OUT_BATCH(0xfffff001); /* Dynamic state buffer size */mocs_wb << 4 | 1);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index 54081a1..ca90784 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -409,11 +409,23 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain,
uint32_t delta)
uint32_t delta,
{bool support_48bit_offset)
- int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- int ret;
- /* Not all buffers can be allocated outside the first 4GB, and
- offset must be limited to 32-bits.
- */
- if (support_48bit_offset)
drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- else
drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- assert(ret == 0); (void) ret;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h index ef8a6ff..de5023b 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain,
uint32_t offset);
uint32_t offset,
static inline uint32_t float_as_int(float f) { union {bool support_48bit_offset);
@@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
/* Handle 48-bit address relocations for Gen8+ */ #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta); \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 1); \
+} while (0)
+/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */ +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \
intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta, 0); \ } while (0)
#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
-- 2.5.0
Intel-gfx mailing list Intel-gfx@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Mon, Aug 10, 2015 at 2:21 AM, Michel Thierry michel.thierry@intel.com wrote:
Hi,
Thanks for the comments,
On 8/7/2015 11:46 PM, Kristian Høgsberg wrote:
On Fri, Aug 7, 2015 at 2:45 AM, Michel Thierry michel.thierry@intel.com wrote:
Gen8+ supports 48-bit virtual addresses, but some objects must always be allocated inside the 32-bit address range.
In specific, any resource used with flat/heapless (0x00000000-0xfffff000) General State Heap or Intruction State Heap must be in a 32-bit range (GSH / ISH), because the General State Offset and Instruction State Offset are limited to 32-bits.
This doesn't look right. From the workaround text, it doesn't sound like this is a HW problem, it only affects GMM. In mesa, we don't use "heapless" (which I assume means setting the base to 0 and max range)
It is a hw problem, specifically in the state base address command. General State Base Address and Instruction Base Address shouldn't be > 4GB.
for instructions, dynamic state or surface state. Surface state and dynamic state is always in our batch bo which is limited to 8k. Provided STATE_BASE_ADDRESS works correctly in the HW, the batch bo can be places anywhere in the aperture. Offsets to dynamic or surface state is relative to the beginning of the batch bo and will always be less than 4g. Instructions are in their own bo (brw->cache.bo), but again, we point instruction state base to it and all our shader stage setup code references the instructions relative to the beginning of the instruction bo.
I've seen the issue when the driver allocates mapped objects from bottom to top and the other bo's from top to bottom (gpu hangs). So I say this wa is needed.
mesa does use heapless for the scratch buffers, and on BDW+ that's the only address type that's relative to general state base address. So we need the workaround there, whether or not there's a HW limitation/bug.
I don't think there is a HW problem though. The driver I'm working on can use 48 bit ppgtt and Chris Wilson's softpin ioctl for userspace BO placement. I enabled that and made the userspace allocator place all BOs above 8g, and it all stilll worked. In particular, the instructions were above 8g and referenced relative to instruction base address. If the instruction base was limited to 4g, this would have hung the GPU.
The fact that the HW limits the size of the heaps to 4g can be restriction when we move to full 48 ppgtt, but for how mesa uses STATE_BASE_ADDRESS, it only affects general state base address.
Kristiain
-Michel
Kristian
Use drm_intel_bo_emit_reloc_48bit when the 4GB limit is not necessary, and the bo can be in the full address space.
This commit introduces a dependency of libdrm 2.4.63, which introduces the drm_intel_bo_emit_reloc_48bit function.
v2: s/48baddress/48b_address/, Only use in OUT_RELOC64 cases, OUT_RELOC implies a 32-bit address offset is needed (Ben) v3: Added OUT_RELOC64_INSIDE_4G, so it stands out when a 64-bit relocation needs the 32-bit workaround (Chris)
References: http://lists.freedesktop.org/archives/intel-gfx/2015-July/072612.html Cc: Ben Widawsky ben@bwidawsk.net Cc: Chris Wilson chris@chris-wilson.co.uk Signed-off-by: Michel Thierry michel.thierry@intel.com
configure.ac | 2 +- src/mesa/drivers/dri/i965/gen8_misc_state.c | 19 +++++++++++-------- src/mesa/drivers/dri/i965/intel_batchbuffer.c | 20 ++++++++++++++++---- src/mesa/drivers/dri/i965/intel_batchbuffer.h | 10 ++++++++-- 4 files changed, 36 insertions(+), 15 deletions(-)
diff --git a/configure.ac b/configure.ac index af61aa2..c92ca44 100644 --- a/configure.ac +++ b/configure.ac @@ -68,7 +68,7 @@ AC_SUBST([OSMESA_VERSION]) dnl Versions for external dependencies LIBDRM_REQUIRED=2.4.38 LIBDRM_RADEON_REQUIRED=2.4.56 -LIBDRM_INTEL_REQUIRED=2.4.60 +LIBDRM_INTEL_REQUIRED=2.4.63 LIBDRM_NVVIEUX_REQUIRED=2.4.33 LIBDRM_NOUVEAU_REQUIRED="2.4.33 libdrm >= 2.4.41" LIBDRM_FREEDRENO_REQUIRED=2.4.57 diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c index b20038e..73eba06 100644 --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c @@ -28,6 +28,10 @@
/**
- Define the base addresses which some state is referenced from.
- Use OUT_RELOC64_INSIDE_4G instead of OUT_RELOC64, the General State
- Offset and Instruction State Offset are limited to 32-bits by
hardware,
*/ void gen8_upload_state_base_address(struct brw_context *brw) {
- and must be located in the first 4GBs (32-bit offset).
@@ -41,19 +45,18 @@ void gen8_upload_state_base_address(struct brw_context *brw) OUT_BATCH(0); OUT_BATCH(mocs_wb << 16); /* Surface state base address: */
- OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
/* Dynamic state base address: */mocs_wb << 4 | 1);
- OUT_RELOC64(brw->batch.bo,
I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->batch.bo,
I915_GEM_DOMAIN_RENDER |
I915_GEM_DOMAIN_INSTRUCTION, 0,
/* Indirect object base address: MEDIA_OBJECT data */ OUT_BATCH(mocs_wb << 4 | 1); OUT_BATCH(0); /* Instruction base address: shader kernels (incl. SIP) */mocs_wb << 4 | 1);
- OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
mocs_wb << 4 | 1);
- OUT_RELOC64_INSIDE_4G(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
/* General state buffer size */ OUT_BATCH(0xfffff001); /* Dynamic state buffer size */mocs_wb << 4 | 1);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index 54081a1..ca90784 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -409,11 +409,23 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain,
uint32_t delta)
uint32_t delta,
{bool support_48bit_offset)
- int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- int ret;
- /* Not all buffers can be allocated outside the first 4GB, and
- offset must be limited to 32-bits.
- */
- if (support_48bit_offset)
drm_intel_bo_emit_reloc_48bit(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- else
drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
buffer, delta,
read_domains, write_domain);
- assert(ret == 0); (void) ret;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h index ef8a6ff..de5023b 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h @@ -62,7 +62,8 @@ bool intel_batchbuffer_emit_reloc64(struct brw_context *brw, drm_intel_bo *buffer, uint32_t read_domains, uint32_t write_domain,
uint32_t offset);
uint32_t offset,
static inline uint32_t float_as_int(float f) { union {bool support_48bit_offset);
@@ -167,7 +168,12 @@ intel_batchbuffer_advance(struct brw_context *brw)
/* Handle 48-bit address relocations for Gen8+ */ #define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain,
delta); \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain,
delta, 1); \ +} while (0)
+/* Handle 48-bit address relocations for Gen8+, requesting 32-bit offset */ +#define OUT_RELOC64_INSIDE_4G(buf, read_domains, write_domain, delta) do { \
- intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain,
delta, 0); \ } while (0)
#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
2.5.0
Intel-gfx mailing list Intel-gfx@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/intel-gfx
dri-devel@lists.freedesktop.org