Flat-CCS eviction enhancements
v2: Correcting the memory residency requirement for flat-ccs capability [Thomas]
Ramalingam C (4): drm/i915/gt: GEM_BUG_ON unexpected NULL at scatterlist walking drm/i915/gt: optimize the ccs_sz calculation per chunk drm/i915/gt: Document the eviction of the Flat-CCS objects uapi/drm/i915: Document memory residency and Flat-CCS capability of obj
drivers/gpu/drm/i915/gt/intel_migrate.c | 65 ++++++++++++------------- include/uapi/drm/i915_drm.h | 18 +++++++ 2 files changed, 50 insertions(+), 33 deletions(-)
While locating the start of the ccs scatterlist within the smem scatterlist, the smem scatterlist has to be the size of the lmem object plus the corresponding ccs data size. Report a bug if the scatterlist terminates before that length.
Signed-off-by: Ramalingam C ramalingam.c@intel.com --- drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 9d552f30b627..29d761da02c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy) bytes_to_cpy -= len;
it->sg = __sg_next(it->sg); + + /* + * scatterlist supposed to be the size of + * bytes_to_cpy + GET_CCS_BYTES(bytes_to_copy). + */ + GEM_BUG_ON(!it->sg); it->dma = sg_dma_address(it->sg); it->max = it->dma + sg_dma_len(it->sg); } while (bytes_to_cpy);
On 25/04/2022 17:24, Ramalingam C wrote:
While locating the start of the ccs scatterlist within the smem scatterlist, the smem scatterlist has to be the size of the lmem object plus the corresponding ccs data size. Report a bug if the scatterlist terminates before that length.
Signed-off-by: Ramalingam C ramalingam.c@intel.com
drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 9d552f30b627..29d761da02c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy) bytes_to_cpy -= len;
it->sg = __sg_next(it->sg);
/*
* The scatterlist is supposed to be the size of
* bytes_to_cpy + GET_CCS_BYTES(bytes_to_cpy).
*/
GEM_BUG_ON(!it->sg);
It will crash and burn anyway, with the below NULL deref. Not sure if BUG_ON() is really much better, but I guess with the additional comment, Reviewed-by: Matthew Auld matthew.auld@intel.com
it->dma = sg_dma_address(it->sg); it->max = it->dma + sg_dma_len(it->sg);
} while (bytes_to_cpy);
Calculate the ccs_sz that needs to be emitted based on the src and dst pages emitted per chunk. And handle the return value of emit_pte for the ccs pages.
Signed-off-by: Ramalingam C ramalingam.c@intel.com --- drivers/gpu/drm/i915/gt/intel_migrate.c | 36 +++++++++---------------- 1 file changed, 12 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 29d761da02c4..463a6a14b5f9 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -647,17 +647,9 @@ static int scatter_list_length(struct scatterlist *sg)
static void calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem, - int *src_sz, int *ccs_sz, u32 bytes_to_cpy, - u32 ccs_bytes_to_cpy) + int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy) { if (ccs_bytes_to_cpy) { - /* - * We can only copy the ccs data corresponding to - * the CHUNK_SZ of lmem which is - * GET_CCS_BYTES(i915, CHUNK_SZ)) - */ - *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ)); - if (!src_is_lmem) /* * When CHUNK_SZ is passed all the pages upto CHUNK_SZ @@ -713,10 +705,10 @@ intel_context_migrate_copy(struct intel_context *ce, struct drm_i915_private *i915 = ce->engine->i915; u32 ccs_bytes_to_cpy = 0, bytes_to_cpy; enum i915_cache_level ccs_cache_level; - int src_sz, dst_sz, ccs_sz; u32 src_offset, dst_offset; u8 src_access, dst_access; struct i915_request *rq; + int src_sz, dst_sz; bool ccs_is_src; int err;
@@ -770,7 +762,7 @@ intel_context_migrate_copy(struct intel_context *ce, }
do { - int len; + int len, ccs_sz;
rq = i915_request_create(ce); if (IS_ERR(rq)) { @@ -797,7 +789,7 @@ intel_context_migrate_copy(struct intel_context *ce, if (err) goto out_rq;
- calculate_chunk_sz(i915, src_is_lmem, &src_sz, &ccs_sz, + calculate_chunk_sz(i915, src_is_lmem, &src_sz, bytes_to_cpy, ccs_bytes_to_cpy);
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, @@ -835,33 +827,29 @@ intel_context_migrate_copy(struct intel_context *ce, if (err) goto out_rq;
+ ccs_sz = GET_CCS_BYTES(i915, len); err = emit_pte(rq, &it_ccs, ccs_cache_level, false, ccs_is_src ? src_offset : dst_offset, ccs_sz); + if (err < 0) + goto out_rq; + if (err < ccs_sz) { + err = -EINVAL; + goto out_rq; + }
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); if (err) goto out_rq;
- /* - * Using max of src_sz and dst_sz, as we need to - * pass the lmem size corresponding to the ccs - * blocks we need to handle. - */ - ccs_sz = max_t(int, ccs_is_src ? ccs_sz : src_sz, - ccs_is_src ? dst_sz : ccs_sz); - err = emit_copy_ccs(rq, dst_offset, dst_access, - src_offset, src_access, ccs_sz); + src_offset, src_access, len); if (err) goto out_rq;
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); if (err) goto out_rq; - - /* Converting back to ccs bytes */ - ccs_sz = GET_CCS_BYTES(rq->engine->i915, ccs_sz); ccs_bytes_to_cpy -= ccs_sz; }
On 25/04/2022 17:24, Ramalingam C wrote:
Calculate the ccs_sz that needs to be emitted based on the src and dst pages emitted per chunk. And handle the return value of emit_pte for the ccs pages.
Signed-off-by: Ramalingam C ramalingam.c@intel.com
drivers/gpu/drm/i915/gt/intel_migrate.c | 36 +++++++++---------------- 1 file changed, 12 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 29d761da02c4..463a6a14b5f9 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -647,17 +647,9 @@ static int scatter_list_length(struct scatterlist *sg)
static void calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
u32 ccs_bytes_to_cpy)
int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy) { if (ccs_bytes_to_cpy) {
/*
* We can only copy the ccs data corresponding to
* the CHUNK_SZ of lmem which is
* GET_CCS_BYTES(i915, CHUNK_SZ))
*/
*ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ));
- if (!src_is_lmem) /* * When CHUNK_SZ is passed all the pages upto CHUNK_SZ
@@ -713,10 +705,10 @@ intel_context_migrate_copy(struct intel_context *ce, struct drm_i915_private *i915 = ce->engine->i915; u32 ccs_bytes_to_cpy = 0, bytes_to_cpy; enum i915_cache_level ccs_cache_level;
- int src_sz, dst_sz, ccs_sz; u32 src_offset, dst_offset; u8 src_access, dst_access; struct i915_request *rq;
- int src_sz, dst_sz; bool ccs_is_src; int err;
@@ -770,7 +762,7 @@ intel_context_migrate_copy(struct intel_context *ce, }
do {
int len;
int len, ccs_sz;
This could be moved into the reduced scope below.
Reviewed-by: Matthew Auld matthew.auld@intel.com
rq = i915_request_create(ce); if (IS_ERR(rq)) {
@@ -797,7 +789,7 @@ intel_context_migrate_copy(struct intel_context *ce, if (err) goto out_rq;
calculate_chunk_sz(i915, src_is_lmem, &src_sz, &ccs_sz,
calculate_chunk_sz(i915, src_is_lmem, &src_sz, bytes_to_cpy, ccs_bytes_to_cpy);
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
@@ -835,33 +827,29 @@ intel_context_migrate_copy(struct intel_context *ce, if (err) goto out_rq;
ccs_sz = GET_CCS_BYTES(i915, len); err = emit_pte(rq, &it_ccs, ccs_cache_level, false, ccs_is_src ? src_offset : dst_offset, ccs_sz);
if (err < 0)
goto out_rq;
if (err < ccs_sz) {
err = -EINVAL;
goto out_rq;
} err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); if (err) goto out_rq;
/*
* Using max of src_sz and dst_sz, as we need to
* pass the lmem size corresponding to the ccs
* blocks we need to handle.
*/
ccs_sz = max_t(int, ccs_is_src ? ccs_sz : src_sz,
ccs_is_src ? dst_sz : ccs_sz);
err = emit_copy_ccs(rq, dst_offset, dst_access,
src_offset, src_access, ccs_sz);
src_offset, src_access, len); if (err) goto out_rq; err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); if (err) goto out_rq;
/* Converting back to ccs bytes */
}ccs_sz = GET_CCS_BYTES(rq->engine->i915, ccs_sz); ccs_bytes_to_cpy -= ccs_sz;
Capture the eviction details for Flat-CCS capable, lmem objects.
v2: Fix the Flat-CCS capability of an lmem obj with smem residency possibility [Thomas]
Signed-off-by: Ramalingam C ramalingam.c@intel.com cc: Thomas Hellstrom thomas.hellstrom@linux.intel.com cc: Matthew Auld matthew.auld@intel.com --- drivers/gpu/drm/i915/gt/intel_migrate.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 463a6a14b5f9..930e0fd9795f 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -485,16 +485,21 @@ static bool wa_1209644611_applies(int ver, u32 size) * And CCS data can be copied in and out of CCS region through * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly. * - * When we exhaust the lmem, if the object's placements support smem, then we can - * directly decompress the compressed lmem object into smem and start using it - * from smem itself. + * I915 supports Flat-CCS on lmem only objects. When an object has smem in + * its preference list, on memory pressure, i915 needs to migrate the lmem + * content into smem. If the lmem object is Flat-CCS compressed by userspace, + * then i915 needs to decompress it. But I915 lacks the required information + * for such decompression. Hence I915 supports Flat-CCS only on lmem only objects. - * But when we need to swapout the compressed lmem object into a smem region - * though objects' placement doesn't support smem, then we copy the lmem content - * as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT). - * When the object is referred, lmem content will be swaped in along with - * restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at corresponding - * location. + * When we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can + * be temporarily evicted to smem, along with the auxiliary CCS state, where + * it can be potentially swapped-out at a later point, if required. + * If userspace later touches the evicted pages, then we always move + * the backing memory back to lmem, which includes restoring the saved CCS state, + * and potentially performing any required swap-in.
+ * + * For the migration of the lmem objects with smem in placement list, such as + * {lmem, smem}, objects are treated as non Flat-CCS capable objects. */
static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
On 25/04/2022 17:24, Ramalingam C wrote:
Capture the eviction details for Flat-CCS capable, lmem objects.
v2: Fix the Flat-CCS capability of an lmem obj with smem residency possibility [Thomas]
Signed-off-by: Ramalingam C ramalingam.c@intel.com cc: Thomas Hellstrom thomas.hellstrom@linux.intel.com cc: Matthew Auld matthew.auld@intel.com
drivers/gpu/drm/i915/gt/intel_migrate.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 463a6a14b5f9..930e0fd9795f 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -485,16 +485,21 @@ static bool wa_1209644611_applies(int ver, u32 size)
- And CCS data can be copied in and out of CCS region through
- XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
- When we exhaust the lmem, if the object's placements support smem, then we can
- directly decompress the compressed lmem object into smem and start using it
- from smem itself.
- I915 supports Flat-CCS on lmem only objects. When an objects has the smem in
"When an object has smem in"
- its preference list, on memory pressure, i915 needs to migarte the lmem
"migrate"
- content into smem. If the lmem object is Flat-CCS compressed by userspace,
- then i915 needs to decompress it. But I915 lack the required information
- for such decompression. Hence I915 supports Flat-CCS only on lmem only objects.
- But when we need to swapout the compressed lmem object into a smem region
- though objects' placement doesn't support smem, then we copy the lmem content
- as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT).
- When the object is referred, lmem content will be swaped in along with
- restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at corresponding
- location.
- when we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can
"When"
Otherwise, Reviewed-by: Matthew Auld matthew.auld@intel.com
- be temporarily evicted to smem, along with the auxiliary CCS state, where
- it can be potentially swapped-out at a later point, if required.
- If userspace later touches the evicted pages, then we always move
- the backing memory back to lmem, which includes restoring the saved CCS state,
- and potentially performing any required swap-in.
- For the migration of the lmem objects with smem in placement list, such as
- {lmem, smem}, objects are treated as non Flat-CCS capable objects.
*/
static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
Capture the impact of an object's memory region preference list on its memory residency and on the Flat-CCS capability of the object.
v2: Fix the Flat-CCS capability of an obj with {lmem, smem} preference list [Thomas]
Signed-off-by: Ramalingam C ramalingam.c@intel.com cc: Matthew Auld matthew.auld@intel.com cc: Thomas Hellstrom thomas.hellstrom@linux.intel.com --- include/uapi/drm/i915_drm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 35ca528803fd..ad191ed6547c 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -3393,6 +3393,24 @@ struct drm_i915_gem_create_ext { * At which point we get the object handle in &drm_i915_gem_create_ext.handle, * along with the final object size in &drm_i915_gem_create_ext.size, which * should account for any rounding up, if required. + * + * Objects with multiple memory regions in the preference list will be backed + * by one of the memory regions mentioned in the preference list. Though I915 + * tries to honour the order of the memory regions in the preference list, + * based on the memory pressure of the regions, objects' backing region + * will be selected. + * + * Userspace has no means of knowing the backing region for such objects. + * + * On Flat-CCS capable HW, compression is supported for the objects residing + * in I915_MEMORY_CLASS_DEVICE. When such objects (compressed) have other + * memory class in preference list and are migrated (by I915, due to memory + * constraint) to the non I915_MEMORY_CLASS_DEVICE region, then I915 needs to + * decompress the content. But I915 doesn't have the required information to + * decompress the userspace compressed objects. + * + * So I915 supports Flat-CCS, only on the objects which can reside only on + * I915_MEMORY_CLASS_DEVICE regions. */ struct drm_i915_gem_create_ext_memory_regions { /** @base: Extension link. See struct i915_user_extension. */
On 25/04/2022 17:24, Ramalingam C wrote:
Capture the impact of an object's memory region preference list on its memory residency and on the Flat-CCS capability of the object.
v2: Fix the Flat-CCS capability of an obj with {lmem, smem} preference list [Thomas]
Signed-off-by: Ramalingam C ramalingam.c@intel.com cc: Matthew Auld matthew.auld@intel.com cc: Thomas Hellstrom thomas.hellstrom@linux.intel.com
include/uapi/drm/i915_drm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 35ca528803fd..ad191ed6547c 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -3393,6 +3393,24 @@ struct drm_i915_gem_create_ext {
- At which point we get the object handle in &drm_i915_gem_create_ext.handle,
- along with the final object size in &drm_i915_gem_create_ext.size, which
- should account for any rounding up, if required.
- Objects with multiple memory regions in the preference list will be backed
- by one of the memory regions mentioned in the preference list. Though I915
- tries to honour the order of the memory regions in the preference list,
- based on the memory pressure of the regions, objects' backing region
- will be selected.
- Userspace has no means of knowing the backing region for such objects.
"Note that userspace has no means of knowing the current backing region for objects where @num_regions is larger than one. The kernel will only ensure that the priority order of the @regions array is honoured, either when initially placing the object, or when moving memory around due to memory pressure."
- On Flat-CCS capable HW, compression is supported for the objects residing
- in I915_MEMORY_CLASS_DEVICE. When such objects (compressed) has other
- memory class in preference list and migrated (by I915, due to memory
- constrain) to the non I915_MEMORY_CLASS_DEVICE region, then I915 needs to
- decompress the content. But I915 dont have the required information to
"doesn't", also prefer @regions etc instead of "preference list"
Anyway, Reviewed-by: Matthew Auld matthew.auld@intel.com
- decompress the userspace compressed objects.
- So I915 supports Flat-CCS, only on the objects which can reside only on
struct drm_i915_gem_create_ext_memory_regions { /** @base: Extension link. See struct i915_user_extension. */
- I915_MEMORY_CLASS_DEVICE regions. > */
dri-devel@lists.freedesktop.org