Flat-CCS eviction enhancements.
Ramalingam C (4):
  drm/i915/gt: GEM_BUG_ON unexpected NULL at scatterlist walking
  drm/i915/gt: optimize the ccs_sz calculation per chunk
  drm/i915/gt: Extend doc on Flat-CCS obj eviction
  uapi/drm/i915: Update the placement list impact on obj residency
drivers/gpu/drm/i915/gt/intel_migrate.c | 78 ++++++++++++++----------- include/uapi/drm/i915_drm.h | 14 +++++ 2 files changed, 59 insertions(+), 33 deletions(-)
While locating the start of the ccs scatterlist within the smem scatterlist, the smem scatterlist has to be as long as the lmem obj size + the corresponding ccs data size. Report a bug if the scatterlist terminates before that length.
Signed-off-by: Ramalingam C ramalingam.c@intel.com --- drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 9d552f30b627..29d761da02c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy) bytes_to_cpy -= len;
it->sg = __sg_next(it->sg); + + /* + * scatterlist supposed to be the size of + * bytes_to_cpy + GET_CCS_BYTES(bytes_to_copy). + */ + GEM_BUG_ON(!it->sg); it->dma = sg_dma_address(it->sg); it->max = it->dma + sg_dma_len(it->sg); } while (bytes_to_cpy);
On Thu, 2022-04-21 at 17:08 +0530, Ramalingam C wrote:
While locating the start of ccs scatterlist in smem scatterlist, that has to be the size of lmem obj size + corresponding ccs data size. Report bug if scatterlist terminate before that length.
Signed-off-by: Ramalingam C ramalingam.c@intel.com
drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 9d552f30b627..29d761da02c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy) bytes_to_cpy -= len;
it->sg = __sg_next(it->sg);
If bytes_to_cpy == 0 here, couldn't it->sg be NULL then?
+ /* + * scatterlist supposed to be the size of + * bytes_to_cpy + GET_CCS_BYTES(bytes_to_copy). + */ + GEM_BUG_ON(!it->sg); it->dma = sg_dma_address(it->sg); it->max = it->dma + sg_dma_len(it->sg); } while (bytes_to_cpy);
/Thomas
On 2022-04-21 at 18:57:59 +0530, Hellstrom, Thomas wrote:
On Thu, 2022-04-21 at 17:08 +0530, Ramalingam C wrote:
While locating the start of ccs scatterlist in smem scatterlist, that has to be the size of lmem obj size + corresponding ccs data size. Report bug if scatterlist terminate before that length.
Signed-off-by: Ramalingam C ramalingam.c@intel.com
drivers/gpu/drm/i915/gt/intel_migrate.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 9d552f30b627..29d761da02c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -687,6 +687,12 @@ static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy) bytes_to_cpy -= len;
it->sg = __sg_next(it->sg);
If bytes_to_cpy == 0 here, couldn't it->sg be NULL then?
Hi,
bytes_to_cpy is the lmem size, and the scatterlist has a length of bytes_to_cpy + GET_CCS_BYTES(bytes_to_cpy). So this should not be NULL.
When bytes_to_cpy reduces to zero, we will be at the start of the scatterlist for ccs.
Ram.
/*
* scatterlist supposed to be the size of
* bytes_to_cpy + GET_CCS_BYTES(bytes_to_copy).
*/
GEM_BUG_ON(!it->sg); it->dma = sg_dma_address(it->sg); it->max = it->dma + sg_dma_len(it->sg); } while (bytes_to_cpy);
/Thomas
Calculate the ccs_sz that needs to be emitted based on the src and dst pages emitted per chunk. And handle the return value of emit_pte for the ccs pages.
Signed-off-by: Ramalingam C ramalingam.c@intel.com --- drivers/gpu/drm/i915/gt/intel_migrate.c | 36 +++++++++---------------- 1 file changed, 12 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 29d761da02c4..463a6a14b5f9 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -647,17 +647,9 @@ static int scatter_list_length(struct scatterlist *sg)
static void calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem, - int *src_sz, int *ccs_sz, u32 bytes_to_cpy, - u32 ccs_bytes_to_cpy) + int *src_sz, u32 bytes_to_cpy, u32 ccs_bytes_to_cpy) { if (ccs_bytes_to_cpy) { - /* - * We can only copy the ccs data corresponding to - * the CHUNK_SZ of lmem which is - * GET_CCS_BYTES(i915, CHUNK_SZ)) - */ - *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ)); - if (!src_is_lmem) /* * When CHUNK_SZ is passed all the pages upto CHUNK_SZ @@ -713,10 +705,10 @@ intel_context_migrate_copy(struct intel_context *ce, struct drm_i915_private *i915 = ce->engine->i915; u32 ccs_bytes_to_cpy = 0, bytes_to_cpy; enum i915_cache_level ccs_cache_level; - int src_sz, dst_sz, ccs_sz; u32 src_offset, dst_offset; u8 src_access, dst_access; struct i915_request *rq; + int src_sz, dst_sz; bool ccs_is_src; int err;
@@ -770,7 +762,7 @@ intel_context_migrate_copy(struct intel_context *ce, }
do { - int len; + int len, ccs_sz;
rq = i915_request_create(ce); if (IS_ERR(rq)) { @@ -797,7 +789,7 @@ intel_context_migrate_copy(struct intel_context *ce, if (err) goto out_rq;
- calculate_chunk_sz(i915, src_is_lmem, &src_sz, &ccs_sz, + calculate_chunk_sz(i915, src_is_lmem, &src_sz, bytes_to_cpy, ccs_bytes_to_cpy);
len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem, @@ -835,33 +827,29 @@ intel_context_migrate_copy(struct intel_context *ce, if (err) goto out_rq;
+ ccs_sz = GET_CCS_BYTES(i915, len); err = emit_pte(rq, &it_ccs, ccs_cache_level, false, ccs_is_src ? src_offset : dst_offset, ccs_sz); + if (err < 0) + goto out_rq; + if (err < ccs_sz) { + err = -EINVAL; + goto out_rq; + }
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); if (err) goto out_rq;
- /* - * Using max of src_sz and dst_sz, as we need to - * pass the lmem size corresponding to the ccs - * blocks we need to handle. - */ - ccs_sz = max_t(int, ccs_is_src ? ccs_sz : src_sz, - ccs_is_src ? dst_sz : ccs_sz); - err = emit_copy_ccs(rq, dst_offset, dst_access, - src_offset, src_access, ccs_sz); + src_offset, src_access, len); if (err) goto out_rq;
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); if (err) goto out_rq; - - /* Converting back to ccs bytes */ - ccs_sz = GET_CCS_BYTES(rq->engine->i915, ccs_sz); ccs_bytes_to_cpy -= ccs_sz; }
Capture the eviction details for Flat-CCS capable lmem-only objects and lmem objects with smem residency. This also captures the impact of eviction on an object's memory residency and Flat-CCS compression state.
Signed-off-by: Ramalingam C ramalingam.c@intel.com --- drivers/gpu/drm/i915/gt/intel_migrate.c | 36 ++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 463a6a14b5f9..9d0d18950e76 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -485,16 +485,34 @@ static bool wa_1209644611_applies(int ver, u32 size) * And CCS data can be copied in and out of CCS region through * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly. * - * When we exhaust the lmem, if the object's placements support smem, then we can - * directly decompress the compressed lmem object into smem and start using it - * from smem itself. + * when we exhaust the lmem, we need to handle two types of flat-ccs capable + * objects for its eviction. + * 1) lmem only objects + * 2) lmem objects with smem residency option * - * But when we need to swapout the compressed lmem object into a smem region - * though objects' placement doesn't support smem, then we copy the lmem content - * as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT). - * When the object is referred, lmem content will be swaped in along with - * restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at corresponding - * location. + * 1) lmem only objects: + * + * lmem backing memory can be temporarily evicted to smem, along with the + * auxiliary CCS state, where it can be potentially swapped-out at a later point, + * if required. If userspace later touches the evicted pages, then we always move + * the backing memory back to lmem, which includes restoring the saved CCS state, + * and potentially performing any required swap-in. + * + * In this scenario, objects' backing memory class and Flat-CCS state doesn't + * change. + * + * 2) lmem objects with smem residency option + * + * Lmem object with smem region in it's placement list, will be migrated into + * smem by decompressing the content. I915 doesn't handle this kind of + * migration for Flat-CCS compressed objects yet. 
+ * + * In this scenario, objects' backing memory class and Flat-CCS state changed, + * and userspace is not aware of it. + * + * In summary, when a userspace wants to be sure about the objects memory + * residency and flat-ccs compression state, then placement list can't have + * the lmem and smem together. Instead, object has to be lmem resident only. */
static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
On Thu, 2022-04-21 at 17:08 +0530, Ramalingam C wrote:
Capture the eviction details for Flat-CCS capable lmem only objects and lmem objects with smem residency. This also captures the impact of eviction on object's memory residency and Flat-CCS compression state.
Signed-off-by: Ramalingam C ramalingam.c@intel.com
drivers/gpu/drm/i915/gt/intel_migrate.c | 36 ++++++++++++++++++-----
1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 463a6a14b5f9..9d0d18950e76 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -485,16 +485,34 @@ static bool wa_1209644611_applies(int ver, u32 size) * And CCS data can be copied in and out of CCS region through * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly. *
- When we exhaust the lmem, if the object's placements support
smem, then we can
- directly decompress the compressed lmem object into smem and
start using it
- from smem itself.
- when we exhaust the lmem, we need to handle two types of flat-ccs
capable
- objects for its eviction.
- * 1) lmem only objects
- * 2) lmem objects with smem residency option
*
- But when we need to swapout the compressed lmem object into a
smem region
- though objects' placement doesn't support smem, then we copy the
lmem content
- as it is into smem region along with ccs data (using
XY_CTRL_SURF_COPY_BLT).
- When the object is referred, lmem content will be swaped in along
with
- restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at
corresponding
- location.
- lmem only objects:
- lmem backing memory can be temporarily evicted to smem, along
with the
- auxiliary CCS state, where it can be potentially swapped-out at a
later point,
- if required. If userspace later touches the evicted pages, then
we always move
- the backing memory back to lmem, which includes restoring the
saved CCS state,
- and potentially performing any required swap-in.
- In this scenario, objects' backing memory class and Flat-CCS
state doesn't
- change.
- lmem objects with smem residency option
- Lmem object with smem region in it's placement list, will be
migrated into
- smem by decompressing the content. I915 doesn't handle this kind
of
- migration for Flat-CCS compressed objects yet.
- In this scenario, objects' backing memory class and Flat-CCS
state changed,
- and userspace is not aware of it.
- In summary, when a userspace wants to be sure about the objects
memory
- residency and flat-ccs compression state, then placement list
can't have
- the lmem and smem together. Instead, object has to be lmem
resident only.
For 2) I was under the impression that with flat CCS, these objects need to be always uncompressed, since the kernel doesn't have the needed information to decompress / compress. Or has this been changed recently?
/Thomas
*/ static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
On 2022-04-21 at 19:07:29 +0530, Hellstrom, Thomas wrote:
On Thu, 2022-04-21 at 17:08 +0530, Ramalingam C wrote:
Capture the eviction details for Flat-CCS capable lmem only objects and lmem objects with smem residency. This also captures the impact of eviction on object's memory residency and Flat-CCS compression state.
Signed-off-by: Ramalingam C ramalingam.c@intel.com
drivers/gpu/drm/i915/gt/intel_migrate.c | 36 ++++++++++++++++++-----
1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index 463a6a14b5f9..9d0d18950e76 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -485,16 +485,34 @@ static bool wa_1209644611_applies(int ver, u32 size)
- And CCS data can be copied in and out of CCS region through
- XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
- When we exhaust the lmem, if the object's placements support
smem, then we can
- directly decompress the compressed lmem object into smem and
start using it
- from smem itself.
- when we exhaust the lmem, we need to handle two types of flat-ccs
capable
- objects for its eviction.
- lmem only objects
- lmem objects with smem residency option
- But when we need to swapout the compressed lmem object into a
smem region
- though objects' placement doesn't support smem, then we copy the
lmem content
- as it is into smem region along with ccs data (using
XY_CTRL_SURF_COPY_BLT).
- When the object is referred, lmem content will be swaped in along
with
- restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at
corresponding
- location.
- lmem only objects:
- lmem backing memory can be temporarily evicted to smem, along
with the
- auxiliary CCS state, where it can be potentially swapped-out at a
later point,
- if required. If userspace later touches the evicted pages, then
we always move
- the backing memory back to lmem, which includes restoring the
saved CCS state,
- and potentially performing any required swap-in.
- In this scenario, objects' backing memory class and Flat-CCS
state doesn't
- change.
- lmem objects with smem residency option
- Lmem object with smem region in it's placement list, will be
migrated into
- smem by decompressing the content. I915 doesn't handle this kind
of
- migration for Flat-CCS compressed objects yet.
- In this scenario, objects' backing memory class and Flat-CCS
state changed,
- and userspace is not aware of it.
- In summary, when a userspace wants to be sure about the objects
memory
- residency and flat-ccs compression state, then placement list
can't have
- the lmem and smem together. Instead, object has to be lmem
resident only.
For 2) I was under the impression that with flat CCS, these objects need to be always uncompressed, since the kernel doesn't have the needed information to decompress / compress. Or has this been changed recently?
Sorry, I overlooked that the kernel lacks the inputs required for decompression. So yes, we can't support compression on lmem objects with {lmem, smem} as placement preferences. I will update the documentation accordingly.
Ram.
/Thomas
*/
static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
An object created with a list of memory classes as placement preferences can be backed by any memory class in that list, as per the kernel's migration policy under memory constraints. Userspace won't be notified of the memory residency change in this scenario.
Also, Flat-CCS compression is supported only on objects of I915_MEMORY_CLASS_DEVICE. When a Flat-CCS compressed object migrates out of I915_MEMORY_CLASS_DEVICE due to memory constraints, its content will be decompressed without notifying userspace.
Record these details in Kernel documentation.
Signed-off-by: Ramalingam C ramalingam.c@intel.com --- include/uapi/drm/i915_drm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 35ca528803fd..8b25dd6a157a 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -3393,6 +3393,20 @@ struct drm_i915_gem_create_ext { * At which point we get the object handle in &drm_i915_gem_create_ext.handle, * along with the final object size in &drm_i915_gem_create_ext.size, which * should account for any rounding up, if required. + * + * If an object is created with list of memory classes as their placement + * preference, kernel could use one of the memory class as the backing storage + * based on the memory availability. At memory pressure kernel could migrate the + * objects content from one memory class to another, given in the placement list. + * + * With placement preference list, userpace can't be sure about the object's memory + * residence. + * + * Flat-CCS compression is supported only for objects of I915_MEMORY_CLASS_DEVICE. + * If the object has other placement preferences, and if the content is + * migrated (by kernel due to memory constrain) to a memory class which is other + * than I915_MEMORY_CLASS_DEVICE, object content will be decompressed by kernel. + * Userpace will be ignorant of this Flat-CCS state change. */ struct drm_i915_gem_create_ext_memory_regions { /** @base: Extension link. See struct i915_user_extension. */
On Thu, 2022-04-21 at 17:08 +0530, Ramalingam C wrote:
Object created with list of memory classes as placement preferences, can be backed with any memory class of the list as per kernel's migration policy for the memory contrain situation. Userspace won't be notified of the memory residency change at this scenario.
And also Flat-CCS compression is supported only on objects of I915_MEMORY_CLASS_DEVICE. When the Flat-CCS compressed objects migrates out of I915_MEMORY_CLASS_DEVICE, due to memory constrain, content will be decompressed without notifying the userpsace.
userspace
Record these details in Kernel documentation.
Signed-off-by: Ramalingam C ramalingam.c@intel.com
include/uapi/drm/i915_drm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 35ca528803fd..8b25dd6a157a 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -3393,6 +3393,20 @@ struct drm_i915_gem_create_ext { * At which point we get the object handle in &drm_i915_gem_create_ext.handle, * along with the final object size in &drm_i915_gem_create_ext.size, which * should account for any rounding up, if required.
- If an object is created with list of memory classes as their
placement
- preference, kernel could use one of the memory class as the
backing storage
- based on the memory availability. At memory pressure kernel could
migrate the
- objects content from one memory class to another, given in the
placement list.
- With placement preference list, userpace can't be sure about the
object's memory
- residence.
- Flat-CCS compression is supported only for objects of
I915_MEMORY_CLASS_DEVICE.
- If the object has other placement preferences, and if the content
is
- migrated (by kernel due to memory constrain) to a memory class
which is other
- than I915_MEMORY_CLASS_DEVICE, object content will be
decompressed by kernel.
- Userpace will be ignorant of this Flat-CCS state change.
Same question here as for previous commit.
*/ struct drm_i915_gem_create_ext_memory_regions { /** @base: Extension link. See struct i915_user_extension. */
dri-devel@lists.freedesktop.org