On 2022-03-21 at 16:09:08 +0530, Hellstrom, Thomas wrote:
On Sun, 2022-03-20 at 02:12 +0530, Ramalingam C wrote:
While clearing the Flat-CCS capable lmem object, we need to clear the CCS meta data corresponding to the memory.
As part of live_migrate_clear add check for the ccs meta data clear for the Flat-CCS capable lmem object.
Signed-off-by: Ramalingam C ramalingam.c@intel.com
drivers/gpu/drm/i915/gt/intel_migrate.c | 32 +++ drivers/gpu/drm/i915/gt/selftest_migrate.c | 274 ++++++++++++++++++- -- 2 files changed, 278 insertions(+), 28 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c index c1db8daf994a..bbfea570c239 100644 --- a/drivers/gpu/drm/i915/gt/intel_migrate.c +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c @@ -572,6 +572,38 @@ static u32 *_i915_ctrl_surf_copy_blt(u32 *cmd, u64 src_addr, u64 dst_addr, return cmd; }
+static int emit_copy_ccs(struct i915_request *rq,
u32 dst_offset, u8 dst_access,
u32 src_offset, u8 src_access, int size)
+{
struct drm_i915_private *i915 = rq->engine->i915;
int mocs = rq->engine->gt->mocs.uc_index << 1;
u32 num_ccs_blks, ccs_ring_size;
u32 *cs;
ccs_ring_size = calc_ctrl_surf_instr_size(i915, size);
WARN_ON(!ccs_ring_size);
cs = intel_ring_begin(rq, round_up(ccs_ring_size, 2));
if (IS_ERR(cs))
return PTR_ERR(cs);
num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
NUM_CCS_BYTES_PER_BLOCK);
cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
cs = _i915_ctrl_surf_copy_blt(cs, src_offset, dst_offset,
src_access, dst_access,
mocs, mocs, num_ccs_blks);
cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
if (ccs_ring_size & 1)
*cs++ = MI_NOOP;
intel_ring_advance(rq, cs);
return 0;
+}
This would be an unused function if selftests are not configured, right?
No Thomas. This is reused between selftest and eviction flow. in next version i am reusing it for evict_clear too.
static int emit_copy(struct i915_request *rq, u32 dst_offset, u32 src_offset, int size) { diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c index b5da8b8cd039..e32cc994f4a2 100644 --- a/drivers/gpu/drm/i915/gt/selftest_migrate.c +++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c @@ -132,6 +132,126 @@ static int copy(struct intel_migrate *migrate, return err; }
+static int intel_context_copy_ccs(struct intel_context *ce,
const struct i915_deps *deps,
struct scatterlist *sg,
enum i915_cache_level cache_level,
bool write_to_ccs,
struct i915_request **out)
+{
u8 src_access = write_to_ccs ? DIRECT_ACCESS :
INDIRECT_ACCESS;
u8 dst_access = write_to_ccs ? INDIRECT_ACCESS :
DIRECT_ACCESS;
struct sgt_dma it = sg_sgt(sg);
struct i915_request *rq;
u32 offset;
int err;
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
*out = NULL;
GEM_BUG_ON(ce->ring->size < SZ_64K);
offset = 0;
if (HAS_64K_PAGES(ce->engine->i915))
offset = CHUNK_SZ;
offset += (u64)rq->engine->instance << 32;
do {
int len;
rq = i915_request_create(ce);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
goto out_ce;
}
if (deps) {
err = i915_request_await_deps(rq, deps);
if (err)
goto out_rq;
if (rq->engine->emit_init_breadcrumb) {
err = rq->engine-
emit_init_breadcrumb(rq);
if (err)
goto out_rq;
}
deps = NULL;
}
/* The PTE updates + clear must not be interrupted.
*/
err = emit_no_arbitration(rq);
if (err)
goto out_rq;
len = emit_pte(rq, &it, cache_level, true, offset,
CHUNK_SZ);
if (len <= 0) {
err = len;
goto out_rq;
}
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
err = emit_copy_ccs(rq, offset, dst_access,
offset, src_access, len);
if (err)
goto out_rq;
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE |
MI_FLUSH_DW_CCS);
/* Arbitration is re-enabled between requests. */
+out_rq:
if (*out)
i915_request_put(*out);
*out = i915_request_get(rq);
i915_request_add(rq);
if (err || !it.sg || !sg_dma_len(it.sg))
break;
cond_resched();
} while (1);
+out_ce:
return err;
+}
+static int +intel_migrate_ccs_copy(struct intel_migrate *m,
struct i915_gem_ww_ctx *ww,
const struct i915_deps *deps,
struct scatterlist *sg,
enum i915_cache_level cache_level,
bool write_to_ccs,
struct i915_request **out)
+{
struct intel_context *ce;
int err;
*out = NULL;
if (!m->context)
return -ENODEV;
ce = intel_migrate_create_context(m);
if (IS_ERR(ce))
ce = intel_context_get(m->context);
GEM_BUG_ON(IS_ERR(ce));
err = intel_context_pin_ww(ce, ww);
if (err)
goto out;
err = intel_context_copy_ccs(ce, deps, sg, cache_level,
write_to_ccs, out);
intel_context_unpin(ce);
+out:
intel_context_put(ce);
return err;
+}
static int clear(struct intel_migrate *migrate, int (*fn)(struct intel_migrate *migrate, struct i915_gem_ww_ctx *ww, @@ -144,7 +264,8 @@ static int clear(struct intel_migrate *migrate, struct drm_i915_gem_object *obj; struct i915_request *rq; struct i915_gem_ww_ctx ww;
u32 *vaddr;
u32 *vaddr, val = 0;
bool ccs_cap = false; int err = 0; int i;
@@ -155,7 +276,12 @@ static int clear(struct intel_migrate *migrate, /* Consider the rounded up memory too */ sz = obj->base.size;
if (HAS_FLAT_CCS(i915) && i915_gem_object_is_lmem(obj))
ccs_cap = true;
for_i915_gem_ww(&ww, err, true) {
int ccs_bytes;
err = i915_gem_object_lock(obj, &ww); if (err) continue;
@@ -170,44 +296,136 @@ static int clear(struct intel_migrate *migrate, vaddr[i] = ~i; i915_gem_object_flush_map(obj);
err = fn(migrate, &ww, obj, sz, &rq);
if (!err)
continue;
if (ccs_cap && !val) {
/* Write the obj data into ccs surface */
err = intel_migrate_ccs_copy(migrate, &ww,
NULL,
obj->mm.pages-
sgl,
obj-
cache_level,
true, &rq);
if (rq && !err) {
if (i915_request_wait(rq, 0, HZ) < 0)
{
pr_err("%ps timed out, size:
%u\n",
fn, sz);
err = -ETIME;
}
i915_request_put(rq);
rq = NULL;
}
if (err)
continue;
for (i = 0; i < sz / sizeof(u32); i++)
vaddr[i] = 0x5a5a5a5a;
i915_gem_object_flush_map(obj);
err = intel_migrate_ccs_copy(migrate, &ww,
NULL, obj->mm.pages->sgl,
obj-
cache_level, false, &rq);
Why do we read back CCS content here?
Was rechecking the ccs copy. but this is not needed for real intention. Removing in next version.
if (rq && !err) {
if (i915_request_wait(rq, 0, HZ) < 0)
{
pr_err("%ps timed out, size:
%u\n",
fn, sz);
err = -ETIME;
}
i915_request_put(rq);
rq = NULL;
}
if (err)
continue;
i915_gem_object_flush_map(obj);
for (i = 0; !err && i < ccs_bytes; i += 4) {
if (vaddr[i] != ~i) {
pr_err("%ps ccs write and
read failed, offset: %d\n",
fn, i);
err = -EINVAL;
}
}
if (err)
continue;
i915_gem_object_flush_map(obj);
}
if (err != -EDEADLK && err != -EINTR && err != -
ERESTARTSYS)
pr_err("%ps failed, size: %u\n", fn, sz);
if (rq) {
i915_request_wait(rq, 0, HZ);
err = fn(migrate, &ww, obj, val, &rq);
if (rq && !err) {
if (i915_request_wait(rq, 0, HZ) < 0) {
pr_err("%ps timed out, size: %u\n",
fn, sz);
err = -ETIME;
} i915_request_put(rq);
rq = NULL; }
i915_gem_object_unpin_map(obj);
}
if (err)
goto err_out;
if (err)
continue;
if (rq) {
if (i915_request_wait(rq, 0, HZ) < 0) {
pr_err("%ps timed out, size: %u\n", fn, sz);
err = -ETIME;
i915_gem_object_flush_map(obj);
/* Verify the set/clear of the obj mem */
for (i = 0; !err && i < sz / PAGE_SIZE; i++) {
int x = i * 1024 +
i915_prandom_u32_max_state(1024,
prng);
if (vaddr[x] != val) {
pr_err("%ps failed, (%u != %u),
offset: %zu\n",
fn, vaddr[x], val, x *
sizeof(u32));
igt_hexdump(vaddr + i * 1024, 4096);
err = -EINVAL;
} }
i915_request_put(rq);
}
if (err)
continue;
for (i = 0; !err && i < sz / PAGE_SIZE; i++) {
int x = i * 1024 + i915_prandom_u32_max_state(1024,
prng);
if (ccs_cap && !val) {
for (i = 0; i < sz / sizeof(u32); i++)
vaddr[i] = ~i;
i915_gem_object_flush_map(obj);
err = intel_migrate_ccs_copy(migrate, &ww,
NULL,
obj->mm.pages-
sgl,
obj-
cache_level,
false, &rq);
if (rq && !err) {
if (i915_request_wait(rq, 0, HZ) < 0)
{
pr_err("%ps timed out, size:
%u\n",
fn, sz);
err = -ETIME;
}
i915_request_put(rq);
rq = NULL;
}
if (err)
continue;
ccs_bytes = GET_CCS_BYTES(i915, sz);
i915_gem_object_flush_map(obj);
for (i = 0; !err && i < ccs_bytes /
sizeof(u32); i++) {
if (vaddr[i]) {
I think this is incorrect. This assumes that CCS data is read back contiguous for the whole buffer, but instead CCS data is read back per 8MiB chunk and placed at the beginning of each chunk?
Yes. This is the source for the problem I was discussing with you. Fixed it in the next version. Please share your feedback. could have used a dedicated obj for ccs, but just calculated the offset of the ccs bytes.
Ram
/Thomas
pr_err("%ps ccs clearing
failed, offset: %d/%lu\n",
fn, i, (ccs_bytes /
sizeof(u32)) - 1);
igt_hexdump(vaddr + i,
ccs_bytes - i * sizeof(u32));
err = -EINVAL;
}
}
if (err)
continue;
}
i915_gem_object_unpin_map(obj);
}
if (vaddr[x] != sz) {
pr_err("%ps failed, size: %u, offset: %zu\n",
fn, sz, x * sizeof(u32));
igt_hexdump(vaddr + i * 1024, 4096);
err = -EINVAL;
if (err) {
if (err != -EDEADLK && err != -EINTR && err != -
ERESTARTSYS)
pr_err("%ps failed, size: %u\n", fn, sz);
if (rq && err != -EINVAL) {
i915_request_wait(rq, 0, HZ);
i915_request_put(rq); }
i915_gem_object_unpin_map(obj);
} else {
pr_debug("%ps Passed. size: %u\n", fn, sz); }
i915_gem_object_unpin_map(obj);
-err_out: i915_gem_object_put(obj);
return err;
}