Quoting Matthew Auld (2019-08-09 23:26:18)
struct i915_vma *intel_emit_vma_copy_blt(struct intel_engine_pool_node **p,
					 struct intel_context *ce,
					 struct i915_vma *src,
					 struct i915_vma *dst)
{
	struct drm_i915_private *i915 = ce->vm->i915;
	const u32 block_size = S16_MAX * PAGE_SIZE;
	struct intel_engine_pool_node *pool;
	struct i915_vma *batch;
	u64 src_offset, dst_offset;
	u64 count;
	u64 rem;
	u32 size;
	u32 *cmd;
	int err;

	GEM_BUG_ON(src->size != dst->size);

	count = div_u64(dst->size, block_size);
	size = (1 + 11 * count) * sizeof(u32);
	size = round_up(size, PAGE_SIZE);
	pool = intel_engine_pool_get(&ce->engine->pool, size);
	if (IS_ERR(pool))
		return ERR_CAST(pool);

	cmd = i915_gem_object_pin_map(pool->obj, I915_MAP_WC);
	if (IS_ERR(cmd)) {
		err = PTR_ERR(cmd);
		goto out_put;
	}

	rem = src->size;
	src_offset = src->node.start;
	dst_offset = dst->node.start;

	do {
		u32 size = min_t(u64, rem, block_size);

		GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);

		if (INTEL_GEN(i915) >= 9) {
			*cmd++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
			*cmd++ = BLT_DEPTH_32 | PAGE_SIZE;
			*cmd++ = 0;
			*cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
			*cmd++ = lower_32_bits(dst_offset);
			*cmd++ = upper_32_bits(dst_offset);
			*cmd++ = 0;
			*cmd++ = PAGE_SIZE;
			*cmd++ = lower_32_bits(src_offset);
			*cmd++ = upper_32_bits(src_offset);
		} else if (INTEL_GEN(i915) >= 8) {
			*cmd++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
			*cmd++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
			*cmd++ = 0;
			*cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
			*cmd++ = lower_32_bits(dst_offset);
			*cmd++ = upper_32_bits(dst_offset);
			*cmd++ = 0;
			*cmd++ = PAGE_SIZE;
			*cmd++ = lower_32_bits(src_offset);
			*cmd++ = upper_32_bits(src_offset);
		} else {
			*cmd++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
			*cmd++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
			*cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
			*cmd++ = dst_offset;
			*cmd++ = PAGE_SIZE;
			*cmd++ = src_offset;
		}

		/* Allow ourselves to be preempted in between blocks. */
		*cmd++ = MI_ARB_CHECK;

		src_offset += size;
		dst_offset += size;
		rem -= size;
	} while (rem);

	*cmd = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(ce->vm->gt);

	i915_gem_object_unpin_map(pool->obj);

	batch = i915_vma_instance(pool->obj, ce->vm, NULL);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto out_put;
	}

	err = i915_vma_pin(batch, 0, 0, PIN_USER);
	if (unlikely(err))
		goto out_put;

	*p = pool;
	return batch;

out_put:
	intel_engine_pool_put(pool);
	return ERR_PTR(err);
}
int i915_gem_object_copy_blt(struct drm_i915_gem_object *src,
			     struct drm_i915_gem_object *dst,
			     struct intel_context *ce)
{
	struct drm_gem_object *objs[] = { &src->base, &dst->base };
	struct i915_address_space *vm = ce->vm;
	struct intel_engine_pool_node *pool;
	struct ww_acquire_ctx acquire;
	struct i915_vma *vma_src, *vma_dst;
	struct i915_vma *batch;
	struct i915_request *rq;
	int err;

	vma_src = i915_vma_instance(src, vm, NULL);
	if (IS_ERR(vma_src))
		return PTR_ERR(vma_src);

	err = i915_vma_pin(vma_src, 0, 0, PIN_USER);
	if (unlikely(err))
		return err;

	vma_dst = i915_vma_instance(dst, vm, NULL);
	if (IS_ERR(vma_dst)) {
		err = PTR_ERR(vma_dst);
		goto out_unpin_src;
	}

	err = i915_vma_pin(vma_dst, 0, 0, PIN_USER);
	if (unlikely(err))
		goto out_unpin_src;

	intel_engine_pm_get(ce->engine);
	batch = intel_emit_vma_copy_blt(&pool, ce, vma_src, vma_dst);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto out_unpin_dst;
	}

	rq = intel_context_create_request(ce);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_batch;
	}

	i915_vma_lock(batch);
	err = i915_vma_move_to_active(batch, rq, 0);
	i915_vma_unlock(batch);
	if (unlikely(err))
		goto out_request;

	err = intel_engine_pool_mark_active(pool, rq);
	if (unlikely(err))
		goto out_request;

	err = drm_gem_lock_reservations(objs, ARRAY_SIZE(objs), &acquire);
	if (unlikely(err))
		goto out_request;

	if (src->cache_dirty & ~src->cache_coherent)
		i915_gem_clflush_object(src, 0);

	if (dst->cache_dirty & ~dst->cache_coherent)
		i915_gem_clflush_object(dst, 0);

	err = i915_request_await_object(rq, src, false);
	if (unlikely(err))
		goto out_unlock;

	err = i915_vma_move_to_active(vma_src, rq, 0);
	if (unlikely(err))
		goto out_unlock;

	err = i915_request_await_object(rq, dst, true);
	if (unlikely(err))
		goto out_unlock;

	err = i915_vma_move_to_active(vma_dst, rq, EXEC_OBJECT_WRITE);
	if (unlikely(err))
		goto out_unlock;

Strictly, wait on all objects first, then set up all the signals. That
avoids any nasty cycles in the dependency graph, such as if someone
passes in src == dst. Time for another selftest ;)
for (i = 0; i < ARRAY_SIZE(obj); i++) { clflush_object(obj[i]); await_object(rq, obj[i]); }
for (i = 0; i < ARRAY_SIZE(obj); i++) move_to_active(obj[i]);
if (ce->engine->emit_init_breadcrumb) {
err = ce->engine->emit_init_breadcrumb(rq);
if (unlikely(err))
goto out_unlock;
}
err = ce->engine->emit_bb_start(rq,
batch->node.start, batch->node.size,
0);
+out_unlock:
drm_gem_unlock_reservations(objs, ARRAY_SIZE(objs), &acquire);
+out_request:
if (unlikely(err))
i915_request_skip(rq, err);
i915_request_add(rq);
+out_batch:
i915_vma_unpin(batch);
intel_engine_pool_put(pool);
+out_unpin_dst:
i915_vma_unpin(vma_dst);
intel_engine_pm_put(ce->engine);
+out_unpin_src:
i915_vma_unpin(vma_src);
return err;
+}