This patch series is the second installment of my quest to clean up the i915 uAPI. The first three patches delete interfaces which have only ever been used by i-g-t and never any real userspace. In the case of NO_ZEROMAP, it's unclear exactly how this happened. There were userspace patches for it back in the day, they just never landed. For the others, there was never any userspace beyond i-g-t and they never should have landed in the first place.
The last patch moves the SINGLE_TIMELINE API to an emulation using syncobj to help simplify some of our locking infrastructure.
Test-with: 20210319223233.2982842-1-jason@jlekstrand.net
Jason Ekstrand (4):
  drm/i915: Drop I915_CONTEXT_PARAM_RINGSIZE
  drm/i915: Drop I915_CONTEXT_PARAM_NO_ZEROMAP
  drm/i915: Drop the CONTEXT_CLONE API
  drm/i915: Implement SINGLE_TIMELINE with a syncobj
 drivers/gpu/drm/i915/Makefile                 |   1 -
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 372 ++----------------
 .../gpu/drm/i915/gem/i915_gem_context_types.h |   9 +-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  23 +-
 drivers/gpu/drm/i915/gt/intel_context_param.c |  63 ---
 drivers/gpu/drm/i915/gt/intel_context_param.h |  14 -
 include/uapi/drm/i915_drm.h                   |  40 +-
 7 files changed, 55 insertions(+), 467 deletions(-)
 delete mode 100644 drivers/gpu/drm/i915/gt/intel_context_param.c
 delete mode 100644 drivers/gpu/drm/i915/gt/intel_context_param.h
This reverts commit 88be76cdafc7e60e2e4ed883bfe7e8dd7f35fa3a. This API has never been used by any real userspace.
Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
---
 drivers/gpu/drm/i915/Makefile                 |   1 -
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 112 ++----------------
 drivers/gpu/drm/i915/gt/intel_context_param.c |  63 ----------
 drivers/gpu/drm/i915/gt/intel_context_param.h |  14 ---
 include/uapi/drm/i915_drm.h                   |  20 +---
 5 files changed, 12 insertions(+), 198 deletions(-)
 delete mode 100644 drivers/gpu/drm/i915/gt/intel_context_param.c
 delete mode 100644 drivers/gpu/drm/i915/gt/intel_context_param.h
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 921db06232c35..6825988f0328f 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -87,7 +87,6 @@ gt-y += \
 	gt/gen8_ppgtt.o \
 	gt/intel_breadcrumbs.o \
 	gt/intel_context.o \
-	gt/intel_context_param.o \
 	gt/intel_context_sseu.o \
 	gt/intel_engine_cs.o \
 	gt/intel_engine_heartbeat.o \
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 4d2f40cf237bd..c528db3b7dcf4 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -69,7 +69,6 @@
 #include "gt/gen6_ppgtt.h"
 #include "gt/intel_context.h"
-#include "gt/intel_context_param.h"
 #include "gt/intel_engine_heartbeat.h"
 #include "gt/intel_engine_user.h"
 #include "gt/intel_execlists_submission.h" /* virtual_engine */
@@ -744,32 +743,25 @@ __context_engines_await(const struct i915_gem_context *ctx,
 	return engines;
 }

-static int
+static void
 context_apply_all(struct i915_gem_context *ctx,
-		  int (*fn)(struct intel_context *ce, void *data),
+		  void (*fn)(struct intel_context *ce, void *data),
 		  void *data)
 {
 	struct i915_gem_engines_iter it;
 	struct i915_gem_engines *e;
 	struct intel_context *ce;
-	int err = 0;

 	e = __context_engines_await(ctx, NULL);
-	for_each_gem_engine(ce, e, it) {
-		err = fn(ce, data);
-		if (err)
-			break;
-	}
+	for_each_gem_engine(ce, e, it)
+		fn(ce, data);
 	i915_sw_fence_complete(&e->fence);
-
-	return err;
 }

-static int __apply_ppgtt(struct intel_context *ce, void *vm)
+static void __apply_ppgtt(struct intel_context *ce, void *vm)
 {
 	i915_vm_put(ce->vm);
 	ce->vm = i915_vm_get(vm);
-	return 0;
 }

 static struct i915_address_space *
@@ -809,10 +801,9 @@ static void __set_timeline(struct intel_timeline **dst,
 	intel_timeline_put(old);
 }

-static int __apply_timeline(struct intel_context *ce, void *timeline)
+static void __apply_timeline(struct intel_context *ce, void *timeline)
 {
 	__set_timeline(&ce->timeline, timeline);
-	return 0;
 }

 static void __assign_timeline(struct i915_gem_context *ctx,
@@ -1328,63 +1319,6 @@ static int set_ppgtt(struct drm_i915_file_private *file_priv,
 	return err;
 }

-static int __apply_ringsize(struct intel_context *ce, void *sz)
-{
-	return intel_context_set_ring_size(ce, (unsigned long)sz);
-}
-
-static int set_ringsize(struct i915_gem_context *ctx,
-			struct drm_i915_gem_context_param *args)
-{
-	if (!HAS_LOGICAL_RING_CONTEXTS(ctx->i915))
-		return -ENODEV;
-
-	if (args->size)
-		return -EINVAL;
-
-	if (!IS_ALIGNED(args->value, I915_GTT_PAGE_SIZE))
-		return -EINVAL;
-
-	if (args->value < I915_GTT_PAGE_SIZE)
-		return -EINVAL;
-
-	if (args->value > 128 * I915_GTT_PAGE_SIZE)
-		return -EINVAL;
-
-	return context_apply_all(ctx,
-				 __apply_ringsize,
-				 __intel_context_ring_size(args->value));
-}
-
-static int __get_ringsize(struct intel_context *ce, void *arg)
-{
-	long sz;
-
-	sz = intel_context_get_ring_size(ce);
-	GEM_BUG_ON(sz > INT_MAX);
-
-	return sz; /* stop on first engine */
-}
-
-static int get_ringsize(struct i915_gem_context *ctx,
-			struct drm_i915_gem_context_param *args)
-{
-	int sz;
-
-	if (!HAS_LOGICAL_RING_CONTEXTS(ctx->i915))
-		return -ENODEV;
-
-	if (args->size)
-		return -EINVAL;
-
-	sz = context_apply_all(ctx, __get_ringsize, NULL);
-	if (sz < 0)
-		return sz;
-
-	args->value = sz;
-	return 0;
-}
-
 int
 i915_gem_user_to_context_sseu(struct intel_gt *gt,
			      const struct drm_i915_gem_context_param_sseu *user,
@@ -1925,19 +1859,17 @@ set_persistence(struct i915_gem_context *ctx,
 	return __context_set_persistence(ctx, args->value);
 }

-static int __apply_priority(struct intel_context *ce, void *arg)
+static void __apply_priority(struct intel_context *ce, void *arg)
 {
 	struct i915_gem_context *ctx = arg;

 	if (!intel_engine_has_timeslices(ce->engine))
-		return 0;
+		return;

 	if (ctx->sched.priority >= I915_PRIORITY_NORMAL)
 		intel_context_set_use_semaphores(ce);
 	else
 		intel_context_clear_use_semaphores(ce);
-
-	return 0;
 }

 static int set_priority(struct i915_gem_context *ctx,
@@ -2030,11 +1962,8 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
 		ret = set_persistence(ctx, args);
 		break;

-	case I915_CONTEXT_PARAM_RINGSIZE:
-		ret = set_ringsize(ctx, args);
-		break;
-
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
+	case I915_CONTEXT_PARAM_RINGSIZE:
 	default:
 		ret = -EINVAL;
 		break;
@@ -2062,18 +1991,6 @@ static int create_setparam(struct i915_user_extension __user *ext, void *data)
 	return ctx_setparam(arg->fpriv, arg->ctx, &local.param);
 }

-static int copy_ring_size(struct intel_context *dst,
-			  struct intel_context *src)
-{
-	long sz;
-
-	sz = intel_context_get_ring_size(src);
-	if (sz < 0)
-		return sz;
-
-	return intel_context_set_ring_size(dst, sz);
-}
-
 static int clone_engines(struct i915_gem_context *dst,
			 struct i915_gem_context *src)
 {
@@ -2118,12 +2035,6 @@ static int clone_engines(struct i915_gem_context *dst,
 		}

 		intel_context_set_gem(clone->engines[n], dst);
-
-		/* Copy across the preferred ringsize */
-		if (copy_ring_size(clone->engines[n], e->engines[n])) {
-			__free_engines(clone, n + 1);
-			goto err_unlock;
-		}
 	}
 	clone->num_engines = n;
 	i915_sw_fence_complete(&e->fence);
@@ -2483,11 +2394,8 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		args->value = i915_gem_context_is_persistent(ctx);
 		break;

-	case I915_CONTEXT_PARAM_RINGSIZE:
-		ret = get_ringsize(ctx, args);
-		break;
-
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
+	case I915_CONTEXT_PARAM_RINGSIZE:
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.c b/drivers/gpu/drm/i915/gt/intel_context_param.c
deleted file mode 100644
index 65dcd090245d6..0000000000000
--- a/drivers/gpu/drm/i915/gt/intel_context_param.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: MIT
-/*
- * Copyright © 2019 Intel Corporation
- */
-
-#include "i915_active.h"
-#include "intel_context.h"
-#include "intel_context_param.h"
-#include "intel_ring.h"
-
-int intel_context_set_ring_size(struct intel_context *ce, long sz)
-{
-	int err;
-
-	if (intel_context_lock_pinned(ce))
-		return -EINTR;
-
-	err = i915_active_wait(&ce->active);
-	if (err < 0)
-		goto unlock;
-
-	if (intel_context_is_pinned(ce)) {
-		err = -EBUSY; /* In active use, come back later! */
-		goto unlock;
-	}
-
-	if (test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) {
-		struct intel_ring *ring;
-
-		/* Replace the existing ringbuffer */
-		ring = intel_engine_create_ring(ce->engine, sz);
-		if (IS_ERR(ring)) {
-			err = PTR_ERR(ring);
-			goto unlock;
-		}
-
-		intel_ring_put(ce->ring);
-		ce->ring = ring;
-
-		/* Context image will be updated on next pin */
-	} else {
-		ce->ring = __intel_context_ring_size(sz);
-	}
-
-unlock:
-	intel_context_unlock_pinned(ce);
-	return err;
-}
-
-long intel_context_get_ring_size(struct intel_context *ce)
-{
-	long sz = (unsigned long)READ_ONCE(ce->ring);
-
-	if (test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) {
-		if (intel_context_lock_pinned(ce))
-			return -EINTR;
-
-		sz = ce->ring->size;
-		intel_context_unlock_pinned(ce);
-	}
-
-	return sz;
-}
diff --git a/drivers/gpu/drm/i915/gt/intel_context_param.h b/drivers/gpu/drm/i915/gt/intel_context_param.h
deleted file mode 100644
index f053d8633fe2a..0000000000000
--- a/drivers/gpu/drm/i915/gt/intel_context_param.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: MIT */
-/*
- * Copyright © 2019 Intel Corporation
- */
-
-#ifndef INTEL_CONTEXT_PARAM_H
-#define INTEL_CONTEXT_PARAM_H
-
-struct intel_context;
-
-int intel_context_set_ring_size(struct intel_context *ce, long sz);
-long intel_context_get_ring_size(struct intel_context *ce);
-
-#endif /* INTEL_CONTEXT_PARAM_H */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index ddc47bbf48b6d..a80eb2bcd22bc 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1675,24 +1675,8 @@ struct drm_i915_gem_context_param {
  */
 #define I915_CONTEXT_PARAM_PERSISTENCE	0xb

-/*
- * I915_CONTEXT_PARAM_RINGSIZE:
- *
- * Sets the size of the CS ringbuffer to use for logical ring contexts. This
- * applies a limit of how many batches can be queued to HW before the caller
- * is blocked due to lack of space for more commands.
- *
- * Only reliably possible to be set prior to first use, i.e. during
- * construction. At any later point, the current execution must be flushed as
- * the ring can only be changed while the context is idle. Note, the ringsize
- * can be specified as a constructor property, see
- * I915_CONTEXT_CREATE_EXT_SETPARAM, but can also be set later if required.
- *
- * Only applies to the current set of engine and lost when those engines
- * are replaced by a new mapping (see I915_CONTEXT_PARAM_ENGINES).
- *
- * Must be between 4 - 512 KiB, in intervals of page size [4 KiB].
- * Default is 16 KiB.
+/* This API has been removed. On the off chance someone somewhere has
+ * attempted to use it, never re-use this context param number.
  */
 #define I915_CONTEXT_PARAM_RINGSIZE	0xc
 /* Must be kept compact -- no holes and well documented */
On Fri, Mar 19, 2021 at 5:39 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
This reverts commit 88be76cdafc7e60e2e4ed883bfe7e8dd7f35fa3a. This API has never been used by any real userspace.
After further digging, there is a compute-runtime PR for this. I still think we should drop it and I've updated the commit message locally with the following rationale:
This reverts commit 88be76cdafc7e60e2e4ed883bfe7e8dd7f35fa3a. This API was originally added for OpenCL but the compute-runtime PR has sat open for a year without action so we can still pull it out if we want. I argue we should drop it for three reasons:
1. If the compute-runtime PR has sat open for a year, this clearly isn't that important.
2. It's a very leaky API. Ring size is an implementation detail of the current execlist scheduler and really only makes sense there. It can't apply to the older ring-buffer scheduler on pre-execlist hardware because that's shared across all contexts and it won't apply to the GuC scheduler that's in the pipeline.
3. Having userspace set a ring size in bytes is a bad solution to the problem of having too small a ring. There is no way that userspace has the information to know how to properly set the ring size so it's just going to detect the feature and always set it to the maximum of 512K. This is what the compute-runtime PR does. The scheduler in i915, on the other hand, does have the information to make an informed choice. It could detect if the ring size is a problem and grow it itself. Or, if that's too hard, we could just increase the default size from 16K to 32K or even 64K instead of relying on userspace to do it.
Let's drop this API for now and, if someone decides they really care about solving this problem, they can do it properly.
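To make point 3 above concrete, the userspace side of this param boils down to a single setparam call. A minimal sketch, assuming 'fd' and 'ctx_id' come from the caller and following the always-ask-for-the-maximum pattern described above:

  #include <stdint.h>
  #include <string.h>
  #include <xf86drm.h>
  #include <i915_drm.h>

  /* Hypothetical sketch; fd and ctx_id are assumptions. */
  static int set_max_ringsize(int fd, uint32_t ctx_id)
  {
          struct drm_i915_gem_context_param p;

          memset(&p, 0, sizeof(p));
          p.ctx_id = ctx_id;
          p.param = I915_CONTEXT_PARAM_RINGSIZE;
          p.value = 512 * 1024; /* no better information, so ask for the max */

          return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
  }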
On Sat, 20 Mar 2021 at 14:48, Jason Ekstrand <jason@jlekstrand.net> wrote:
Just a drive-by comment. I thought the lrc model was shared between execlists and the GuC, so in both cases we get something like a ring per engine per context which the driver can emit commands into. Why doesn't ring size still apply with the GuC scheduler?
On Mon, Mar 22, 2021 at 5:52 AM Matthew Auld <matthew.william.auld@gmail.com> wrote:
Even if they both have a ring in some sense, the number of bytes which correspond to a single request is going to be different and that difference isn't visible to userspace so bytes is the wrong unit.
--Jason
On Sat, 20 Mar 2021, Jason Ekstrand <jason@jlekstrand.net> wrote:
This reverts commit 88be76cdafc7e60e2e4ed883bfe7e8dd7f35fa3a. This API
Small nit, I think it would be useful to reference commits with the citation style:
88be76cdafc7 ("drm/i915: Allow userspace to specify ringsize on construction")
I use this monster in my .gitconfig:
[alias]
	cite = "!f() { git log -1 '--pretty=format:%H (\"%s\")%n' $1 | sed -e 's/\\([0-f]\\{12\\}\\)[0-f]*/\\1/'; }; f"
With that, 'git cite <committish>' will give you the nice reference with 12 chars of sha1 regardless of core.abbrev config.
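For example, run against the commit being reverted in this series (output reconstructed from the citation above):

  $ git cite 88be76cdafc7e60e2e4ed883bfe7e8dd7f35fa3a
  88be76cdafc7 ("drm/i915: Allow userspace to specify ringsize on construction")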
BR, Jani.
On Mon, Mar 22, 2021 at 7:01 AM Jani Nikula <jani.nikula@linux.intel.com> wrote:
Done.
Thanks for the tip!
On Mon, Mar 22, 2021 at 5:01 PM Jason Ekstrand <jason@jlekstrand.net> wrote:
The dim script from maintainer-tools has a bunch of these helpers (another one is dim fixes for generating Cc: lists) which should work even without commit rights and all that set up:
https://gitlab.freedesktop.org/drm/maintainer-tools/-/blob/master/dim
Cheers, Daniel
The idea behind this param is to support OpenCL drivers with relocations because OpenCL reserves 0x0 for NULL and, if we placed memory there, it would confuse CL kernels. It was originally sent out as part of a patch series including libdrm [1] and Beignet [2] support. However, the libdrm and Beignet patches never landed in their respective upstream projects so this API has never been used. It's never been used in Mesa or any other driver, either.
Dropping this API allows us to delete a small bit of code.
[1]: https://lists.freedesktop.org/archives/intel-gfx/2015-May/067030.html
[2]: https://lists.freedesktop.org/archives/intel-gfx/2015-May/067031.html
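For reference, the never-landed userspace side would have amounted to a single setparam call. A minimal sketch, assuming 'fd' and 'ctx_id' come from the caller:

  #include <stdint.h>
  #include <string.h>
  #include <xf86drm.h>
  #include <i915_drm.h>

  /* Hypothetical sketch; fd and ctx_id are assumptions. */
  static int set_no_zeromap(int fd, uint32_t ctx_id)
  {
          struct drm_i915_gem_context_param p;

          memset(&p, 0, sizeof(p));
          p.ctx_id = ctx_id;
          p.param = I915_CONTEXT_PARAM_NO_ZEROMAP;
          p.value = 1; /* keep the zero page out of this context */

          return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
  }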
Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c    | 16 ++--------------
 .../gpu/drm/i915/gem/i915_gem_context_types.h  |  1 -
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  8 --------
 include/uapi/drm/i915_drm.h                    |  4 ++++
 4 files changed, 6 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index c528db3b7dcf4..d28ac79de7573 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1904,15 +1904,6 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
 	int ret = 0;

 	switch (args->param) {
-	case I915_CONTEXT_PARAM_NO_ZEROMAP:
-		if (args->size)
-			ret = -EINVAL;
-		else if (args->value)
-			set_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
-		else
-			clear_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
-		break;
-
 	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
 		if (args->size)
 			ret = -EINVAL;
@@ -1962,6 +1953,7 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
 		ret = set_persistence(ctx, args);
 		break;

+	case I915_CONTEXT_PARAM_NO_ZEROMAP:
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	case I915_CONTEXT_PARAM_RINGSIZE:
 	default:
@@ -2342,11 +2334,6 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		return -ENOENT;

 	switch (args->param) {
-	case I915_CONTEXT_PARAM_NO_ZEROMAP:
-		args->size = 0;
-		args->value = test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
-		break;
-
 	case I915_CONTEXT_PARAM_GTT_SIZE:
 		args->size = 0;
 		rcu_read_lock();
@@ -2394,6 +2381,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		args->value = i915_gem_context_is_persistent(ctx);
 		break;

+	case I915_CONTEXT_PARAM_NO_ZEROMAP:
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	case I915_CONTEXT_PARAM_RINGSIZE:
 	default:
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 1449f54924e03..676592e27e7d2 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -129,7 +129,6 @@ struct i915_gem_context {
 	 * @user_flags: small set of booleans controlled by the user
 	 */
 	unsigned long user_flags;
-#define UCONTEXT_NO_ZEROMAP	0
 #define UCONTEXT_NO_ERROR_CAPTURE	1
 #define UCONTEXT_BANNABLE	2
 #define UCONTEXT_RECOVERABLE	3
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 21676baca8f58..96403130a373d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -286,7 +286,6 @@ struct i915_execbuffer {
 	struct intel_context *reloc_context;

 	u64 invalid_flags; /** Set of execobj.flags that are invalid */
-	u32 context_flags; /** Set of execobj.flags to insert from the ctx */

 	u64 batch_len; /** Length of batch within object */
 	u32 batch_start_offset; /** Location within object of batch */
@@ -528,9 +527,6 @@ eb_validate_vma(struct i915_execbuffer *eb,
 		entry->flags |= EXEC_OBJECT_NEEDS_GTT | __EXEC_OBJECT_NEEDS_MAP;
 	}

-	if (!(entry->flags & EXEC_OBJECT_PINNED))
-		entry->flags |= eb->context_flags;
-
 	return 0;
 }

@@ -737,10 +733,6 @@ static int eb_select_context(struct i915_execbuffer *eb)
 	if (rcu_access_pointer(ctx->vm))
 		eb->invalid_flags |= EXEC_OBJECT_NEEDS_GTT;

-	eb->context_flags = 0;
-	if (test_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags))
-		eb->context_flags |= __EXEC_OBJECT_NEEDS_BIAS;
-
 	return 0;
 }

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index a80eb2bcd22bc..4c4b9254def1b 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1591,6 +1591,10 @@ struct drm_i915_gem_context_param {
 	__u32 size;
 	__u64 param;
 #define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
+/* I915_CONTEXT_PARAM_NO_ZEROMAP has been removed. On the off chance
+ * someone somewhere has attempted to use it, never re-use this context
+ * param number.
+ */
 #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
Greetings,
FYI, we noticed the following commit (built with gcc-9):
commit: 014c1518e82b75a19dda35d7000ec4a179b888d5 ("[PATCH 2/4] drm/i915: Drop I915_CONTEXT_PARAM_NO_ZEROMAP")
url: https://github.com/0day-ci/linux/commits/Jason-Ekstrand/drm-i915-uAPI-clean-...
base: git://anongit.freedesktop.org/drm-intel for-linux-next
in testcase: igt
version: igt-x86_64-e230cd8d-1_20210106
with following parameters:

	group: group-20
	ucode: 0xe2
on test machine: 8 threads Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz with 28G memory
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
If you fix the issue, kindly add following tag
Reported-by: kernel test robot <oliver.sang@intel.com>
[   38.156061] IGT-Version: 1.25-ge230cd8d (x86_64) (Linux: 5.12.0-rc2-00175-g014c1518e82b x86_64)
[   38.166881] Starting subtest: non-root-set-no-zeromap
[   38.175996] (gem_ctx_param:1412) i915/gem_context-CRITICAL: Test assertion failure function gem_context_get_param, file ../lib/i915/gem_context.c:178:
[   38.192727] (gem_ctx_param:1412) i915/gem_context-CRITICAL: Failed assertion: __gem_context_get_param(fd, p) == 0
[   38.205523] (gem_ctx_param:1412) i915/gem_context-CRITICAL: error: -22 != 0
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp install job.yaml  # job file is attached in this email
bin/lkp split-job --compatible job.yaml
bin/lkp run compatible-job.yaml
---
0DAY/LKP+ Test Infrastructure                   Open Source Technology Center
https://lists.01.org/hyperkitty/list/lkp@lists.01.org       Intel Corporation
Thanks, Oliver Sang
This API allows one context to grab bits out of another context upon creation. It can be used as a short-cut for setparam(getparam()) for things like I915_CONTEXT_PARAM_VM. However, it's never been used by any real userspace. It's used by a few IGT tests and that's it. Since it doesn't add any real value (most of the stuff you can CLONE you can copy in other ways), drop it.
There is one thing that this API allows you to clone which you cannot clone via getparam/setparam: timelines. However, timelines are an implementation detail of i915 and not really something that needs to be exposed to userspace. Also, sharing timelines between contexts isn't obviously useful and supporting it has the potential to complicate i915 internally. It also doesn't add any functionality that the client can't get in other ways. If a client really wants a shared timeline, they can use a syncobj and set it as an in and out fence on every submit.
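A minimal sketch of that userspace-side alternative, using the existing I915_EXEC_FENCE_ARRAY execbuf extension; 'fd', 'syncobj_handle' and an otherwise fully populated 'execbuf' are assumed to come from the caller:

  #include <stdint.h>
  #include <xf86drm.h>
  #include <i915_drm.h>

  static int submit_on_shared_timeline(int fd, uint32_t syncobj_handle,
                                       struct drm_i915_gem_execbuffer2 *execbuf)
  {
          struct drm_i915_gem_exec_fence fence = {
                  .handle = syncobj_handle,
                  /* wait on the previous submit, signal for the next one */
                  .flags = I915_EXEC_FENCE_WAIT | I915_EXEC_FENCE_SIGNAL,
          };

          execbuf->flags |= I915_EXEC_FENCE_ARRAY;
          execbuf->num_cliprects = 1;                   /* fence count */
          execbuf->cliprects_ptr = (uintptr_t)&fence;   /* fence array */

          return drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf);
  }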
Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c | 199 +-------------------
 include/uapi/drm/i915_drm.h                 |  16 +-
 2 files changed, 6 insertions(+), 209 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index d28ac79de7573..f88bac19333ec 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1983,207 +1983,14 @@ static int create_setparam(struct i915_user_extension __user *ext, void *data)
 	return ctx_setparam(arg->fpriv, arg->ctx, &local.param);
 }

-static int clone_engines(struct i915_gem_context *dst,
-			 struct i915_gem_context *src)
+static int invalid_ext(struct i915_user_extension __user *ext, void *data)
 {
-	struct i915_gem_engines *clone, *e;
-	bool user_engines;
-	unsigned long n;
-
-	e = __context_engines_await(src, &user_engines);
-	if (!e)
-		return -ENOENT;
-
-	clone = alloc_engines(e->num_engines);
-	if (!clone)
-		goto err_unlock;
-
-	for (n = 0; n < e->num_engines; n++) {
-		struct intel_engine_cs *engine;
-
-		if (!e->engines[n]) {
-			clone->engines[n] = NULL;
-			continue;
-		}
-		engine = e->engines[n]->engine;
-
-		/*
-		 * Virtual engines are singletons; they can only exist
-		 * inside a single context, because they embed their
-		 * HW context... As each virtual context implies a single
-		 * timeline (each engine can only dequeue a single request
-		 * at any time), it would be surprising for two contexts
-		 * to use the same engine. So let's create a copy of
-		 * the virtual engine instead.
-		 */
-		if (intel_engine_is_virtual(engine))
-			clone->engines[n] =
-				intel_execlists_clone_virtual(engine);
-		else
-			clone->engines[n] = intel_context_create(engine);
-		if (IS_ERR_OR_NULL(clone->engines[n])) {
-			__free_engines(clone, n);
-			goto err_unlock;
-		}
-
-		intel_context_set_gem(clone->engines[n], dst);
-	}
-	clone->num_engines = n;
-	i915_sw_fence_complete(&e->fence);
-
-	/* Serialised by constructor */
-	engines_idle_release(dst, rcu_replace_pointer(dst->engines, clone, 1));
-	if (user_engines)
-		i915_gem_context_set_user_engines(dst);
-	else
-		i915_gem_context_clear_user_engines(dst);
-	return 0;
-
-err_unlock:
-	i915_sw_fence_complete(&e->fence);
-	return -ENOMEM;
-}
-
-static int clone_flags(struct i915_gem_context *dst,
-		       struct i915_gem_context *src)
-{
-	dst->user_flags = src->user_flags;
-	return 0;
-}
-
-static int clone_schedattr(struct i915_gem_context *dst,
-			   struct i915_gem_context *src)
-{
-	dst->sched = src->sched;
-	return 0;
-}
-
-static int clone_sseu(struct i915_gem_context *dst,
-		      struct i915_gem_context *src)
-{
-	struct i915_gem_engines *e = i915_gem_context_lock_engines(src);
-	struct i915_gem_engines *clone;
-	unsigned long n;
-	int err;
-
-	/* no locking required; sole access under constructor*/
-	clone = __context_engines_static(dst);
-	if (e->num_engines != clone->num_engines) {
-		err = -EINVAL;
-		goto unlock;
-	}
-
-	for (n = 0; n < e->num_engines; n++) {
-		struct intel_context *ce = e->engines[n];
-
-		if (clone->engines[n]->engine->class != ce->engine->class) {
-			/* Must have compatible engine maps! */
-			err = -EINVAL;
-			goto unlock;
-		}
-
-		/* serialises with set_sseu */
-		err = intel_context_lock_pinned(ce);
-		if (err)
-			goto unlock;
-
-		clone->engines[n]->sseu = ce->sseu;
-		intel_context_unlock_pinned(ce);
-	}
-
-	err = 0;
-unlock:
-	i915_gem_context_unlock_engines(src);
-	return err;
-}
-
-static int clone_timeline(struct i915_gem_context *dst,
-			  struct i915_gem_context *src)
-{
-	if (src->timeline)
-		__assign_timeline(dst, src->timeline);
-
-	return 0;
-}
-
-static int clone_vm(struct i915_gem_context *dst,
-		    struct i915_gem_context *src)
-{
-	struct i915_address_space *vm;
-	int err = 0;
-
-	if (!rcu_access_pointer(src->vm))
-		return 0;
-
-	rcu_read_lock();
-	vm = context_get_vm_rcu(src);
-	rcu_read_unlock();
-
-	if (!mutex_lock_interruptible(&dst->mutex)) {
-		__assign_ppgtt(dst, vm);
-		mutex_unlock(&dst->mutex);
-	} else {
-		err = -EINTR;
-	}
-
-	i915_vm_put(vm);
-	return err;
-}
-
-static int create_clone(struct i915_user_extension __user *ext, void *data)
-{
-	static int (* const fn[])(struct i915_gem_context *dst,
-				  struct i915_gem_context *src) = {
-#define MAP(x, y) [ilog2(I915_CONTEXT_CLONE_##x)] = y
-		MAP(ENGINES, clone_engines),
-		MAP(FLAGS, clone_flags),
-		MAP(SCHEDATTR, clone_schedattr),
-		MAP(SSEU, clone_sseu),
-		MAP(TIMELINE, clone_timeline),
-		MAP(VM, clone_vm),
-#undef MAP
-	};
-	struct drm_i915_gem_context_create_ext_clone local;
-	const struct create_ext *arg = data;
-	struct i915_gem_context *dst = arg->ctx;
-	struct i915_gem_context *src;
-	int err, bit;
-
-	if (copy_from_user(&local, ext, sizeof(local)))
-		return -EFAULT;
-
-	BUILD_BUG_ON(GENMASK(BITS_PER_TYPE(local.flags) - 1, ARRAY_SIZE(fn)) !=
-		     I915_CONTEXT_CLONE_UNKNOWN);
-
-	if (local.flags & I915_CONTEXT_CLONE_UNKNOWN)
-		return -EINVAL;
-
-	if (local.rsvd)
-		return -EINVAL;
-
-	rcu_read_lock();
-	src = __i915_gem_context_lookup_rcu(arg->fpriv, local.clone_id);
-	rcu_read_unlock();
-	if (!src)
-		return -ENOENT;
-
-	GEM_BUG_ON(src == dst);
-
-	for (bit = 0; bit < ARRAY_SIZE(fn); bit++) {
-		if (!(local.flags & BIT(bit)))
-			continue;
-
-		err = fn[bit](dst, src);
-		if (err)
-			return err;
-	}
-
-	return 0;
+	return -EINVAL;
 }

 static const i915_user_extension_fn create_extensions[] = {
 	[I915_CONTEXT_CREATE_EXT_SETPARAM] = create_setparam,
-	[I915_CONTEXT_CREATE_EXT_CLONE] = create_clone,
+	[I915_CONTEXT_CREATE_EXT_CLONE] = invalid_ext,
 };

 static bool client_is_banned(struct drm_i915_file_private *file_priv)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 4c4b9254def1b..33ef78cb1deb7 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1841,20 +1841,10 @@ struct drm_i915_gem_context_create_ext_setparam {
 	struct drm_i915_gem_context_param param;
 };

-struct drm_i915_gem_context_create_ext_clone {
+/* This API has been removed. On the off chance someone somewhere has
+ * attempted to use it, never re-use this extension number.
+ */
 #define I915_CONTEXT_CREATE_EXT_CLONE 1
-	struct i915_user_extension base;
-	__u32 clone_id;
-	__u32 flags;
-#define I915_CONTEXT_CLONE_ENGINES	(1u << 0)
-#define I915_CONTEXT_CLONE_FLAGS	(1u << 1)
-#define I915_CONTEXT_CLONE_SCHEDATTR	(1u << 2)
-#define I915_CONTEXT_CLONE_SSEU	(1u << 3)
-#define I915_CONTEXT_CLONE_TIMELINE	(1u << 4)
-#define I915_CONTEXT_CLONE_VM	(1u << 5)
-#define I915_CONTEXT_CLONE_UNKNOWN -(I915_CONTEXT_CLONE_VM << 1)
-	__u64 rsvd;
-};

 struct drm_i915_gem_context_destroy {
 	__u32 ctx_id;
On 19/03/2021 22:38, Jason Ekstrand wrote:
No complaints about removing it if it ended up unused outside IGT. The latter is a _big_ problem though, since it is much more than a few IGT tests. So I really think there needs to be an evaluation and a plan for that (we don't want to lose 50% of the coverage overnight).
It is not really true that timelines are an i915 implementation detail. They are in fact a dma-fence context:seqno concept, nothing more than that. I think you are probably confusing struct intel_timeline with the timeline wording in the uapi. The former is an i915 implementation detail, but context:seqno are truly userspace timelines.
But again, no objection to removing unused uapi in principle. Narrative has to be accurate and test coverage not lost though.
Regards,
Tvrtko
On Mon, Mar 22, 2021 at 11:22:01AM +0000, Tvrtko Ursulin wrote:
I think you're both saying the same thing and talking a bit past each other.
Yes, the timeline is just a string of dma_fences, that's correct. Now, usually when you submit batches with execbuf, we have 3 ways to synchronize concurrent submission: implicit sync, sync_file and drm_syncobj. They all map to different needs in different protocols/render APIs.
Now in one additional case the kernel makes sure that batchbuffers are ordered, and that's when you submit them to the same hw ctx. Because there's only 1 hw context and you really can't have batchbuffers run on that single hw context out of order. That's what the timeline object we talk about here is. But that largely is an internal implementation detail, which happens to also use most/all the same infrastructure as the dma_fence uapi pieces above.
Now, the internal implementation detail leaking here is that we exposed this to userspace without there being any need for it. What Jason implements with a syncobj in the next patch is essentially what userspace should have been using for cross-engine sync. Media userspace doesn't care about interop with winsys/client APIs, so it equally could have used implicit sync or sync_file here (which I think is the solution now for the new uapi prepped internally), since they are all about equally powerful for stringing batchbuffers together.
So I do think the assessment is accurate, albeit a bit on the terse side. Maybe we could just quote the entire thing here in the commit message. -Daniel
On 22/03/2021 14:09, Daniel Vetter wrote:
Are you saying we exposed a single timeline of execution per hw context via the single timeline flag?!
Timelines of execution were always exposed. Any "engine" (ring previously) in I915_EXEC_RING_MASK was a single timeline of execution. It is completely the same with engine map engines, which are also different indices into I915_EXEC_RING_MASK space.
Userspace was aware of these timelines forever as well. Media was creating multiple contexts to have multiple timelines (so parallelism). Everyone knew that engine-hopping submissions need to be either implicitly or explicitly synchronised, etc.
So I really don't see that we have leaked timelines as a concept *now*. What the patch has exposed to userspace is a new way to sync between timelines and nothing more.
Regards,
Tvrtko
On Mon, Mar 22, 2021 at 3:33 PM Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
Nope.
Yup, I think we're saying the same thing here.
We've leaked it as something you can now share across hw contexts. Which is possible because of how it's internally implemented (I think the load balancer relies on that), but not really a synchronization primitive we want to export as such to userspace. We have other interfaces and concepts for that. -Daniel
On 22/03/2021 14:57, Daniel Vetter wrote:
Okay so we agree on most things but apparently have different definitions of what it means to leak internal implementation details.
At the same time, proof that we haven't leaked the internal implementation details is that Jason was able to implement the single timeline flag with a drm syncobj at the execbuf top level. (Well, mostly, ignoring the probably inconsequential difference of one vs multiple fence contexts.)
Which is possible because of how it's internally implemented (I think load balancer relies on that), but not really a synchronization
Virtual engine is a single timeline by definition and it is still that regardless of the implementation details (execlists or GuC, in both cases it is a single hardware context and a single timeline).
primitive we want to export as such to userspace. We have other interfaces and concepts for that.
Yes, that is the only point to argue IMO. We can say it wasn't needed and should have been avoided, but I still maintain we can't really say we leaked anything backend specific to userspace via it.
Regards,
Tvrtko
Ugh... timezones.
On Mon, Mar 22, 2021 at 10:31 AM Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
You should look at my IGT patch set. I'm not deleting any tests except those that explicitly test the clone API. All the other tests which use cloning to save a few lines when constructing new contexts are updated to not require the cloning API.
Right. We've always had the concept that everything submitted to given HW context happens in-order. As Daniel said below, allowing out-of-order execution on a single HW context would be a bit nuts because HW contexts are, by definition, stateful. What this API adds is a way to do in-order synchronization across multiple HW contexts which is both new and unnecessary given the other primitives available.
I said it was a "leak" because, from my git archeology, the best I could find for justification of doing it this way was that we already have a timeline object so why not expose it. Same for the SINGLE_TIMELINE flag. Is a "timeline" really an internal concept? No, not really. It's pretty standard. But intel_timeline is an internal thing and, while this doesn't give userspace an actual handle to it, it gives it more visibility than needed, IMO.
--Jason
On 22/03/2021 16:24, Jason Ekstrand wrote:
I dare not mention the other IGT tree. There will be a plan needed since I fear much more usage will be found there.
[snip]
Cloning of timelines absolutely - I don't see a point for that. But I think there was no intent there. Rather it was just a consequence of striving for symmetry in the uapi.
But for the single timeline flag itself (so the next patch in this series and its commit message), when looked at within a single GEM context, I still really can't see the argument that it is leaking anything to userspace. Certainly not intel_timeline, which is also not even backend specific.
We seem to all agree a timeline is just context:seqno, which was exposed to userspace forever. For instance, if the flag wasn't called "single timeline" but "implicit sync", "serial context", "ordered engines", whatever, would you still argue it is leaking struct intel_timeline out to userspace?
Regards,
Tvrtko
On Mon, Mar 22, 2021 at 4:31 PM Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
It's not a matching implementation. It's only good enough for what media needs, and essentially what media should have done to begin with.
There's substantially different behaviour between SINGLE_TIMELINE and what Jason has done here when you race concurrent execbuf calls: Former guarantees total ordering, the latter doesn't even try. They are not the same thing, but luckily userspace doesn't care about that difference.
Aside, just to make sure this wont get lost: I do agree that we should only allow this up to maybe ADL, and reject it on anything new (maybe including dg1 while we're at it, since the pci ids for that aren't even close to upstream yet). -Daniel
On 22/03/2021 16:43, Daniel Vetter wrote:
Sounds like a very important difference to stress in the commit message.
Secondly, I am unclear whether we have agreement on whether the single timeline flag is leaking implementation details of the execlists scheduler to userspace or not?
Regards,
Tvrtko
On Tue, Mar 23, 2021 at 09:14:36AM +0000, Tvrtko Ursulin wrote:
I do think Jason and I agree that it does leak an internal concept to userspace that we shouldn't leak.
I'm honestly not entirely understanding your argument for why single_timeline isn't an internal concept somehow, and how exposing it to userspace doesn't leak that concept to userspace. Whether internally that concept is now perfectly represented by just struct intel_timeline, or maybe more the seqno/hswp, or more diffused through the code doesn't really change that we have an internal concept that we're now exposing for sharing in ways that wasn't possible before. -Daniel
On 23/03/2021 13:23, Daniel Vetter wrote:
Don't know, obviously we think with very different paradigms.
A GEM context always had as many timelines as there are engines in its map, so multiple timelines is the default mode everyone is aware of.
The single timeline flag added a new mode where, instead of multiple timelines, a single GEM context becomes a single timeline.
The fact that userspace can achieve single timeline execution on its own should be argument enough that it is not a new concept that got leaked out. Definitely not any backend specific implementation details. It simply added a new feature which may or may not have been needed.
Regards,
Tvrtko
P.S. Or rename the flag in your mind to "I915_GEM_CONTEXT_SERIAL_EXECUTION" or something and see if that still leaks the timeline or some implementation details.
P.P.S. Keep in mind I am arguing about the wording of the single timeline flag removal. Removal of timeline cloning is not controversial.
On Tue, Mar 23, 2021 at 11:23 AM Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
I just commented on the SINGLE_TIMELINE patch and will send the v3 momentarily. I think you'll find the commit message much more to your liking. :-)
--Jason
I'd love to delete the SINGLE_TIMELINE API because it leaks an implementation detail of contexts through to the API and is something that userspace can do itself, trivially. Unfortunately, it's used by the media driver so we can't do that. We can, however, do the next-best thing which is to embed a syncobj in the context and do exactly what we'd expect from userspace internally.
This has a couple of advantages. One is that we're no longer leaking a detail of the current execlist scheduler which will be problematic when we try to add GuC scheduling. Second is that, together with deleting the CLONE_CONTEXT API, we should now have a 1:1 mapping between intel_context and intel_timeline which should make some of our locking mess a bit easier.
Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c    | 47 ++++---------------
 .../gpu/drm/i915/gem/i915_gem_context_types.h  |  8 +++-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c     | 15 ++++++
 3 files changed, 32 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index f88bac19333ec..e094f4a1ca4cd 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -67,6 +67,8 @@
 #include <linux/log2.h>
 #include <linux/nospec.h>

+#include <drm/drm_syncobj.h>
+
 #include "gt/gen6_ppgtt.h"
 #include "gt/intel_context.h"
 #include "gt/intel_engine_heartbeat.h"
@@ -224,10 +226,6 @@ static void intel_context_set_gem(struct intel_context *ce,
 		ce->vm = vm;
 	}

-	GEM_BUG_ON(ce->timeline);
-	if (ctx->timeline)
-		ce->timeline = intel_timeline_get(ctx->timeline);
-
 	if (ctx->sched.priority >= I915_PRIORITY_NORMAL &&
 	    intel_engine_has_timeslices(ce->engine))
 		__set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags);
@@ -344,8 +342,8 @@ void i915_gem_context_release(struct kref *ref)
 	mutex_destroy(&ctx->engines_mutex);
 	mutex_destroy(&ctx->lut_mutex);

-	if (ctx->timeline)
-		intel_timeline_put(ctx->timeline);
+	if (ctx->syncobj)
+		drm_syncobj_put(ctx->syncobj);

 	put_pid(ctx->pid);
 	mutex_destroy(&ctx->mutex);
@@ -790,33 +788,11 @@ static void __assign_ppgtt(struct i915_gem_context *ctx,
 	i915_vm_close(vm);
 }

-static void __set_timeline(struct intel_timeline **dst,
-			   struct intel_timeline *src)
-{
-	struct intel_timeline *old = *dst;
-
-	*dst = src ? intel_timeline_get(src) : NULL;
-
-	if (old)
-		intel_timeline_put(old);
-}
-
-static void __apply_timeline(struct intel_context *ce, void *timeline)
-{
-	__set_timeline(&ce->timeline, timeline);
-}
-
-static void __assign_timeline(struct i915_gem_context *ctx,
-			      struct intel_timeline *timeline)
-{
-	__set_timeline(&ctx->timeline, timeline);
-	context_apply_all(ctx, __apply_timeline, timeline);
-}
-
 static struct i915_gem_context *
 i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags)
 {
 	struct i915_gem_context *ctx;
+	int ret;

 	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
 	    !HAS_EXECLISTS(i915))
@@ -845,16 +821,13 @@ i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags)
 	}

 	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE) {
-		struct intel_timeline *timeline;
-
-		timeline = intel_timeline_create(&i915->gt);
-		if (IS_ERR(timeline)) {
+		ret = drm_syncobj_create(&ctx->syncobj,
+					 DRM_SYNCOBJ_CREATE_SIGNALED,
+					 NULL);
+		if (ret) {
 			context_close(ctx);
-			return ERR_CAST(timeline);
+			return ERR_PTR(ret);
 		}
-
-		__assign_timeline(ctx, timeline);
-		intel_timeline_put(timeline);
 	}

 	trace_i915_context_create(ctx);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 676592e27e7d2..8a5fdd163b79d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -83,7 +83,13 @@ struct i915_gem_context {
 	struct i915_gem_engines __rcu *engines;
 	struct mutex engines_mutex; /* guards writes to engines */

-	struct intel_timeline *timeline;
+	/**
+	 * @syncobj: Shared timeline syncobj
+	 *
+	 * When the SHARED_TIMELINE flag is set on context creation, this
+	 * provides automatic implicit synchronization across all engines.
+	 */
+	struct drm_syncobj *syncobj;

 	/**
 	 * @vm: unique address space (GTT)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 96403130a373d..2c56796f6a71b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -3295,6 +3295,15 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		goto err_vma;
 	}

+	if (eb.gem_context->syncobj) {
+		struct dma_fence *fence;
+
+		fence = drm_syncobj_fence_get(eb.gem_context->syncobj);
+		err = i915_request_await_dma_fence(eb.request, fence);
+		if (err)
+			goto err_ext;
+	}
+
 	if (in_fence) {
 		if (args->flags & I915_EXEC_FENCE_SUBMIT)
 			err = i915_request_await_execution(eb.request,
@@ -3351,6 +3360,12 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 			fput(out_fence->file);
 		}
 	}
+
+	if (eb.gem_context->syncobj) {
+		drm_syncobj_replace_fence(eb.gem_context->syncobj,
+					  &eb.request->fence);
+	}
+
 	i915_request_put(eb.request);

 err_vma:
On 19/03/2021 22:38, Jason Ekstrand wrote:
Narrative needs to be corrected as with the previous patch.
Mess or complexity? Could you expand with some details so it's easier to understand? (I am thinking what gets easier, how and why, if this is done.)
If removal works out I would suggest deprecating the flag starting from some future platform. Maybe for GT gen greater than 12 you could already start rejecting it in order to future-proof, along the lines of the sketch below.
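Something along these lines, purely as an illustration (the exact check and its placement in i915_gem_create_context() are made up):

  /* Hypothetical future-proofing sketch; the gen cutoff is an assumption. */
  if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
      INTEL_GEN(i915) > 12)
          return ERR_PTR(-EINVAL);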
Who drops this reference?
So this is essentially moving the synchronisation to the top level, which is extra work, but given the limited and questionable usage of the uapi it may be acceptable. I need the full picture on motivation to understand.
Semantics are also not 1:1 since the dma fence context will be different. So it is not fully a single timeline as before, but just implicitly serialised execution. Again, due to limited usage this may not be a problem. Worth spelling out in the commit message though.
Regards,
Tvrtko
On Mon, Mar 22, 2021 at 7:28 AM Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
Both? I guess "complexity" is a less abrasive way of stating it. I've not dug into the actual refactor yet but we should be able to drop the whole RCU business on intel_timeline that we have right now just to facilitate sharing like this. Fewer objects that are shared deep inside i915 with their own locks and RCUs seems like an advantage to me. Especially when we already have nice generic infrastructure for this.
i915_request_await_dma_fence() below consumes a reference.
For one thing, the GuC scheduler doesn't natively have a concept of "timelines" which can be shared like this. To work with the GuC scheduler as currently proposed in DII, they've asked the media driver to stop using this flag in favor of passing a sync file from batch to batch. If we want to slide GuC scheduling in smoothly, we've got to keep it working. This means either making timelines a concept there or doing an emulation like this.
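For illustration, the batch-to-batch sync file chaining asked of the media driver could look roughly like this; 'fd', 'execbuf' and the batch loop are assumptions, and error handling is elided:

  /* Hedged sketch: each submit waits on the previous submit's out-fence. */
  int prev = -1, i;

  for (i = 0; i < num_batches; i++) {
          execbuf.flags |= I915_EXEC_FENCE_OUT;
          if (prev >= 0) {
                  execbuf.flags |= I915_EXEC_FENCE_IN;
                  execbuf.rsvd2 = prev;           /* in-fence fd (low 32 bits) */
          }

          drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);

          if (prev >= 0)
                  close(prev);
          prev = execbuf.rsvd2 >> 32;             /* out-fence fd (high 32 bits) */
  }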
Semantics are also not 1:1 since dma fence context will be different.
Could you elaborate?
--Jason
On 22/03/2021 16:10, Jason Ekstrand wrote:
On Mon, Mar 22, 2021 at 7:28 AM Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
[snip]
Not sure, please check on difference wrt input fence handling.
Confused - neither does execlists. It is handled in a common layer in i915. The GuC scheduler has the same concept of one hw context being one timeline because that is a hw concept and not backend specific.
Hm, I am not aware of that, and I don't see that the GuC backend can't or doesn't implement this. Perhaps this would be best discussed once the GuC patches are posted.
Semantics are also not 1:1 since dma fence context will be different.
Could you elaborate?
The exported dma fence context, as a "timeline" id, will be a single one with the current code and multiple ones with this implementation.
Daniel also raised another difference here, caused by the lack of serialisation due to there being multiple tl->mutex locks.
I don't think this is important; it was never part of any contract what happens with racing execbufs. But both topics definitely need covering in the commit message.
Regards,
Tvrtko
On Tue, Mar 23, 2021 at 4:35 AM Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
Gah. You were right. It takes a reference if it needs one. I thought I was being symmetric with the other syncobj usage but it was well hidden inside our confusing tear-down paths.
At Daniel's request, I've wrapped these in unlikely()
I've updated the commit message as follows:
drm/i915: Implement SINGLE_TIMELINE with a syncobj (v2)
This API is entirely unnecessary and I'd love to get rid of it. If userspace wants a single timeline across multiple contexts, they can either use implicit synchronization or a syncobj, both of which existed at the time this feature landed. The justification given at the time was that it would help GL drivers which are inherently single-timeline. However, neither of our GL drivers actually wanted the feature. i965 was already in maintenance mode at the time and iris uses syncobj for everything.
Unfortunately, as much as I'd love to get rid of it, it is used by the media driver so we can't do that. We can, however, do the next-best thing which is to embed a syncobj in the context and do exactly what we'd expect from userspace internally. This isn't an entirely identical implementation because it's no longer atomic if userspace races with itself by calling execbuffer2 twice simultaneously from different threads. It won't crash in that case; it just doesn't guarantee any ordering between those two submits.
Moving SINGLE_TIMELINE to a syncobj emulation has a couple of technical advantages beyond mere annoyance. One is that intel_timeline is no longer an api-visible object and can remain entirely an implementation detail. This may be advantageous as we make scheduler changes going forward. Second is that, together with deleting the CLONE_CONTEXT API, we should now have a 1:1 mapping between intel_context and intel_timeline which may help us reduce locking.
I hope that clears up some of the confusion and is less bothersome.
--Jason
On Fri, Mar 19, 2021 at 05:38:56PM -0500, Jason Ekstrand wrote:
I think we should explain a bit more what this actually does, i.e. emulate it using a syncobj, which unlike the real timeline does not make any total order guarantees. That's fine since userspace doesn't expect that.
I think would be good to wrap these in unlikely, least also because it's nice documentation. -Daniel
On Mon, Mar 22, 2021 at 11:59 AM Daniel Vetter <daniel@ffwll.ch> wrote:
New comment:
	/**
	 * @syncobj: Shared timeline syncobj
	 *
	 * When the SHARED_TIMELINE flag is set on context creation, we
	 * emulate a single timeline across all engines using this syncobj.
	 * For every execbuffer2 call, this syncobj is used as both an in-
	 * and out-fence. Unlike the real intel_timeline, this doesn't
	 * provide perfect atomic in-order guarantees if the client races
	 * with itself by calling execbuffer2 twice concurrently. However,
	 * if userspace races with itself, that's not likely to yield well-
	 * defined results anyway so we choose to not care.
	 */
Done.
This API is entirely unnecessary and I'd love to get rid of it. If userspace wants a single timeline across multiple contexts, they can either use implicit synchronization or a syncobj, both of which existed at the time this feature landed. The justification given at the time was that it would help GL drivers which are inherently single-timeline. However, neither of our GL drivers actually wanted the feature. i965 was already in maintenance mode at the time and iris uses syncobj for everything.
Unfortunately, as much as I'd love to get rid of it, it is used by the media driver so we can't do that. We can, however, do the next-best thing which is to embed a syncobj in the context and do exactly what we'd expect from userspace internally. This isn't an entirely identical implementation because it's no longer atomic if userspace races with itself by calling execbuffer2 twice simultaneously from different threads. It won't crash in that case; it just doesn't guarantee any ordering between those two submits.
Moving SINGLE_TIMELINE to a syncobj emulation has a couple of technical advantages beyond mere annoyance. One is that intel_timeline is no longer an api-visible object and can remain entirely an implementation detail. This may be advantageous as we make scheduler changes going forward. Second is that, together with deleting the CLONE_CONTEXT API, we should now have a 1:1 mapping between intel_context and intel_timeline which may help us reduce locking.
v2 (Jason Ekstrand):
 - Update the comment on i915_gem_context::syncobj to mention that it's
   an emulation and the possible race if userspace calls execbuffer2
   twice on the same context concurrently.
 - Wrap the checks for eb.gem_context->syncobj in unlikely()
 - Drop the dma_fence reference
 - Improved commit message
Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c    | 47 ++++---------------
 .../gpu/drm/i915/gem/i915_gem_context_types.h  | 14 +++++-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c     | 16 +++++++
 3 files changed, 39 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index f88bac19333ec..e094f4a1ca4cd 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -67,6 +67,8 @@
 #include <linux/log2.h>
 #include <linux/nospec.h>

+#include <drm/drm_syncobj.h>
+
 #include "gt/gen6_ppgtt.h"
 #include "gt/intel_context.h"
 #include "gt/intel_engine_heartbeat.h"
@@ -224,10 +226,6 @@ static void intel_context_set_gem(struct intel_context *ce,
 		ce->vm = vm;
 	}

-	GEM_BUG_ON(ce->timeline);
-	if (ctx->timeline)
-		ce->timeline = intel_timeline_get(ctx->timeline);
-
 	if (ctx->sched.priority >= I915_PRIORITY_NORMAL &&
 	    intel_engine_has_timeslices(ce->engine))
 		__set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags);
@@ -344,8 +342,8 @@ void i915_gem_context_release(struct kref *ref)
 	mutex_destroy(&ctx->engines_mutex);
 	mutex_destroy(&ctx->lut_mutex);

-	if (ctx->timeline)
-		intel_timeline_put(ctx->timeline);
+	if (ctx->syncobj)
+		drm_syncobj_put(ctx->syncobj);

 	put_pid(ctx->pid);
 	mutex_destroy(&ctx->mutex);
@@ -790,33 +788,11 @@ static void __assign_ppgtt(struct i915_gem_context *ctx,
 	i915_vm_close(vm);
 }

-static void __set_timeline(struct intel_timeline **dst,
-			   struct intel_timeline *src)
-{
-	struct intel_timeline *old = *dst;
-
-	*dst = src ? intel_timeline_get(src) : NULL;
-
-	if (old)
-		intel_timeline_put(old);
-}
-
-static void __apply_timeline(struct intel_context *ce, void *timeline)
-{
-	__set_timeline(&ce->timeline, timeline);
-}
-
-static void __assign_timeline(struct i915_gem_context *ctx,
-			      struct intel_timeline *timeline)
-{
-	__set_timeline(&ctx->timeline, timeline);
-	context_apply_all(ctx, __apply_timeline, timeline);
-}
-
 static struct i915_gem_context *
 i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags)
 {
 	struct i915_gem_context *ctx;
+	int ret;

 	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
 	    !HAS_EXECLISTS(i915))
@@ -845,16 +821,13 @@ i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags)
 	}

 	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE) {
-		struct intel_timeline *timeline;
-
-		timeline = intel_timeline_create(&i915->gt);
-		if (IS_ERR(timeline)) {
+		ret = drm_syncobj_create(&ctx->syncobj,
+					 DRM_SYNCOBJ_CREATE_SIGNALED,
+					 NULL);
+		if (ret) {
 			context_close(ctx);
-			return ERR_CAST(timeline);
+			return ERR_PTR(ret);
 		}
-
-		__assign_timeline(ctx, timeline);
-		intel_timeline_put(timeline);
 	}

 	trace_i915_context_create(ctx);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 676592e27e7d2..df76767f0c41b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -83,7 +83,19 @@ struct i915_gem_context {
 	struct i915_gem_engines __rcu *engines;
 	struct mutex engines_mutex; /* guards writes to engines */

-	struct intel_timeline *timeline;
+	/**
+	 * @syncobj: Shared timeline syncobj
+	 *
+	 * When the SHARED_TIMELINE flag is set on context creation, we
+	 * emulate a single timeline across all engines using this syncobj.
+	 * For every execbuffer2 call, this syncobj is used as both an in-
+	 * and out-fence. Unlike the real intel_timeline, this doesn't
+	 * provide perfect atomic in-order guarantees if the client races
+	 * with itself by calling execbuffer2 twice concurrently. However,
+	 * if userspace races with itself, that's not likely to yield well-
+	 * defined results anyway so we choose to not care.
+	 */
+	struct drm_syncobj *syncobj;

 	/**
 	 * @vm: unique address space (GTT)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 96403130a373d..2e9748c1edddf 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -3295,6 +3295,16 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		goto err_vma;
 	}

+	if (unlikely(eb.gem_context->syncobj)) {
+		struct dma_fence *fence;
+
+		fence = drm_syncobj_fence_get(eb.gem_context->syncobj);
+		err = i915_request_await_dma_fence(eb.request, fence);
+		if (err)
+			goto err_ext;
+		dma_fence_put(fence);
+	}
+
 	if (in_fence) {
 		if (args->flags & I915_EXEC_FENCE_SUBMIT)
 			err = i915_request_await_execution(eb.request,
@@ -3351,6 +3361,12 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 			fput(out_fence->file);
 		}
 	}
+
+	if (unlikely(eb.gem_context->syncobj)) {
+		drm_syncobj_replace_fence(eb.gem_context->syncobj,
+					  &eb.request->fence);
+	}
+
 	i915_request_put(eb.request);

 err_vma:
On 23/03/2021 17:51, Jason Ekstrand wrote:
Much, much better commit message, although I still fail to understand where you see implementation details leaking out. So for me this is still something I'd like to get to the bottom of.
I would also mention the difference regarding fence context change.
And in general I would maintain this patch as part of a series which ends up demonstrating the "mays" and "shoulds".
I think put goes before the error bail.
Regards,
Tvrtko
On Wed, Mar 24, 2021 at 09:28:58AM +0000, Tvrtko Ursulin wrote:
I disagree. The past few years we've merged way too many patches and features without carefully answering the high level questions of: do we really need to solve this problem, and if so, are we really solving this problem in the right place?
Now we're quite deep in a hole, and we're not going to get out of it if we keep applying the same standards that got us here. Anything that does not clearly, and without reservation, answer the above two questions with "yes" needs to be removed or walled off, just so we can eventually see which complexity we really need and what is actually superfluous.
Especially when the kernel patch is this simple. -Daniel
On 24/03/2021 09:52, Daniel Vetter wrote:
I understand your general point, but when I apply it to this specific patch, even if it is simple, it is neither removing the uAPI nor walling it off. So I see it as the usual review standard to ask what the "mays" and "shoulds" actually get us in concrete terms.
I would be able to understand putting the uAPI behind an "if gen > 12 || is_discrete, EINVAL" check, or whatever, since it is fair game to deprecate for any new platform, or say for GuC submission. A rough sketch of what I mean is below.
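Something along these lines (hypothetical gating only; the exact condition and the macros used, INTEL_GEN()/IS_DGFX(), are illustrative, not a concrete proposal):

	/* Hypothetical: refuse SINGLE_TIMELINE on anything newer than Gen12. */
	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
	    (INTEL_GEN(i915) > 12 || IS_DGFX(i915)))
		return ERR_PTR(-EINVAL);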
Not simply doing that makes me worry there are still misunderstandings about what kind of problems were encountered with regards to intel_timeline, by this work item or that, and that there is still confusion about what the internal timeline object is versus the internal hwsp object. I feel there hasn't been full transparency on these technical issues, which is why I think seeing the actual series which is supposed to build on top of this would be helpful.
I even worry that we still have a big disconnect on whether this flag leaks any internal implementation details out to userspace. If the commit message has been reworded without actual agreement that implementation details are not exposed, we will continue disagreeing going forward, which is not a good start.
We have exchanged so many emails on this but I don't feel we are getting anywhere, so I really have no idea - obviously you will steamroll this in regardless, so I don't think there is any point arguing further.
Regards,
Tvrtko
On Wed, Mar 24, 2021 at 6:36 AM Tvrtko Ursulin tvrtko.ursulin@linux.intel.com wrote:
I didn't say "leaking". I said it's no longer API-visible. That's just true. It used to be a kernel object that userspace was unaware of, then we added SINGLE_TIMELINE and userspace now has some influence over the object. With this patch, it's hidden again. I don't get why that's confusing.
> I would also mention the difference regarding the fence context change.
There are no fence context changes. The fence that we stuff in the syncobj is an i915 fence and the fence we pull back out is an i915 fence. A syncobj is just a fancy wrapper around a dma_fence pointer.
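Conceptually it's little more than this (a simplified sketch, not the real struct drm_syncobj, which also carries a callback list and its own locking):

	/* Toy model: a syncobj is a refcounted, replaceable dma_fence slot. */
	struct toy_syncobj {
		struct kref refcount;
		struct dma_fence __rcu *fence;	/* current fence; swapped on replace */
	};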
Instead of focusing on the term "leak", let's focus on this line of the commit message:

> we should now have a 1:1 mapping between intel_context and intel_timeline which may help us reduce locking
Now, I've not written any patches yet which actually reduce the locking. I can try to look into that today. I CC'd Maarten on the first send of this because I was hoping he would have good intuition about it. It may be that this object will always require some amount of locking if the scheduler has to touch it in parallel with other stuff. What I can say concretely, however, is that this patch does reduce the sharing of an internal object, even if it doesn't get rid of it completely. The one thing that is shared all over the place with this patch is a syncobj, which is explicitly designed for exactly this sort of case and can be used and abused by as many threads as you like. That seems like it's going in the right direction.
I can further weasel-word the commit message to make the above more prominent if you'd like.
Fixed locally. It'll be in v3.
--Jason
On 24/03/2021 17:18, Jason Ekstrand wrote:
I am definitely glad we moved on from "leaking" to less dramatic wording, but it is still not API-"visible", inasmuch as a struct file_operations pointer is not user-visible in any negative way when you do open(2); it is just an implementation detail. But I give up.
The change is in dma_fence->context.

In the current code, a single timeline is one fence context. With this patch that is no longer the case.

I am not sure whether it has any practical implications for the internal dma_fence code, like the is_later checks; I haven't analysed it.

And the sync fence info ioctl exposes this value to userspace, which probably has no practical implications, unless there is some indirect effect when merging fences.
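For reference, the in-kernel ordering check is roughly this (a sketch of dma_fence_is_later(); the real code also handles seqno wraparound and differs across kernel versions):

	/* Ordering is only defined between fences on the same context. */
	static bool is_later(struct dma_fence *f1, struct dma_fence *f2)
	{
		if (f1->context != f2->context)
			return false;	/* different timelines: unordered */
		return f1->seqno > f2->seqno;	/* simplified comparison */
	}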
The main point is that I think these are the things which need to be discussed in the commit message.
I am not interested in making you weasel-word anything, but in reaching a consensus on what is actually true and accurate.
Regards,
Tvrtko
On Thu, Mar 25, 2021 at 10:48 AM Tvrtko Ursulin tvrtko.ursulin@linux.intel.com wrote:
We merge fewer fences at the higher levels and rely more on the fence dependency tracking of the scheduler frontend to sort stuff out.
Userspace can use that to do fence merging, which is only ever a strict optimization. I'm not even sure whether Android does that or not.
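Something like this on the userspace side (a sketch using the sync_file merge ioctl; error handling trimmed, function name illustrative):

	#include <linux/sync_file.h>
	#include <sys/ioctl.h>

	/* Merge two sync_file fds into one that signals once both have signaled. */
	int merge_sync_files(int fd1, int fd2)
	{
		struct sync_merge_data data = {
			.name = "merged",
			.fd2 = fd2,
		};

		if (ioctl(fd1, SYNC_IOC_MERGE, &data) < 0)
			return -1;
		return data.fence;	/* new fd representing fd1 AND fd2 */
	}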
> The main point is that I think these are the things which need to be discussed in the commit message.
Yeah, makes sense to add these as impact, next to the "we don't deal with races anymore" part. -Daniel
On Tue, Mar 23, 2021 at 12:51:49PM -0500, Jason Ekstrand wrote:
Yeah I think this captures everything we need to say here.
Acked-by: Daniel Vetter daniel.vetter@ffwll.ch
No full r-b because I have a pile of other things to do too. -Daniel
On Tue, Mar 23, 2021 at 12:51:49PM -0500, Jason Ekstrand wrote:
I'm not going to dive into the philosophical debate this patch has been pigeonholed into, but a 1:1 mapping between intel_context and intel_timeline is a huge win IMO - these are the types of cleanups I can really get behind.
Also, I think this needs to be moved above the error checking, as this put corresponds to the 'drm_syncobj_fence_get', right?
Matt
On Thu, Mar 25, 2021 at 4:21 PM Matthew Brost matthew.brost@intel.com wrote:
Already done. Let me send v3.
This API is entirely unnecessary and I'd love to get rid of it. If userspace wants a single timeline across multiple contexts, they can either use implicit synchronization or a syncobj, both of which existed at the time this feature landed. The justification given at the time was that it would help GL drivers which are inherently single-timeline. However, neither of our GL drivers actually wanted the feature. i965 was already in maintenance mode at the time and iris uses syncobj for everything.
Unfortunately, as much as I'd love to get rid of it, it is used by the media driver so we can't do that. We can, however, do the next-best thing which is to embed a syncobj in the context and do exactly what we'd expect from userspace internally. This isn't an entirely identical implementation because it's no longer atomic if userspace races with itself by calling execbuffer2 twice simultaneously from different threads. It won't crash in that case; it just doesn't guarantee any ordering between those two submits.
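For context, this is roughly the userspace pattern being emulated internally on the context's embedded syncobj (a sketch using the existing I915_EXEC_FENCE_ARRAY execbuffer2 extension; variable names are illustrative):

	/* One syncobj serves as both in- and out-fence on every submit. */
	struct drm_i915_gem_exec_fence fence = {
		.handle = timeline_syncobj,	/* syncobj created once up front */
		.flags = I915_EXEC_FENCE_WAIT | I915_EXEC_FENCE_SIGNAL,
	};

	execbuf.flags |= I915_EXEC_FENCE_ARRAY;
	execbuf.cliprects_ptr = (__u64)(uintptr_t)&fence;	/* fence array reuses cliprects */
	execbuf.num_cliprects = 1;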
Moving SINGLE_TIMELINE to a syncobj emulation has a couple of technical advantages beyond merely removing an annoyance. The first is that intel_timeline is no longer an API-visible object and can remain entirely an implementation detail. This may be advantageous as we make scheduler changes going forward. The second is that, together with deleting the CONTEXT_CLONE API, we should now have a 1:1 mapping between intel_context and intel_timeline, which may help us reduce locking.
v2 (Jason Ekstrand): - Update the comment on i915_gem_context::syncobj to mention that it's an emulation and the possible race if userspace calls execbuffer2 twice on the same context concurrently. - Wrap the checks for eb.gem_context->syncobj in unlikely() - Drop the dma_fence reference - Improved commit message
v3 (Jason Ekstrand): - Move the dma_fence_put() to before the error exit
Signed-off-by: Jason Ekstrand jason@jlekstrand.net Cc: Maarten Lankhorst maarten.lankhorst@linux.intel.com Cc: Matthew Brost matthew.brost@intel.com --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 47 ++++--------------- .../gpu/drm/i915/gem/i915_gem_context_types.h | 14 +++++- .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 16 +++++++ 3 files changed, 39 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index ea449be2672c2..87d1715e354fe 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -67,6 +67,8 @@ #include <linux/log2.h> #include <linux/nospec.h>
+#include <drm/drm_syncobj.h> + #include "gt/gen6_ppgtt.h" #include "gt/intel_context.h" #include "gt/intel_engine_heartbeat.h" @@ -225,10 +227,6 @@ static void intel_context_set_gem(struct intel_context *ce, ce->vm = vm; }
- GEM_BUG_ON(ce->timeline); - if (ctx->timeline) - ce->timeline = intel_timeline_get(ctx->timeline); - if (ctx->sched.priority >= I915_PRIORITY_NORMAL && intel_engine_has_timeslices(ce->engine)) __set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags); @@ -365,8 +363,8 @@ void i915_gem_context_release(struct kref *ref) if (ctx->client) i915_drm_client_put(ctx->client);
- if (ctx->timeline) - intel_timeline_put(ctx->timeline); + if (ctx->syncobj) + drm_syncobj_put(ctx->syncobj);
mutex_destroy(&ctx->engines_mutex); mutex_destroy(&ctx->lut_mutex); @@ -820,33 +818,11 @@ static void __assign_ppgtt(struct i915_gem_context *ctx, i915_vm_close(vm); }
-static void __set_timeline(struct intel_timeline **dst, - struct intel_timeline *src) -{ - struct intel_timeline *old = *dst; - - *dst = src ? intel_timeline_get(src) : NULL; - - if (old) - intel_timeline_put(old); -} - -static void __apply_timeline(struct intel_context *ce, void *timeline) -{ - __set_timeline(&ce->timeline, timeline); -} - -static void __assign_timeline(struct i915_gem_context *ctx, - struct intel_timeline *timeline) -{ - __set_timeline(&ctx->timeline, timeline); - context_apply_all(ctx, __apply_timeline, timeline); -} - static struct i915_gem_context * i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags) { struct i915_gem_context *ctx; + int ret;
if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE && !HAS_EXECLISTS(i915)) @@ -875,16 +851,13 @@ i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags) }
if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE) { - struct intel_timeline *timeline; - - timeline = intel_timeline_create(&i915->gt); - if (IS_ERR(timeline)) { + ret = drm_syncobj_create(&ctx->syncobj, + DRM_SYNCOBJ_CREATE_SIGNALED, + NULL); + if (ret) { context_close(ctx); - return ERR_CAST(timeline); + return ERR_PTR(ret); } - - __assign_timeline(ctx, timeline); - intel_timeline_put(timeline); }
trace_i915_context_create(ctx); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index 886515b8e7a7b..e38574525b0f2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -83,7 +83,19 @@ struct i915_gem_context { struct i915_gem_engines __rcu *engines; struct mutex engines_mutex; /* guards writes to engines */
- struct intel_timeline *timeline;
+ /**
+ * @syncobj: Shared timeline syncobj
+ *
+ * When the SINGLE_TIMELINE flag is set on context creation, we
+ * emulate a single timeline across all engines using this syncobj.
+ * For every execbuffer2 call, this syncobj is used as both an in-
+ * and out-fence. Unlike the real intel_timeline, this doesn't
+ * provide perfect atomic in-order guarantees if the client races
+ * with itself by calling execbuffer2 twice concurrently. However,
+ * if userspace races with itself, that's not likely to yield well-
+ * defined results anyway so we choose to not care.
+ */
+ struct drm_syncobj *syncobj;
/** * @vm: unique address space (GTT) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 339af3aa5174e..6ff046dd19bd0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -3288,6 +3288,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, goto err_vma; }
+ if (unlikely(eb.gem_context->syncobj)) { + struct dma_fence *fence; + + fence = drm_syncobj_fence_get(eb.gem_context->syncobj); + err = i915_request_await_dma_fence(eb.request, fence); + dma_fence_put(fence); + if (err) + goto err_ext; + } + if (in_fence) { if (args->flags & I915_EXEC_FENCE_SUBMIT) err = i915_request_await_execution(eb.request, @@ -3344,6 +3354,12 @@ i915_gem_do_execbuffer(struct drm_device *dev, fput(out_fence->file); } } + + if (unlikely(eb.gem_context->syncobj)) { + drm_syncobj_replace_fence(eb.gem_context->syncobj, + &eb.request->fence); + } + i915_request_put(eb.request);
err_vma: