Reserve 0 for general use as a token meaning that the fence doesn't belong to an ordered timeline (fence context).
Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Sumit Semwal sumit.semwal@linaro.org Cc: Gustavo Padovan gustavo@padovan.org Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: "Christian König" <christian.koenig@amd.com> Cc: Daniel Vetter daniel.vetter@ffwll.ch Cc: Alex Deucher alexander.deucher@amd.com --- drivers/dma-buf/dma-fence.c | 4 +++- include/linux/dma-fence.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 0918d3f003d6..0646357ea350 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -36,8 +36,10 @@ EXPORT_TRACEPOINT_SYMBOL(dma_fence_enable_signal); * fence context, this allows checking if fences belong to the same * context or not. One device can have multiple separate contexts, * and they're used if some engine can run independently of another. + * + * 0 is excluded and treated as a special DMA_FENCE_NO_CONTEXT. */ -static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(0); +static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(1);
/** * dma_fence_context_alloc - allocate an array of fence contexts diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..adfdc7fdd9c3 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -455,6 +455,8 @@ static inline signed long dma_fence_wait(struct dma_fence *fence, bool intr) return ret < 0 ? ret : 0; }
+#define DMA_FENCE_NO_CONTEXT ((u64)0) + u64 dma_fence_context_alloc(unsigned num);
#define DMA_FENCE_TRACE(f, fmt, args...) \
2 clflushes on two different objects are not ordered, and so do not belong to the same timeline (context). Either we use a unique context for each, or we reserve a special context (0 / DMA_FENCE_NO_CONTEXT) to mean unordered.
Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Sumit Semwal sumit.semwal@linaro.org Cc: Gustavo Padovan gustavo@padovan.org Cc: Daniel Vetter daniel.vetter@ffwll.ch Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com --- drivers/gpu/drm/i915/i915_gem.c | 2 -- drivers/gpu/drm/i915/i915_gem_clflush.c | 8 +------- drivers/gpu/drm/i915/i915_gem_clflush.h | 1 - 3 files changed, 1 insertion(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index b210acc3d0b4..6dacc5c21889 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -4681,8 +4681,6 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
mutex_lock(&dev_priv->drm.struct_mutex);
- i915_gem_clflush_init(dev_priv); - if (!i915.enable_execlists) { dev_priv->gt.resume = intel_legacy_submission_resume; dev_priv->gt.cleanup_engine = intel_engine_cleanup; diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c index ffd01e02fe94..57ee389c665f 100644 --- a/drivers/gpu/drm/i915/i915_gem_clflush.c +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c @@ -27,7 +27,6 @@ #include "i915_gem_clflush.h"
static DEFINE_SPINLOCK(clflush_lock); -static u64 clflush_context;
struct clflush { struct dma_fence dma; /* Must be first for dma_fence_free() */ @@ -157,7 +156,7 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj, dma_fence_init(&clflush->dma, &i915_clflush_ops, &clflush_lock, - clflush_context, + DMA_FENCE_NO_CONTEXT, 0); i915_sw_fence_init(&clflush->wait, i915_clflush_notify);
@@ -182,8 +181,3 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj, GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU); } } - -void i915_gem_clflush_init(struct drm_i915_private *i915) -{ - clflush_context = dma_fence_context_alloc(1); -} diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.h b/drivers/gpu/drm/i915/i915_gem_clflush.h index b62d61a2d15f..2455a7820937 100644 --- a/drivers/gpu/drm/i915/i915_gem_clflush.h +++ b/drivers/gpu/drm/i915/i915_gem_clflush.h @@ -28,7 +28,6 @@ struct drm_i915_private; struct drm_i915_gem_object;
-void i915_gem_clflush_init(struct drm_i915_private *i915); void i915_gem_clflush_object(struct drm_i915_gem_object *obj, unsigned int flags); #define I915_CLFLUSH_FORCE BIT(0)
Track the latest fence waited upon on each context, and only add a new asynchronous wait if the new fence is more recent than the recorded fence for that context. This requires us to filter out unordered timelines, which are noted by DMA_FENCE_NO_CONTEXT.
Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Tvrtko Ursulin tvrtko.ursulin@intel.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com --- drivers/gpu/drm/i915/i915_gem_request.c | 33 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_gem_request.h | 2 ++ lib/radix-tree.c | 1 + 3 files changed, 36 insertions(+)
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 313cdff7c6dd..c184f1d26f25 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -606,6 +606,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
i915_priotree_init(&req->priotree);
+ INIT_RADIX_TREE(&req->waits, GFP_KERNEL); INIT_LIST_HEAD(&req->active_list); req->i915 = dev_priv; req->engine = engine; @@ -723,6 +724,27 @@ i915_gem_request_await_dma_fence(struct drm_i915_gem_request *req, if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) return 0;
+ /* Squash repeated waits to the same timelines, picking the latest */ + if (fence->context != DMA_FENCE_NO_CONTEXT) { + void __rcu **slot; + + slot = radix_tree_lookup_slot(&req->waits, fence->context); + if (!slot) { + ret = radix_tree_insert(&req->waits, + fence->context, fence); + if (ret) + return ret; + } else { + struct dma_fence *old = + rcu_dereference_protected(*slot, true); + + if (!dma_fence_is_later(fence, old)) + return 0; + + radix_tree_replace_slot(&req->waits, slot, fence); + } + } + if (dma_fence_is_i915(fence)) return i915_gem_request_await_request(req, to_request(fence));
@@ -843,6 +865,15 @@ static void i915_gem_mark_busy(const struct intel_engine_cs *engine) round_jiffies_up_relative(HZ)); }
+static void free_radixtree(struct radix_tree_root *root) +{ + struct radix_tree_iter iter; + void __rcu **slot; + + radix_tree_for_each_slot(slot, root, &iter, 0) + radix_tree_iter_delete(root, &iter, slot); +} + /* * NB: This function is not allowed to fail. Doing so would mean the the * request is not being tracked for completion but the work itself is @@ -943,6 +974,8 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches) local_bh_disable(); i915_sw_fence_commit(&request->submit); local_bh_enable(); /* Kick the execlists tasklet if just scheduled */ + + free_radixtree(&request->waits); }
static unsigned long local_clock_us(unsigned int *cpu) diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index a211c53c813f..638899b9c170 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -137,6 +137,8 @@ struct drm_i915_gem_request { struct i915_priotree priotree; struct i915_dependency dep;
+ struct radix_tree_root waits; + /** GEM sequence number associated with this request on the * global execution timeline. It is zero when the request is not * on the HW queue (i.e. not on the engine timeline list). diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 691a9ad48497..84cccf7138c4 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2022,6 +2022,7 @@ void radix_tree_iter_delete(struct radix_tree_root *root, if (__radix_tree_delete(root, iter->node, slot)) iter->index = iter->next_index; } +EXPORT_SYMBOL(radix_tree_iter_delete);
/** * radix_tree_delete_item - delete an item from a radix tree
Am 08.04.2017 um 18:26 schrieb Chris Wilson:
Reserve 0 for general use a token meaning that the fence doesn't belong to an ordered timeline (fence context).
NAK, we kept context allocation cheap to avoid exactly that.
Please elaborate further why it should be necessary now.
Regards, Christian.
Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Sumit Semwal sumit.semwal@linaro.org Cc: Gustavo Padovan gustavo@padovan.org Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: "Christian König" <christian.koenig@amd.com> Cc: Daniel Vetter daniel.vetter@ffwll.ch Cc: Alex Deucher alexander.deucher@amd.com
drivers/dma-buf/dma-fence.c | 4 +++- include/linux/dma-fence.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 0918d3f003d6..0646357ea350 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -36,8 +36,10 @@ EXPORT_TRACEPOINT_SYMBOL(dma_fence_enable_signal);
- fence context, this allows checking if fences belong to the same
- context or not. One device can have multiple separate contexts,
- and they're used if some engine can run independently of another.
*/
- 0 is excluded and treated as a special DMA_FENCE_NO_CONTEXT.
-static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(0); +static atomic64_t dma_fence_context_counter = ATOMIC64_INIT(1);
/**
- dma_fence_context_alloc - allocate an array of fence contexts
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..adfdc7fdd9c3 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -455,6 +455,8 @@ static inline signed long dma_fence_wait(struct dma_fence *fence, bool intr) return ret < 0 ? ret : 0; }
+#define DMA_FENCE_NO_CONTEXT ((u64)0)
u64 dma_fence_context_alloc(unsigned num);
#define DMA_FENCE_TRACE(f, fmt, args...) \
On Sat, Apr 08, 2017 at 07:49:37PM +0200, Christian König wrote:
Am 08.04.2017 um 18:26 schrieb Chris Wilson:
Reserve 0 for general use a token meaning that the fence doesn't belong to an ordered timeline (fence context).
NAK, we kept context allocation cheap to avoid exactly that.
However, they result in very sparse mappings.
Please elaborate further why it should be necessary now.
Because I want to efficiently exclude them from comparisons as demonstrated by this small series as there may be several hundred such fences as dependencies for this job. -Chris
Am 08.04.2017 um 20:00 schrieb Chris Wilson:
On Sat, Apr 08, 2017 at 07:49:37PM +0200, Christian König wrote:
Am 08.04.2017 um 18:26 schrieb Chris Wilson:
Reserve 0 for general use a token meaning that the fence doesn't belong to an ordered timeline (fence context).
NAK, we kept context allocation cheap to avoid exactly that.
However, they result in very sparse mappings.
Which is perfectly fine at least for how we use this in Radeon and Amdgpu.
The fence context is used as key for a hashtable, even when the context is only used once we want an evenly distribution over all of them.
Please elaborate further why it should be necessary now.
Because I want to efficiently exclude them from comparisons as demonstrated by this small series as there may be several hundred such fences as dependencies for this job.
That would horribly break Amdgpu, Radeon and the GPU scheduler because all of them assume that context numbers are unique.
So that is a clear NAK from my side, Christian.
-Chris
On Sun, Apr 09, 2017 at 09:08:53AM +0200, Christian König wrote:
Am 08.04.2017 um 20:00 schrieb Chris Wilson:
On Sat, Apr 08, 2017 at 07:49:37PM +0200, Christian König wrote:
Am 08.04.2017 um 18:26 schrieb Chris Wilson:
Reserve 0 for general use a token meaning that the fence doesn't belong to an ordered timeline (fence context).
NAK, we kept context allocation cheap to avoid exactly that.
However, they result in very sparse mappings.
Which is perfectly fine at least for how we use this in Radeon and Amdgpu.
The fence context is used as key for a hashtable, even when the context is only used once we want an evenly distribution over all of them.
The ht is a more expensive solution.
Please elaborate further why it should be necessary now.
Because I want to efficiently exclude them from comparisons as demonstrated by this small series as there may be several hundred such fences as dependencies for this job.
That would horrible break Amdgpu, Radeon and the GPU scheduler because all of them assume that context numbers are unique.
Why would it break if it respected the trivial notion of an unordered timeline? -Chris
Am 09.04.2017 um 18:02 schrieb Chris Wilson:
On Sun, Apr 09, 2017 at 09:08:53AM +0200, Christian König wrote:
Am 08.04.2017 um 20:00 schrieb Chris Wilson:
On Sat, Apr 08, 2017 at 07:49:37PM +0200, Christian König wrote:
Am 08.04.2017 um 18:26 schrieb Chris Wilson:
Reserve 0 for general use a token meaning that the fence doesn't belong to an ordered timeline (fence context).
NAK, we kept context allocation cheap to avoid exactly that.
However, they result in very sparse mappings.
Which is perfectly fine at least for how we use this in Radeon and Amdgpu.
The fence context is used as key for a hashtable, even when the context is only used once we want an evenly distribution over all of them.
The ht is a more expensive solution.
Not when the fences without contexts are rare.
Please elaborate further why it should be necessary now.
Because I want to efficiently exclude them from comparisons as demonstrated by this small series as there may be several hundred such fences as dependencies for this job.
That would horrible break Amdgpu, Radeon and the GPU scheduler because all of them assume that context numbers are unique.
Why would it break if it respected the trivial notion of an unordered timeline?
You seem to miss the point. We already had that discussion and decided against using no-context fences.
Now we have a whole bunch of cases over different drivers/components where we assume uniqueness of fence contexts.
If you change this without fixing all those cases they will just subtle break, probably without anybody noticing the problem immediately.
So if you don't want to scan all drivers for such cases (and I know at least three of hand) and fix them before that patch then that is a clear no-go for this patch.
I suggest just using two separate fence contexts for your clflush problem in i915. We have a similar situation in the GPU scheduler and solve it the same way.
Regards, Christian.
-Chris
dri-devel@lists.freedesktop.org