Am 11.06.21 um 10:58 schrieb Daniel Vetter:
On Thu, Jun 10, 2021 at 11:17:57AM +0200, Christian König wrote:
Add some rather sophisticated lockless garbage collection for dma_fence_chain objects.
For this keep all initialized dma_fence_chain nodes on a queue and trigger garbage collection before a new one is allocated.
Signed-off-by: Christian König christian.koenig@amd.com
Uh hand-rolled lockless list, I'm not a fan.
But the real question here is why? This is a global list, so it's going to look great on your desktop, but gpus are for servers now and those are NUMA. So just from that pov doing garbage-collection individually still feels like a much better idea.
Yeah, I was pondering on that quite a bit as well. That's why I added the multi threaded alloc/free test.
So what's the problem you're trying to solve here?
I was not sure if the chain is garbage collected enough when used for the dma_resv exclusive object.
But I think we could just drop this for now and just see how it goes.
Christian.
-Daniel
drivers/dma-buf/dma-fence-chain.c | 160 +++++++++++++++++++++++++----- include/linux/dma-fence-chain.h | 5 + 2 files changed, 142 insertions(+), 23 deletions(-)
diff --git a/drivers/dma-buf/dma-fence-chain.c b/drivers/dma-buf/dma-fence-chain.c index 1b4cb3e5cec9..c2f0b69eabb7 100644 --- a/drivers/dma-buf/dma-fence-chain.c +++ b/drivers/dma-buf/dma-fence-chain.c @@ -9,8 +9,53 @@
#include <linux/dma-fence-chain.h>
+static struct dma_fence_chain __rcu *fifo_front; +static struct dma_fence_chain __rcu **fifo_back = &fifo_front;
- static bool dma_fence_chain_enable_signaling(struct dma_fence *fence);
+/**
- dma_fence_chain_enqueue - enqueue a chain node for garbage collection
- @chain: the chain node to enqueue
- Add the chain node to the end of the gc fifo.
- */
+static void dma_fence_chain_enqueue(struct dma_fence_chain *chain) +{
- struct dma_fence_chain __rcu **tmp;
- RCU_INIT_POINTER(chain->next, NULL);
- tmp = xchg((struct dma_fence_chain __force ***)&fifo_back,
&chain->next);
- /* This is intentionally unordered since we only need the fifo for gc */
- rcu_assign_pointer(*tmp, chain);
+}
+/**
- dma_fence_chain_dequeue - dequeue a chain node for garbage collection
- Remove the first chain node from the gc fifo while making sure to always
- keep at least one node on the fifo for the lockless fifo implementation.
- Returns the dequeued chain node or NULL.
- */
+static struct dma_fence_chain *dma_fence_chain_dequeue(void) +{
- struct dma_fence_chain *chain, *tmp;
- rcu_read_lock();
- chain = rcu_dereference(fifo_front);
- /* Never dequeue the last chain node for lockless fifo */
- if (unlikely(!chain || !rcu_access_pointer(chain->next))) {
rcu_read_unlock();
return NULL;
- }
- tmp = cmpxchg((struct dma_fence_chain __force **)&fifo_front,
chain, rcu_access_pointer(chain->next));
- rcu_read_unlock();
- return tmp == chain ? chain : NULL;
+}
- /**
- dma_fence_chain_get_prev - use RCU to get a reference to the previous fence
- @chain: chain node to get the previous node from
@@ -28,6 +73,43 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) return prev; }
+/**
- dma_fence_chain_try_replace - try to replace the prev node
- @chain: Chain node we try to replace prev.
- @prev: the old prev node
- Try to replace the previous chain node when it or its containing fence is
- signaled. Returns true if we tried, false if we need to wait.
- */
+static bool dma_fence_chain_try_replace(struct dma_fence_chain *chain,
struct dma_fence *prev)
+{
- struct dma_fence *replacement, *tmp;
- struct dma_fence_chain *prev_chain;
- prev_chain = to_dma_fence_chain(prev);
- if (prev_chain) {
if (!dma_fence_is_signaled(prev_chain->fence))
return false;
replacement = dma_fence_chain_get_prev(prev_chain);
- } else {
if (!dma_fence_is_signaled(prev))
return false;
replacement = NULL;
- }
- tmp = cmpxchg((struct dma_fence __force **)&chain->prev, prev,
replacement);
- if (tmp == prev)
dma_fence_put(tmp);
- else
dma_fence_put(replacement);
- return true;
+}
- /**
- dma_fence_chain_walk - chain walking function
- @fence: current chain node
@@ -38,8 +120,8 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) */ struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) {
- struct dma_fence_chain *chain, *prev_chain;
- struct dma_fence *prev, *replacement, *tmp;
struct dma_fence_chain *chain;
struct dma_fence *prev;
chain = to_dma_fence_chain(fence); if (!chain) {
@@ -48,26 +130,8 @@ struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) }
while ((prev = dma_fence_chain_get_prev(chain))) {
prev_chain = to_dma_fence_chain(prev);
if (prev_chain) {
if (!dma_fence_is_signaled(prev_chain->fence))
break;
replacement = dma_fence_chain_get_prev(prev_chain);
} else {
if (!dma_fence_is_signaled(prev))
break;
replacement = NULL;
}
tmp = cmpxchg((struct dma_fence __force **)&chain->prev,
prev, replacement);
if (tmp == prev)
dma_fence_put(tmp);
else
dma_fence_put(replacement);
if (!dma_fence_chain_try_replace(chain, prev))
dma_fence_put(prev); }break;
@@ -205,7 +269,21 @@ static void dma_fence_chain_release(struct dma_fence *fence) dma_fence_put(prev);
dma_fence_put(chain->fence);
- dma_fence_free(fence);
WRITE_ONCE(chain->fence, NULL);
/*
* Don't garbage collect here to avoid recursion and potential stack
* overflow.
*/
chain = dma_fence_chain_dequeue();
if (!chain)
return;
if (kref_read(&chain->base.refcount) ||
READ_ONCE(chain->fence))
dma_fence_chain_enqueue(chain);
else
dma_fence_free(&chain->base);
}
const struct dma_fence_ops dma_fence_chain_ops = {
@@ -218,6 +296,40 @@ const struct dma_fence_ops dma_fence_chain_ops = { }; EXPORT_SYMBOL(dma_fence_chain_ops);
+/**
- dma_fence_chain_garbage_collect - cleanup chain nodes
- Do some garbage collection and try to release chain nodes.
- */
+void dma_fence_chain_garbage_collect(void) +{
- struct dma_fence_chain *chain = dma_fence_chain_dequeue();
- if (!chain)
return;
- if (!dma_fence_get_rcu(&chain->base)) {
/* Unused, check if it's also clean */
if (likely(!READ_ONCE(chain->fence))) {
dma_fence_free(&chain->base);
return;
}
- } else {
struct dma_fence *prev;
/* Used, do some chain walk */
prev = dma_fence_chain_get_prev(chain);
if (prev) {
dma_fence_chain_try_replace(chain, prev);
dma_fence_put(prev);
}
dma_fence_put(&chain->base);
- }
- dma_fence_chain_enqueue(chain);
+} +EXPORT_SYMBOL(dma_fence_chain_garbage_collect);
- /**
- dma_fence_chain_init - initialize a fence chain
- @chain: the chain node to initialize
@@ -254,5 +366,7 @@ void dma_fence_chain_init(struct dma_fence_chain *chain,
dma_fence_init(&chain->base, &dma_fence_chain_ops, &chain->lock, context, seqno);
- dma_fence_chain_enqueue(chain); }
- EXPORT_SYMBOL(dma_fence_chain_init);
diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index 5f45689a6d2e..b412b5396434 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -19,6 +19,7 @@
- @base: fence base class
- @lock: spinlock for fence handling
- @prev: previous fence of the chain
- @next: next chain node for garbage collection
- @prev_seqno: original previous seqno before garbage collection
- @fence: encapsulated fence
- @cb: callback structure for signaling
@@ -27,6 +28,7 @@ struct dma_fence_chain { struct dma_fence base; struct dma_fence __rcu *prev;
- struct dma_fence_chain __rcu *next; u64 prev_seqno; struct dma_fence *fence; union {
@@ -38,6 +40,8 @@ struct dma_fence_chain {
extern const struct dma_fence_ops dma_fence_chain_ops;
+void dma_fence_chain_garbage_collect(void);
- /**
- to_dma_fence_chain - cast a fence to a dma_fence_chain
- @fence: fence to cast to a dma_fence_array
@@ -61,6 +65,7 @@ to_dma_fence_chain(struct dma_fence *fence) */ static inline struct dma_fence_chain *dma_fence_chain_alloc(void) {
- dma_fence_chain_garbage_collect(); return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL); };
-- 2.25.1