Re: [Intel-gfx] [PATCH 23/27] drm/i915/guc: Implement no mid batch preemption for multi-lrc

28 Sep 2021

On 8/20/2021 15:44, Matthew Brost wrote:
...
For some users of multi-lrc, e.g. split frame, it isn't safe to preempt
mid BB. To safely enable preemption at the BB boundary, a handshake
between to parent and child is needed. This is implemented via custom
emit_bb_start & emit_fini_breadcrumb functions and enabled via by
via by -> by
...
default if a context is configured by set parallel extension.
I tend to agree with Tvrtko that this should probably be an opt in 
change. Is there a flags word passed in when creating the context?
Also, it's not just a change in pre-emption behaviour but a change in 
synchronisation too, right? Previously, if you had a whole bunch of back 
to back submissions then one child could run ahead of another and/or the 
parent. After this change, there is a forced regroup at the end of each 
batch. So while one could end sooner/later than the others, they can't 
ever get an entire batch (or more) ahead or behind. Or was that 
synchronisation already in there through other means anyway?
...
Signed-off-by: Matthew Brost matthew.brost@intel.com
drivers/gpu/drm/i915/gt/intel_context.c       |   2 +-
  drivers/gpu/drm/i915/gt/intel_context_types.h |   3 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   2 +-
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 283 +++++++++++++++++-
  4 files changed, 287 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 5615be32879c..2de62649e275 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -561,7 +561,7 @@ void intel_context_bind_parent_child(struct intel_context *parent,
   GEM_BUG_ON(intel_context_is_child(child));
   GEM_BUG_ON(intel_context_is_parent(child));

parent->guc_number_children++;


child->guc_child_index = parent->guc_number_children++;
list_add_tail(&child->guc_child_link,
     &parent->guc_child_list);
child->parent = parent;

diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 713d85b0b364..727f91e7f7c2 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -246,6 +246,9 @@ struct intel_context {
   	/** @guc_number_children: number of children if parent */
   	u8 guc_number_children;

/** @guc_child_index: index into guc_child_list if child */


u8 guc_child_index;


/**
@parent_page: page in context used by parent for work queue,
work queue descriptor



diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
index 6cd26dc060d1..9f61cfa5566a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
@@ -188,7 +188,7 @@ struct guc_process_desc {
   u32 wq_status;
   u32 engine_presence;
   u32 priority;

u32 reserved[30];


u32 reserved[36];

What is this extra space for? All the extra storage is grabbed from 
after the end of this structure, isn't it?
...
} __packed;
#define CONTEXT_REGISTRATION_FLAG_KMD	BIT(0)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 91330525330d..1a18f99bf12a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -11,6 +11,7 @@
  #include "gt/intel_context.h"
  #include "gt/intel_engine_pm.h"
  #include "gt/intel_engine_heartbeat.h"
+#include "gt/intel_gpu_commands.h"
  #include "gt/intel_gt.h"
  #include "gt/intel_gt_irq.h"
  #include "gt/intel_gt_pm.h"
@@ -366,10 +367,14 @@ static struct i915_priolist *to_priolist(struct rb_node *rb)
/*

When using multi-lrc submission an extra page in the context state is



reserved for the process descriptor and work queue.





reserved for the process descriptor, work queue, and preempt BB boundary



handshake between the parent + childlren contexts.

The layout of this page is below:
0						guc_process_desc




sizeof(struct guc_process_desc)		child go






CACHELINE_BYTES				child join ...






CACHELINE_BYTES ...





Would be better written as '[num_children]' instead of '...' to make it 
clear it is a per child array.
Also, maybe create a struct for this to get rid of the magic '+1's and 
'BYTES / sizeof' constructs in the functions below.
...

...						unused
PAGE_SIZE / 2				work queue start
...						work queue

@@ -1785,6 +1790,30 @@ static int deregister_context(struct intel_context *ce, u32 guc_id, bool loop)
   return __guc_action_deregister_context(guc, guc_id, loop);
  }
+static inline void clear_children_join_go_memory(struct intel_context *ce)
+{

u32 *mem = (u32 *)(__get_process_desc(ce) + 1);
u8 i;

for (i = 0; i < ce->guc_number_children + 1; ++i)
mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;



+}



+static inline u32 get_children_go_value(struct intel_context *ce)
+{

u32 *mem = (u32 *)(__get_process_desc(ce) + 1);

return mem[0];

+}



+static inline u32 get_children_join_value(struct intel_context *ce,

			  u8 child_index)



+{

u32 *mem = (u32 *)(__get_process_desc(ce) + 1);

return mem[(child_index + 1) * (CACHELINE_BYTES / sizeof(u32))];

+}

static void guc_context_policy_init(struct intel_engine_cs *engine,
  		    struct guc_lrc_desc *desc)
{

@@ -1867,6 +1896,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop)
   		desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD;
   		guc_context_policy_init(engine, desc);
   	}


clear_children_join_go_memory(ce);

}
/*


@@ -2943,6 +2974,31 @@ static const struct intel_context_ops virtual_child_context_ops = {
   .get_sibling = guc_virtual_get_sibling,
  };
+/*


The below override of the breadcrumbs is enabled when the user configures a



context for parallel submission (multi-lrc, parent-child).







The overridden breadcrumbs implements an algorithm which allows the GuC to



safely preempt all the hw contexts configured for parallel submission



between each BB. The contract between the i915 and GuC is if the parent



context can be preempted, all the children can be preempted, and the GuC will



always try to preempt the parent before the children. A handshake between the



parent / children breadcrumbs ensures the i915 holds up its end of the deal



creating a window to preempt between each set of BBs.


*/

+static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,

				     u64 offset, u32 len,


				     const unsigned int flags);



+static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,

				    u64 offset, u32 len,


				    const unsigned int flags);



+static u32 *
+emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,

				 u32 *cs);



+static u32 *
+emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,

				u32 *cs);


static struct intel_context *
guc_create_parallel(struct intel_engine_cs **engines,
      unsigned int num_siblings,

@@ -2978,6 +3034,20 @@ guc_create_parallel(struct intel_engine_cs **engines,
   	}
   }

parent->engine->emit_bb_start =
emit_bb_start_parent_no_preempt_mid_batch;


parent->engine->emit_fini_breadcrumb =
emit_fini_breadcrumb_parent_no_preempt_mid_batch;


parent->engine->emit_fini_breadcrumb_dw =
12 + 4 * parent->guc_number_children;


for_each_child(parent, ce) {
ce->engine->emit_bb_start =


	emit_bb_start_child_no_preempt_mid_batch;


ce->engine->emit_fini_breadcrumb =


	emit_fini_breadcrumb_child_no_preempt_mid_batch;


ce->engine->emit_fini_breadcrumb_dw = 16;


}
kfree(siblings);
 return parent;

@@ -3362,6 +3432,204 @@ void intel_guc_submission_init_early(struct intel_guc *guc)
   guc->submission_selected = __guc_submission_selected(guc);
  }
+static inline u32 get_children_go_addr(struct intel_context *ce)
+{

GEM_BUG_ON(!intel_context_is_parent(ce));

return i915_ggtt_offset(ce->state) +
__get_process_desc_offset(ce) +


sizeof(struct guc_process_desc);



+}



+static inline u32 get_children_join_addr(struct intel_context *ce,

			 u8 child_index)



+{

GEM_BUG_ON(!intel_context_is_parent(ce));

return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;

+}



+#define PARENT_GO_BB			1
+#define PARENT_GO_FINI_BREADCRUMB	0
+#define CHILD_GO_BB			1
+#define CHILD_GO_FINI_BREADCRUMB	0
+static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,

				     u64 offset, u32 len,


				     const unsigned int flags)



+{

struct intel_context *ce = rq->context;
u32 *cs;
u8 i;

GEM_BUG_ON(!intel_context_is_parent(ce));

cs = intel_ring_begin(rq, 10 + 4 * ce->guc_number_children);
if (IS_ERR(cs))
return PTR_ERR(cs);



/* Wait on chidlren */

chidlren -> children
...

for (i = 0; i < ce->guc_number_children; ++i) {
*cs++ = (MI_SEMAPHORE_WAIT |


	 MI_SEMAPHORE_GLOBAL_GTT |


	 MI_SEMAPHORE_POLL |


	 MI_SEMAPHORE_SAD_EQ_SDD);


*cs++ = PARENT_GO_BB;


*cs++ = get_children_join_addr(ce, i);


*cs++ = 0;


}

/* Turn off preemption */
*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
*cs++ = MI_NOOP;

/* Tell children go */
cs = gen8_emit_ggtt_write(cs,
		  CHILD_GO_BB,


		  get_children_go_addr(ce),


		  0);



/* Jump to batch */
*cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));


*cs++ = lower_32_bits(offset);
*cs++ = upper_32_bits(offset);
*cs++ = MI_NOOP;

intel_ring_advance(rq, cs);

return 0;

+}



+static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,

				    u64 offset, u32 len,


				    const unsigned int flags)



+{

struct intel_context *ce = rq->context;
u32 *cs;

GEM_BUG_ON(!intel_context_is_child(ce));

cs = intel_ring_begin(rq, 12);
if (IS_ERR(cs))
return PTR_ERR(cs);



/* Signal parent */
cs = gen8_emit_ggtt_write(cs,
		  PARENT_GO_BB,


		  get_children_join_addr(ce->parent,


					 ce->guc_child_index),


		  0);



/* Wait parent on for go */

parent on -> on parent
...

*cs++ = (MI_SEMAPHORE_WAIT |
 MI_SEMAPHORE_GLOBAL_GTT |


 MI_SEMAPHORE_POLL |


 MI_SEMAPHORE_SAD_EQ_SDD);


*cs++ = CHILD_GO_BB;
*cs++ = get_children_go_addr(ce->parent);
*cs++ = 0;

/* Turn off preemption */
*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

/* Jump to batch */
*cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));


*cs++ = lower_32_bits(offset);
*cs++ = upper_32_bits(offset);

intel_ring_advance(rq, cs);

return 0;

+}



+static u32 *
+emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,

				 u32 *cs)



+{

struct intel_context *ce = rq->context;
u8 i;

GEM_BUG_ON(!intel_context_is_parent(ce));

/* Wait on children */
for (i = 0; i < ce->guc_number_children; ++i) {
*cs++ = (MI_SEMAPHORE_WAIT |


	 MI_SEMAPHORE_GLOBAL_GTT |


	 MI_SEMAPHORE_POLL |


	 MI_SEMAPHORE_SAD_EQ_SDD);


*cs++ = PARENT_GO_FINI_BREADCRUMB;


*cs++ = get_children_join_addr(ce, i);


*cs++ = 0;


}

/* Turn on preemption */
*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
*cs++ = MI_NOOP;

/* Tell children go */
cs = gen8_emit_ggtt_write(cs,
		  CHILD_GO_FINI_BREADCRUMB,


		  get_children_go_addr(ce),


		  0);



/* Emit fini breadcrumb */
cs = gen8_emit_ggtt_write(cs,
		  rq->fence.seqno,


		  i915_request_active_timeline(rq)->hwsp_offset,


		  0);



/* User interrupt */
*cs++ = MI_USER_INTERRUPT;
*cs++ = MI_NOOP;

rq->tail = intel_ring_offset(rq, cs);

return cs;

+}



+static u32 *
+emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs)
+{

struct intel_context *ce = rq->context;

GEM_BUG_ON(!intel_context_is_child(ce));

/* Turn on preemption */
*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
*cs++ = MI_NOOP;

/* Signal parent */
cs = gen8_emit_ggtt_write(cs,
		  PARENT_GO_FINI_BREADCRUMB,


		  get_children_join_addr(ce->parent,


					 ce->guc_child_index),


		  0);




This is backwards compared to the parent?
Parent: wait on children then enable pre-emption
Child: enable pre-emption then signal parent
Makes for a window where the parent is waiting in atomic context for a 
signal from a child that has been pre-empted and might not get to run 
for some time?
John.
...

/* Wait parent on for go */
*cs++ = (MI_SEMAPHORE_WAIT |
 MI_SEMAPHORE_GLOBAL_GTT |


 MI_SEMAPHORE_POLL |


 MI_SEMAPHORE_SAD_EQ_SDD);


*cs++ = CHILD_GO_FINI_BREADCRUMB;
*cs++ = get_children_go_addr(ce->parent);
*cs++ = 0;

/* Emit fini breadcrumb */
cs = gen8_emit_ggtt_write(cs,
		  rq->fence.seqno,


		  i915_request_active_timeline(rq)->hwsp_offset,


		  0);



/* User interrupt */
*cs++ = MI_USER_INTERRUPT;
*cs++ = MI_NOOP;

rq->tail = intel_ring_offset(rq, cs);

return cs;

+}

static struct intel_context *
g2h_context_lookup(struct intel_guc *guc, u32 desc_idx)
{

@@ -3807,6 +4075,19 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc,
   		drm_printf(p, "\t\tWQI Status: %u\n\n",
   			   READ_ONCE(desc->wq_status));

	drm_printf(p, "\t\tNumber Children: %u\n\n",


		   ce->guc_number_children);


	if (ce->engine->emit_bb_start ==


	    emit_bb_start_parent_no_preempt_mid_batch) {


		u8 i;



		drm_printf(p, "\t\tChildren Go: %u\n\n",


			   get_children_go_value(ce));


		for (i = 0; i < ce->guc_number_children; ++i)


			drm_printf(p, "\t\tChildren Join: %u\n",


				   get_children_join_value(ce, i));


	}


for_each_child(ce, child)
	guc_log_context(p, child);

}


    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

Re: [Intel-gfx] [PATCH 23/27] drm/i915/guc: Implement no mid batch preemption for multi-lrc

Signed-off-by: Matthew Brost matthew.brost@intel.com