On 10/4/2021 15:06, Matthew Brost wrote:
For some users of multi-lrc, e.g. split frame, it isn't safe to preempt mid BB. To safely enable preemption at the BB boundary, a handshake between to parent and child is needed. This is implemented via custom
between to parent -> between parent
emit_bb_start & emit_fini_breadcrumb functions and enabled via by
via by -> by
I'm also not seeing any mention of the forced re-group behavioural change in either the comments or commit description.
default if a context is configured by set parallel extension.
v2: (John Harrison)
- Fix a few comments wording
- Add structure for parent page layout
Signed-off-by: Matthew Brost matthew.brost@intel.com
drivers/gpu/drm/i915/gt/intel_context.c | 2 +- drivers/gpu/drm/i915/gt/intel_context_types.h | 2 + drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 2 +- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 330 +++++++++++++++++- 4 files changed, 324 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 3b340eb59ada..ee84259959d0 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -569,7 +569,7 @@ void intel_context_bind_parent_child(struct intel_context *parent, GEM_BUG_ON(intel_context_is_child(child)); GEM_BUG_ON(intel_context_is_parent(child));
- parent->parallel.number_children++;
- parent->parallel.child_index = parent->parallel.number_children++; list_add_tail(&child->parallel.child_link, &parent->parallel.child_list); child->parallel.parent = parent;
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h index 1d880303a7e4..95a5b94b4ece 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_types.h +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -250,6 +250,8 @@ struct intel_context { struct i915_request *last_rq; /** @number_children: number of children if parent */ u8 number_children;
/** @child_index: index into child_list if child */
/** @guc: GuC specific members for parallel submission */ struct { /** @wqi_head: head pointer in work queue */u8 child_index;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index a00eeddc1449..663950d3badc 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -181,7 +181,7 @@ struct guc_process_desc { u32 wq_status; u32 engine_presence; u32 priority;
- u32 reserved[30];
- u32 reserved[36];
Not seeing the promised explanation of this bug fix.
} __packed;
#define CONTEXT_REGISTRATION_FLAG_KMD BIT(0) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 12ee8ca76249..f28e36aa77c2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -11,6 +11,7 @@ #include "gt/intel_context.h" #include "gt/intel_engine_pm.h" #include "gt/intel_engine_heartbeat.h" +#include "gt/intel_gpu_commands.h" #include "gt/intel_gt.h" #include "gt/intel_gt_irq.h" #include "gt/intel_gt_pm.h" @@ -368,10 +369,16 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb)
/*
- When using multi-lrc submission an extra page in the context state is
- reserved for the process descriptor and work queue.
- reserved for the process descriptor, work queue, and handshake between the
- parent + children contexts to insert safe preemption points between each set
- of BBs.
- The layout of this page is below:
- 0 guc_process_desc
- sizeof(struct guc_process_desc) child go
- CACHELINE_BYTES child join[0]
- ...
- CACHELINE_BYTES child join[n - 1]
- ... unused
- PAGE_SIZE / 2 work queue start
- ... work queue
@@ -379,7 +386,25 @@ static inline struct i915_priolist *to_priolist(struct rb_node *rb) */ #define WQ_SIZE (PAGE_SIZE / 2) #define WQ_OFFSET (PAGE_SIZE - WQ_SIZE) -static u32 __get_process_desc_offset(struct intel_context *ce)
+struct parent_page {
- struct guc_process_desc pdesc;
- u32 child_go_memory;
- u8 unused0[CACHELINE_BYTES - sizeof(u32)];
- struct {
u32 child_join_memory;
u8 unused1[CACHELINE_BYTES - sizeof(u32)];
- } join[MAX_ENGINE_INSTANCE + 1];
Could have a common structure for these. Call the u32 'semaphore_memory' or something then just have: struct sync_semaphore go; struct sync_semaphore join[MAX + 1];
- u8 unused2[(WQ_OFFSET - sizeof(struct guc_process_desc) -
CACHELINE_BYTES * (MAX_ENGINE_INSTANCE + 2))];
And this bit could be 'sizeof(struct sync_semaphore) * MAX + 2' to be clearer what it refers to.
And to be totally paranoid about it, could also add 'BUILD_BUG_ON(sizeof(struct sync_semaphore) != CACHELINE_BYTES)'.
And 'BUILD_BUG_ON(sizeof(struct parent_page) != PARENT_PAGE_SIZE)'. (Note: BUILD_BUG_ON triggers when the condition is true, so the size check needs '!=', not '==').
- u32 wq[WQ_SIZE / sizeof(u32)];
+};
+static u32 __get_parent_page_offset(struct intel_context *ce) { GEM_BUG_ON(!ce->parallel.guc.parent_page);
@@ -388,23 +413,35 @@ static u32 __get_process_desc_offset(struct intel_context *ce)
static u32 __get_wq_offset(struct intel_context *ce) {
- return __get_process_desc_offset(ce) + WQ_OFFSET;
- BUILD_BUG_ON(offsetof(struct parent_page, wq) != WQ_OFFSET);
- return __get_parent_page_offset(ce) + WQ_OFFSET; }
-static struct guc_process_desc * -__get_process_desc(struct intel_context *ce) +static struct parent_page * +__get_parent_page(struct intel_context *ce) {
- BUILD_BUG_ON(sizeof(struct parent_page) != PAGE_SIZE);
- /*
*/
- Need to subtract LRC_STATE_OFFSET here as the
- parallel.guc.parent_page is the offset into ce->state while
- ce->lrc_reg_reg is ce->state + LRC_STATE_OFFSET.
- return (struct guc_process_desc *)
- return (struct parent_page *) (ce->lrc_reg_state +
((__get_process_desc_offset(ce) -
LRC_STATE_OFFSET) / sizeof(u32))); }((__get_parent_page_offset(ce) -
+static struct guc_process_desc * +__get_process_desc(struct intel_context *ce) +{
- struct parent_page *pp = __get_parent_page(ce);
- return &pp->pdesc;
+}
- static u32 *get_wq_pointer(struct guc_process_desc *desc, struct intel_context *ce, u32 wqi_size)
@@ -424,8 +461,7 @@ static u32 *get_wq_pointer(struct guc_process_desc *desc, } #undef AVAILABLE_SPACE
- return ((u32 *)__get_process_desc(ce)) +
((WQ_OFFSET + ce->parallel.guc.wqi_tail) / sizeof(u32));
return &__get_parent_page(ce)->wq[ce->parallel.guc.wqi_tail / sizeof(u32)]; }
static struct guc_lrc_desc *__get_lrc_desc(struct intel_guc *guc, u32 index)
@@ -1829,6 +1865,26 @@ static int deregister_context(struct intel_context *ce, u32 guc_id) return __guc_action_deregister_context(guc, guc_id); }
+static inline void clear_children_join_go_memory(struct intel_context *ce) +{
- u32 *mem = (u32 *)(&__get_parent_page(ce)->child_go_memory);
- u8 i;
- for (i = 0; i < ce->parallel.number_children + 1; ++i)
mem[i * (CACHELINE_BYTES / sizeof(u32))] = 0;
Can't this be written as: pp->child_go_memory = 0; for(i = 0 to number_children) pp->child_join_memory = 0;
Seems like that would be much clearer than this magic casting and offsetting. I mean, that was the whole point of creating the parent_page structure.
+}
+static inline u32 get_children_go_value(struct intel_context *ce) +{
- return __get_parent_page(ce)->child_go_memory;
+}
+static inline u32 get_children_join_value(struct intel_context *ce,
u8 child_index)
+{
- return __get_parent_page(ce)->join[child_index].child_join_memory;
+}
- static void guc_context_policy_init(struct intel_engine_cs *engine, struct guc_lrc_desc *desc) {
@@ -1888,7 +1944,7 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) ce->parallel.guc.wqi_head = 0;
desc->process_desc = i915_ggtt_offset(ce->state) +
__get_process_desc_offset(ce);
desc->wq_addr = i915_ggtt_offset(ce->state) + __get_wq_offset(ce); desc->wq_size = WQ_SIZE;__get_parent_page_offset(ce);
@@ -1910,6 +1966,8 @@ static int guc_lrc_desc_pin(struct intel_context *ce, bool loop) desc->context_flags = CONTEXT_REGISTRATION_FLAG_KMD; guc_context_policy_init(engine, desc); }
clear_children_join_go_memory(ce);
}
/*
@@ -2976,6 +3034,31 @@ static const struct intel_context_ops virtual_child_context_ops = { .get_sibling = guc_virtual_get_sibling, };
+/*
- The below override of the breadcrumbs is enabled when the user configures a
- context for parallel submission (multi-lrc, parent-child).
- The overridden breadcrumbs implements an algorithm which allows the GuC to
- safely preempt all the hw contexts configured for parallel submission
- between each BB. The contract between the i915 and GuC is if the parent
- context can be preempted, all the children can be preempted, and the GuC will
- always try to preempt the parent before the children. A handshake between the
- parent / children breadcrumbs ensures the i915 holds up its end of the deal
- creating a window to preempt between each set of BBs.
- */
+static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
u64 offset, u32 len,
const unsigned int flags);
+static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
u64 offset, u32 len,
const unsigned int flags);
+static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
u32 *cs);
+static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq,
u32 *cs);
- static struct intel_context * guc_create_parallel(struct intel_engine_cs **engines, unsigned int num_siblings,
@@ -3011,6 +3094,20 @@ guc_create_parallel(struct intel_engine_cs **engines, } }
- parent->engine->emit_bb_start =
emit_bb_start_parent_no_preempt_mid_batch;
- parent->engine->emit_fini_breadcrumb =
emit_fini_breadcrumb_parent_no_preempt_mid_batch;
- parent->engine->emit_fini_breadcrumb_dw =
12 + 4 * parent->parallel.number_children;
- for_each_child(parent, ce) {
ce->engine->emit_bb_start =
emit_bb_start_child_no_preempt_mid_batch;
ce->engine->emit_fini_breadcrumb =
emit_fini_breadcrumb_child_no_preempt_mid_batch;
ce->engine->emit_fini_breadcrumb_dw = 16;
- }
- kfree(siblings); return parent;
@@ -3840,6 +3937,17 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, drm_printf(p, "\t\tWQI Status: %u\n\n", READ_ONCE(desc->wq_status));
if (ce->engine->emit_bb_start ==
emit_bb_start_parent_no_preempt_mid_batch) {
u8 i;
drm_printf(p, "\t\tChildren Go: %u\n\n",
get_children_go_value(ce));
for (i = 0; i < ce->parallel.number_children; ++i)
drm_printf(p, "\t\tChildren Join: %u\n",
get_children_join_value(ce, i));
}
}for_each_child(ce, child) guc_log_context(p, child);
@@ -3847,6 +3955,208 @@ void intel_guc_submission_print_context_info(struct intel_guc *guc, xa_unlock_irqrestore(&guc->context_lookup, flags); }
+static inline u32 get_children_go_addr(struct intel_context *ce) +{
- GEM_BUG_ON(!intel_context_is_parent(ce));
- BUILD_BUG_ON(offsetof(struct parent_page, child_go_memory) !=
sizeof(struct guc_process_desc));
- return i915_ggtt_offset(ce->state) +
__get_parent_page_offset(ce) +
sizeof(struct guc_process_desc);
Rather than relying on the BUILD_BUG to make sure that the magic calculation matches the structure definition, can't this just say "ggtt_offset + pp_offset + offsetof(pp, child_go)"?
+}
+static inline u32 get_children_join_addr(struct intel_context *ce,
u8 child_index)
+{
- GEM_BUG_ON(!intel_context_is_parent(ce));
- return get_children_go_addr(ce) + (child_index + 1) * CACHELINE_BYTES;
"ggtt_offset + pp_offset + offsetof(pp, child_join[i])"?
+}
+#define PARENT_GO_BB 1 +#define PARENT_GO_FINI_BREADCRUMB 0 +#define CHILD_GO_BB 1 +#define CHILD_GO_FINI_BREADCRUMB 0 +static int emit_bb_start_parent_no_preempt_mid_batch(struct i915_request *rq,
u64 offset, u32 len,
const unsigned int flags)
+{
- struct intel_context *ce = rq->context;
- u32 *cs;
- u8 i;
- GEM_BUG_ON(!intel_context_is_parent(ce));
- cs = intel_ring_begin(rq, 10 + 4 * ce->parallel.number_children);
- if (IS_ERR(cs))
return PTR_ERR(cs);
- /* Wait on children */
- for (i = 0; i < ce->parallel.number_children; ++i) {
*cs++ = (MI_SEMAPHORE_WAIT |
MI_SEMAPHORE_GLOBAL_GTT |
MI_SEMAPHORE_POLL |
MI_SEMAPHORE_SAD_EQ_SDD);
*cs++ = PARENT_GO_BB;
*cs++ = get_children_join_addr(ce, i);
*cs++ = 0;
- }
- /* Turn off preemption */
- *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
- *cs++ = MI_NOOP;
- /* Tell children go */
- cs = gen8_emit_ggtt_write(cs,
CHILD_GO_BB,
get_children_go_addr(ce),
0);
- /* Jump to batch */
- *cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
- *cs++ = lower_32_bits(offset);
- *cs++ = upper_32_bits(offset);
- *cs++ = MI_NOOP;
- intel_ring_advance(rq, cs);
- return 0;
+}
+static int emit_bb_start_child_no_preempt_mid_batch(struct i915_request *rq,
u64 offset, u32 len,
const unsigned int flags)
+{
- struct intel_context *ce = rq->context;
- struct intel_context *parent = intel_context_to_parent(ce);
- u32 *cs;
- GEM_BUG_ON(!intel_context_is_child(ce));
- cs = intel_ring_begin(rq, 12);
- if (IS_ERR(cs))
return PTR_ERR(cs);
- /* Signal parent */
- cs = gen8_emit_ggtt_write(cs,
PARENT_GO_BB,
get_children_join_addr(parent,
ce->parallel.child_index),
0);
- /* Wait on parent for go */
- *cs++ = (MI_SEMAPHORE_WAIT |
MI_SEMAPHORE_GLOBAL_GTT |
MI_SEMAPHORE_POLL |
MI_SEMAPHORE_SAD_EQ_SDD);
- *cs++ = CHILD_GO_BB;
- *cs++ = get_children_go_addr(parent);
- *cs++ = 0;
- /* Turn off preemption */
- *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
- /* Jump to batch */
- *cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
- *cs++ = lower_32_bits(offset);
- *cs++ = upper_32_bits(offset);
- intel_ring_advance(rq, cs);
- return 0;
+}
+static u32 * +emit_fini_breadcrumb_parent_no_preempt_mid_batch(struct i915_request *rq,
u32 *cs)
+{
- struct intel_context *ce = rq->context;
- u8 i;
- GEM_BUG_ON(!intel_context_is_parent(ce));
- /* Wait on children */
- for (i = 0; i < ce->parallel.number_children; ++i) {
*cs++ = (MI_SEMAPHORE_WAIT |
MI_SEMAPHORE_GLOBAL_GTT |
MI_SEMAPHORE_POLL |
MI_SEMAPHORE_SAD_EQ_SDD);
*cs++ = PARENT_GO_FINI_BREADCRUMB;
*cs++ = get_children_join_addr(ce, i);
*cs++ = 0;
- }
- /* Turn on preemption */
- *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
- *cs++ = MI_NOOP;
You mentioned possibly needing to add an MI_ARB_CHECK in here but I'm not seeing it. Did the testing happen? I don't see that it should be necessary. Once you execute the MI_ARB_ENABLE, the CS can preempt anywhere, I thought? Even if it can't there should be an MI_ARB_CHECK added at the next level up after the breadcrumb code. Or do we not have those in between batches any more?
John.
- /* Tell children go */
- cs = gen8_emit_ggtt_write(cs,
CHILD_GO_FINI_BREADCRUMB,
get_children_go_addr(ce),
0);
- /* Emit fini breadcrumb */
- cs = gen8_emit_ggtt_write(cs,
rq->fence.seqno,
i915_request_active_timeline(rq)->hwsp_offset,
0);
- /* User interrupt */
- *cs++ = MI_USER_INTERRUPT;
- *cs++ = MI_NOOP;
- rq->tail = intel_ring_offset(rq, cs);
- return cs;
+}
+static u32 * +emit_fini_breadcrumb_child_no_preempt_mid_batch(struct i915_request *rq, u32 *cs) +{
- struct intel_context *ce = rq->context;
- struct intel_context *parent = intel_context_to_parent(ce);
- GEM_BUG_ON(!intel_context_is_child(ce));
- /* Turn on preemption */
- *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
- *cs++ = MI_NOOP;
- /* Signal parent */
- cs = gen8_emit_ggtt_write(cs,
PARENT_GO_FINI_BREADCRUMB,
get_children_join_addr(parent,
ce->parallel.child_index),
0);
- /* Wait on parent for go */
- *cs++ = (MI_SEMAPHORE_WAIT |
MI_SEMAPHORE_GLOBAL_GTT |
MI_SEMAPHORE_POLL |
MI_SEMAPHORE_SAD_EQ_SDD);
- *cs++ = CHILD_GO_FINI_BREADCRUMB;
- *cs++ = get_children_go_addr(parent);
- *cs++ = 0;
- /* Emit fini breadcrumb */
- cs = gen8_emit_ggtt_write(cs,
rq->fence.seqno,
i915_request_active_timeline(rq)->hwsp_offset,
0);
- /* User interrupt */
- *cs++ = MI_USER_INTERRUPT;
- *cs++ = MI_NOOP;
- rq->tail = intel_ring_offset(rq, cs);
- return cs;
+}
- static struct intel_context * guc_create_virtual(struct intel_engine_cs **siblings, unsigned int count, unsigned long flags)