[PATCH 0/4] More fixes/tweaks to GuC support

List overview All Threads
Download

newer

older

[PATCH v8 0/6] Add Unisoc's drm...

[PATCH] udmabuf: put dmabuf in...

John.C.Harrison＠Intel.com

11 Dec 2021 11 Dec '21

6:58 a.m.

From: John Harrison John.C.Harrison@Intel.com

Allow engine resets on RCS, report problems with engine resets, improve GuC log usability.

Signed-off-by: John Harrison John.C.Harrison@Intel.com

John Harrison (4): drm/i915/guc: Speed up GuC log dumps drm/i915/guc: Increase GuC log size for CONFIG_DEBUG_GEM drm/i915/guc: Flag an error if an engine reset fails drm/i915: Improve long running OCL w/a for GuC submission

drivers/gpu/drm/i915/gt/intel_engine_cs.c | 42 +++++++++++++- drivers/gpu/drm/i915/gt/intel_gt_debugfs.h | 21 +++++-- drivers/gpu/drm/i915/gt/uc/intel_guc_log.h | 5 +- .../drm/i915/gt/uc/intel_guc_log_debugfs.c | 58 ++++++++++++++++++- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 14 ++++- 5 files changed, 125 insertions(+), 15 deletions(-)

-- 2.25.1

Show replies by date

John.C.Harrison＠Intel.com

11 Dec 11 Dec

6:58 a.m.

New subject: [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps

From: John Harrison John.C.Harrison@Intel.com

Add support for telling the debugfs interface the size of the GuC log dump in advance. Without that, the underlying framework keeps calling the 'show' function with larger and larger buffer allocations until it fits. That means reading the log from graphics memory many times - 16 times with the full 18MB log size.

v2: Don't return error codes from size query. Report overflow in the error dump as well (review feedback from Daniele).

Signed-off-by: John Harrison John.C.Harrison@Intel.com --- drivers/gpu/drm/i915/gt/intel_gt_debugfs.h | 21 +++++-- .../drm/i915/gt/uc/intel_guc_log_debugfs.c | 58 ++++++++++++++++++- 2 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h index e307ceb99031..17e79b735cfe 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h @@ -10,11 +10,7 @@

struct intel_gt;

-#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(__name) \ - static int __name ## _open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, __name ## _show, inode->i_private); \ -} \ +#define __GT_DEBUGFS_ATTRIBUTE_FOPS(__name) \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ @@ -23,6 +19,21 @@ static const struct file_operations __name ## _fops = { \ .release = single_release, \ }

+#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, __name ## _show, inode->i_private); \ +} \ +__GT_DEBUGFS_ATTRIBUTE_FOPS(__name) + +#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(__name, __size_vf) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open_size(file, __name ## _show, inode->i_private, \ + __size_vf(inode->i_private)); \ +} \ +__GT_DEBUGFS_ATTRIBUTE_FOPS(__name) + void intel_gt_debugfs_register(struct intel_gt *gt);

struct intel_gt_debugfs_file { diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c index 8fd068049376..ddfbe334689f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c @@ -10,22 +10,74 @@ #include "intel_guc.h" #include "intel_guc_log.h" #include "intel_guc_log_debugfs.h" +#include "intel_uc.h" + +static u32 obj_to_guc_log_dump_size(struct drm_i915_gem_object *obj) +{ + u32 size; + + if (!obj) + return PAGE_SIZE; + + /* "0x%08x 0x%08x 0x%08x 0x%08x\n" => 16 bytes -> 44 chars => x2.75 */ + size = ((obj->base.size * 11) + 3) / 4; + + /* Add padding for final blank line, any extra header info, etc. */ + size = PAGE_ALIGN(size + PAGE_SIZE); + + return size; +} + +static u32 guc_log_dump_size(struct intel_guc_log *log) +{ + struct intel_guc *guc = log_to_guc(log); + + if (!intel_guc_is_supported(guc)) + return PAGE_SIZE; + + if (!log->vma) + return PAGE_SIZE; + + return obj_to_guc_log_dump_size(log->vma->obj); +}

static int guc_log_dump_show(struct seq_file *m, void *data) { struct drm_printer p = drm_seq_file_printer(m); + int ret;

- return intel_guc_log_dump(m->private, &p, false); + ret = intel_guc_log_dump(m->private, &p, false); + + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && seq_has_overflowed(m)) + pr_warn_once("preallocated size:%zx for %s exceeded\n", + m->size, __func__); + + return ret; +} +DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(guc_log_dump, guc_log_dump_size); + +static u32 guc_load_err_dump_size(struct intel_guc_log *log) +{ + struct intel_guc *guc = log_to_guc(log); + struct intel_uc *uc = container_of(guc, struct intel_uc, guc); + + if (!intel_guc_is_supported(guc)) + return PAGE_SIZE; + + return obj_to_guc_log_dump_size(uc->load_err_log); } -DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(guc_log_dump);

static int guc_load_err_log_dump_show(struct seq_file *m, void *data) { struct drm_printer p = drm_seq_file_printer(m);

+ if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && seq_has_overflowed(m)) + pr_warn_once("preallocated size:%zx for %s exceeded\n", + m->size, __func__); + return intel_guc_log_dump(m->private, &p, true); } -DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(guc_load_err_log_dump); +DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(guc_load_err_log_dump, guc_load_err_dump_size);

static int guc_log_level_get(void *data, u64 *val) {

-- 2.25.1

Daniele Ceraolo Spurio

20 Dec 20 Dec

2:43 p.m.

New subject: [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps

On 12/10/2021 10:58 PM, John.C.Harrison@Intel.com wrote:

...

From: John Harrison John.C.Harrison@Intel.com

Add support for telling the debugfs interface the size of the GuC log dump in advance. Without that, the underlying framework keeps calling the 'show' function with larger and larger buffer allocations until it fits. That means reading the log from graphics memory many times - 16 times with the full 18MB log size.

v2: Don't return error codes from size query. Report overflow in the error dump as well (review feedback from Daniele).

Signed-off-by: John Harrison John.C.Harrison@Intel.com

Reviewed-by: Daniele Ceraolo Spurio daniele.ceraolospurio@intel.com

Daniele

...

drivers/gpu/drm/i915/gt/intel_gt_debugfs.h | 21 +++++-- .../drm/i915/gt/uc/intel_guc_log_debugfs.c | 58 ++++++++++++++++++- 2 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h index e307ceb99031..17e79b735cfe 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h @@ -10,11 +10,7 @@

struct intel_gt;

-#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(__name) \

static int __name ## _open(struct inode *inode, struct file *file) \

-{ \

return single_open(file, __name ## _show, inode->i_private); \

-} \ +#define __GT_DEBUGFS_ATTRIBUTE_FOPS(__name) \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ @@ -23,6 +19,21 @@ static const struct file_operations __name ## _fops = { \ .release = single_release, \ }

+#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \

return single_open(file, __name ## _show, inode->i_private); \

+} \ +__GT_DEBUGFS_ATTRIBUTE_FOPS(__name)

+#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(__name, __size_vf) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \
return single_open_size(file, __name ## _show, inode->i_private, \
	    __size_vf(inode->i_private));			\
+} \ +__GT_DEBUGFS_ATTRIBUTE_FOPS(__name)

void intel_gt_debugfs_register(struct intel_gt *gt);

struct intel_gt_debugfs_file {

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c index 8fd068049376..ddfbe334689f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c @@ -10,22 +10,74 @@ #include "intel_guc.h" #include "intel_guc_log.h" #include "intel_guc_log_debugfs.h" +#include "intel_uc.h"

+static u32 obj_to_guc_log_dump_size(struct drm_i915_gem_object *obj) +{
u32 size;

if (!obj)
return PAGE_SIZE;
/* "0x%08x 0x%08x 0x%08x 0x%08x\n" => 16 bytes -> 44 chars => x2.75 */

size = ((obj->base.size * 11) + 3) / 4;

/* Add padding for final blank line, any extra header info, etc. */

size = PAGE_ALIGN(size + PAGE_SIZE);

return size;
+}

+static u32 guc_log_dump_size(struct intel_guc_log *log) +{
struct intel_guc *guc = log_to_guc(log);

if (!intel_guc_is_supported(guc))
return PAGE_SIZE;
if (!log->vma)
return PAGE_SIZE;
return obj_to_guc_log_dump_size(log->vma->obj);
+}

static int guc_log_dump_show(struct seq_file *m, void *data) { struct drm_printer p = drm_seq_file_printer(m);

int ret;

return intel_guc_log_dump(m->private, &p, false);
ret = intel_guc_log_dump(m->private, &p, false);

if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && seq_has_overflowed(m))
pr_warn_once("preallocated size:%zx for %s exceeded\n",
	     m->size, __func__);
return ret;
+} +DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(guc_log_dump, guc_log_dump_size);

+static u32 guc_load_err_dump_size(struct intel_guc_log *log) +{
struct intel_guc *guc = log_to_guc(log);

struct intel_uc *uc = container_of(guc, struct intel_uc, guc);

if (!intel_guc_is_supported(guc))
return PAGE_SIZE;
return obj_to_guc_log_dump_size(uc->load_err_log); }
-DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(guc_log_dump);

static int guc_load_err_log_dump_show(struct seq_file *m, void *data) { struct drm_printer p = drm_seq_file_printer(m);
if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && seq_has_overflowed(m))
pr_warn_once("preallocated size:%zx for %s exceeded\n",
	     m->size, __func__);
return intel_guc_log_dump(m->private, &p, true); }
-DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(guc_load_err_log_dump); +DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(guc_load_err_log_dump, guc_load_err_dump_size);

static int guc_log_level_get(void *data, u64 *val) {

John.C.Harrison＠Intel.com

11 Dec 11 Dec

6:58 a.m.

New subject: [PATCH 2/4] drm/i915/guc: Increase GuC log size for CONFIG_DEBUG_GEM

From: John Harrison John.C.Harrison@Intel.com

Lots of testing is done with the DEBUG_GEM config option enabled but not the DEBUG_GUC option. That means we only get teeny-tiny GuC logs which are not hugely useful. Enabling full DEBUG_GUC also spews lots of other detailed output that is not generally desired. However, bigger GuC logs are extremely useful for almost any regression debug. So enable bigger logs for DEBUG_GEM builds as well.

Signed-off-by: John Harrison John.C.Harrison@Intel.com Reviewed-by: Matthew Brost matthew.brost@intel.com --- drivers/gpu/drm/i915/gt/uc/intel_guc_log.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h index ac1ee1d5ce10..fe6ab7550a14 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h @@ -15,9 +15,12 @@

struct intel_guc;

-#ifdef CONFIG_DRM_I915_DEBUG_GUC +#if defined(CONFIG_DRM_I915_DEBUG_GUC) #define CRASH_BUFFER_SIZE SZ_2M #define DEBUG_BUFFER_SIZE SZ_16M +#elif defined(CONFIG_DRM_I915_DEBUG_GEM) +#define CRASH_BUFFER_SIZE SZ_1M +#define DEBUG_BUFFER_SIZE SZ_2M #else #define CRASH_BUFFER_SIZE SZ_8K #define DEBUG_BUFFER_SIZE SZ_64K

-- 2.25.1

John.C.Harrison＠Intel.com

6:58 a.m.

New subject: [PATCH 3/4] drm/i915/guc: Flag an error if an engine reset fails

From: John Harrison John.C.Harrison@Intel.com

If GuC encounters an error during engine reset, the i915 driver promotes to full GT reset. This includes an info message about why the reset is happening. However, that is not treated as a failure by any of the CI systems because resets are an expected occurrence during testing. This kind of failure is a major problem and should never happen. So, complain more loudly and make sure CI notices.

Signed-off-by: John Harrison John.C.Harrison@Intel.com --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 97311119da6f..6015815f1da0 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -4018,11 +4018,12 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc, const u32 *msg, u32 len) { struct intel_engine_cs *engine; + struct intel_gt *gt = guc_to_gt(guc); u8 guc_class, instance; u32 reason;

if (unlikely(len != 3)) { - drm_err(&guc_to_gt(guc)->i915->drm, "Invalid length %u", len); + drm_err(&gt->i915->drm, "Invalid length %u", len); return -EPROTO; }

@@ -4032,12 +4033,19 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,

engine = guc_lookup_engine(guc, guc_class, instance); if (unlikely(!engine)) { - drm_err(&guc_to_gt(guc)->i915->drm, + drm_err(&gt->i915->drm, "Invalid engine %d:%d", guc_class, instance); return -EPROTO; }

- intel_gt_handle_error(guc_to_gt(guc), engine->mask, + /* + * This is an unexpected failure of a hardware feature. So, log a real + * error message not just the informational that comes with the reset. + */ + drm_err(&gt->i915->drm, "GuC engine reset request failed on %d:%d (%s) because %d", + guc_class, instance, engine->name, reason); + + intel_gt_handle_error(gt, engine->mask, I915_ERROR_CAPTURE, "GuC failed to reset %s (reason=0x%08x)\n", engine->name, reason);

-- 2.25.1

Daniele Ceraolo Spurio

16 Dec 16 Dec

1:16 a.m.

New subject: [Intel-gfx] [PATCH 3/4] drm/i915/guc: Flag an error if an engine reset fails

On 12/10/2021 10:58 PM, John.C.Harrison@Intel.com wrote:

...

From: John Harrison John.C.Harrison@Intel.com

If GuC encounters an error during engine reset, the i915 driver promotes to full GT reset. This includes an info message about why the reset is happening. However, that is not treated as a failure by any of the CI systems because resets are an expected occurrence during testing. This kind of failure is a major problem and should never happen. So, complain more loudly and make sure CI notices.

Signed-off-by: John Harrison John.C.Harrison@Intel.com

drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 97311119da6f..6015815f1da0 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -4018,11 +4018,12 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc, const u32 *msg, u32 len) { struct intel_engine_cs *engine;

struct intel_gt *gt = guc_to_gt(guc); u8 guc_class, instance; u32 reason;

if (unlikely(len != 3)) {
drm_err(&guc_to_gt(guc)->i915->drm, "Invalid length %u", len);
drm_err(&gt->i915->drm, "Invalid length %u", len);
return -EPROTO; }
@@ -4032,12 +4033,19 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,

engine = guc_lookup_engine(guc, guc_class, instance); if (unlikely(!engine)) {
drm_err(&guc_to_gt(guc)->i915->drm,
drm_err(&gt->i915->drm,
"Invalid engine %d:%d", guc_class, instance);
return -EPROTO; }
intel_gt_handle_error(guc_to_gt(guc), engine->mask,
/*
* This is an unexpected failure of a hardware feature. So, log a real
* error message not just the informational that comes with the reset.
*/
drm_err(&gt->i915->drm, "GuC engine reset request failed on %d:%d (%s) because %d",

In the error handling called below, the reason is logged as 0x%08x, so IMO we should do the same here for consistency. With that:

Reviewed-by: Daniele Ceraolo Spurio daniele.ceraolospurio@intel.com

Daniele

...

guc_class, instance, engine->name, reason);
intel_gt_handle_error(gt, engine->mask, I915_ERROR_CAPTURE, "GuC failed to reset %s (reason=0x%08x)\n", engine->name, reason);

John.C.Harrison＠Intel.com

11 Dec 11 Dec

6:58 a.m.

New subject: [PATCH 4/4] drm/i915: Improve long running OCL w/a for GuC submission

From: John Harrison John.C.Harrison@Intel.com

A workaround was added to the driver to allow OpenCL workloads to run 'forever' by disabling pre-emption on the RCS engine for Gen12. It is not totally unbounded as the heartbeat will kick in eventually and cause a reset of the hung engine.

However, this does not work well in GuC submission mode. In GuC mode, the pre-emption timeout is how GuC detects hung contexts and triggers a per engine reset. Thus, disabling the timeout means also losing all per engine reset ability. A full GT reset will still occur when the heartbeat finally expires, but that is a much more destructive and undesirable mechanism.

The purpose of the workaround is actually to give OpenCL tasks longer to reach a pre-emption point after a pre-emption request has been issued. This is necessary because Gen12 does not support mid-thread pre-emption and OpenCL can have long running threads.

So, rather than disabling the timeout completely, just set it to a 'long' value. Likewise, bump the heartbeat interval. That gives the OpenCL thread sufficient time to reach a pre-emption point without being killed off either by the GuC or by the heartbeat.

Signed-off-by: John Harrison John.C.Harrison@Intel.com --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 42 +++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 352254e001b4..26af8d60fe2b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -382,9 +382,45 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id, engine->props.timeslice_duration_ms = CONFIG_DRM_I915_TIMESLICE_DURATION;

- /* Override to uninterruptible for OpenCL workloads. */ - if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS) - engine->props.preempt_timeout_ms = 0; + /* + * Mid-thread pre-emption is not available in Gen12. Unfortunately, + * some OpenCL workloads run quite long threads. That means they get + * reset due to not pre-empting in a timely manner. + * The execlist solution was to disable pre-emption completely. + * However, pre-emption timeouts are the way GuC detects hung contexts + * and triggers engine resets. Thus, without pre-emption, there is no + * per engine reset. And full GT reset is much more intrusive. So keep + * the timeout for GuC submission platforms and just bump it to be + * much larger. Also bump the heartbeat timeout to match, otherwise + * the heartbeat can expire before the pre-emption can timeout and + * thus trigger a full GT reset anyway. + */ + if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS) { + if (intel_uc_wants_guc_submission(&gt->uc)) { + const unsigned long min_preempt = 7500; + const unsigned long min_beat = 5000; + + if (engine->props.preempt_timeout_ms && + engine->props.preempt_timeout_ms < min_preempt) { + drm_info(&gt->i915->drm, "Bumping pre-emption timeout from %ld to %ld on %s to allow slow compute pre-emption\n", + engine->props.preempt_timeout_ms, + min_preempt, engine->name); + + engine->props.preempt_timeout_ms = min_preempt; + } + + if (engine->props.heartbeat_interval_ms && + engine->props.heartbeat_interval_ms < min_beat) { + drm_info(&gt->i915->drm, "Bumping heartbeat interval from %ld to %ld on %s to allow slow compute pre-emption\n", + engine->props.heartbeat_interval_ms, + min_beat, engine->name); + + engine->props.heartbeat_interval_ms = min_beat; + } + } else { + engine->props.preempt_timeout_ms = 0; + } + }

engine->defaults = engine->props; /* never to change again */

-- 2.25.1

Daniele Ceraolo Spurio

16 Dec 16 Dec

1:11 a.m.

New subject: [Intel-gfx] [PATCH 4/4] drm/i915: Improve long running OCL w/a for GuC submission

On 12/10/2021 10:58 PM, John.C.Harrison@Intel.com wrote:

...

From: John Harrison John.C.Harrison@Intel.com

A workaround was added to the driver to allow OpenCL workloads to run 'forever' by disabling pre-emption on the RCS engine for Gen12. It is not totally unbounded as the heartbeat will kick in eventually and cause a reset of the hung engine.

However, this does not work well in GuC submission mode. In GuC mode, the pre-emption timeout is how GuC detects hung contexts and triggers a per engine reset. Thus, disabling the timeout means also losing all per engine reset ability. A full GT reset will still occur when the heartbeat finally expires, but that is a much more destructive and undesirable mechanism.

The purpose of the workaround is actually to give OpenCL tasks longer to reach a pre-emption point after a pre-emption request has been issued. This is necessary because Gen12 does not support mid-thread pre-emption and OpenCL can have long running threads.

So, rather than disabling the timeout completely, just set it to a 'long' value. Likewise, bump the heartbeat interval. That gives the OpenCL thread sufficient time to reach a pre-emption point without being killed off either by the GuC or by the heartbeat.

Signed-off-by: John Harrison John.C.Harrison@Intel.com

On the approach and the code:

Reviewed-by: Daniele Ceraolo Spurio daniele.ceraolospurio@intel.com

Please get an ack from the interested parties on the actual numbers used for the timeouts (they look big enough to me, but I'm not familiar with the compute use-case).

Daniele

...

drivers/gpu/drm/i915/gt/intel_engine_cs.c | 42 +++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 352254e001b4..26af8d60fe2b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -382,9 +382,45 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id, engine->props.timeslice_duration_ms = CONFIG_DRM_I915_TIMESLICE_DURATION;
/* Override to uninterruptible for OpenCL workloads. */

if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS)
engine->props.preempt_timeout_ms = 0;
/*
* Mid-thread pre-emption is not available in Gen12. Unfortunately,
* some OpenCL workloads run quite long threads. That means they get
* reset due to not pre-empting in a timely manner.
* The execlist solution was to disable pre-emption completely.
* However, pre-emption timeouts are the way GuC detects hung contexts
* and triggers engine resets. Thus, without pre-emption, there is no
* per engine reset. And full GT reset is much more intrusive. So keep
* the timeout for GuC submission platforms and just bump it to be
* much larger. Also bump the heartbeat timeout to match, otherwise
* the heartbeat can expire before the pre-emption can timeout and
* thus trigger a full GT reset anyway.
*/
if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS) {
if (intel_uc_wants_guc_submission(&gt->uc)) {
	const unsigned long min_preempt = 7500;
	const unsigned long min_beat = 5000;
	if (engine->props.preempt_timeout_ms &&
	    engine->props.preempt_timeout_ms < min_preempt) {
		drm_info(&gt->i915->drm, "Bumping pre-emption timeout from %ld to %ld on %s to allow slow compute pre-emption\n",
			 engine->props.preempt_timeout_ms,
			 min_preempt, engine->name);
		engine->props.preempt_timeout_ms = min_preempt;
	}
	if (engine->props.heartbeat_interval_ms &&
	    engine->props.heartbeat_interval_ms < min_beat) {
		drm_info(&gt->i915->drm, "Bumping heartbeat interval from %ld to %ld on %s to allow slow compute pre-emption\n",
			 engine->props.heartbeat_interval_ms,
			 min_beat, engine->name);
		engine->props.heartbeat_interval_ms = min_beat;
	}
} else {
	engine->props.preempt_timeout_ms = 0;
}
}

engine->defaults = engine->props; /* never to change again */

1238

Age (days ago)

1247

Last active (days ago)

dri-devel@lists.freedesktop.org

7 comments

2 participants

tags (0)

participants (2)

Daniele Ceraolo Spurio
John.C.Harrison＠Intel.com