Some significant sources of latency aren't simple sleeps but busy-loops or a series of hundreds of small sleeps, simply because the hardware can't do better. Unfortunately latencytop doesn't register these, so they slip under the radar. Hence expose a simplified interface to report additional latencies, and export the underlying function so that modules can use it.
The example I have in mind is EDID reads. The drm subsystem exposes interfaces both to do full probes and to just get at the cached state from the last probe, and often userspace developers don't know about the difference and incur unnecessarily big latencies. Usually the i2c transfer is done with busy-looping, or, if there is a hw engine, it might only be able to transfer a few bytes per sleep/irq cycle. EDID reads take at least 12ms, and with crappy hw they can easily take a few hundred ms.
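For illustration, a driver that has to busy-wait on its hardware could then report the stall roughly like this (just a sketch; hw_poll_done() and the 10us poll interval are made up for the example, and the microsecond conversion mirrors what the EDID patch later in this series does):

	u64 start = ktime_get_raw_ns();

	/* spin until the (hypothetical) hardware signals completion */
	while (!hw_poll_done(dev))
		udelay(10);

	/* report the whole busy-wait to latencytop, in microseconds */
	account_latency(DIV_ROUND_UP_ULL(ktime_get_raw_ns() - start, 1000));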
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 include/linux/latencytop.h | 15 +++++++++++++++
 kernel/latencytop.c        |  2 ++
 2 files changed, 17 insertions(+)
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index e23121f9d82a..46b69bc35f02 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -10,6 +10,9 @@
 #define _INCLUDE_GUARD_LATENCYTOP_H_
 
 #include <linux/compiler.h>
+
+#include <asm/current.h>
+
 struct task_struct;
 
 #ifdef CONFIG_LATENCYTOP
@@ -35,6 +38,13 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
 		__account_scheduler_latency(task, usecs, inter);
 }
 
+static inline void
+account_latency(int usecs)
+{
+	if (unlikely(latencytop_enabled))
+		__account_scheduler_latency(current, usecs, 0);
+}
+
 void clear_all_latency_tracing(struct task_struct *p);
 
 #else
@@ -44,6 +54,11 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
 {
 }
 
+static inline void
+account_latency(int usecs)
+{
+}
+
 static inline void clear_all_latency_tracing(struct task_struct *p)
 {
 }
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743a7e..b066a19fc52a 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -64,6 +64,7 @@ static DEFINE_RAW_SPINLOCK(latency_lock);
 static struct latency_record latency_record[MAXLR];
 
 int latencytop_enabled;
+EXPORT_SYMBOL_GPL(latencytop_enabled);
 
 void clear_all_latency_tracing(struct task_struct *p)
 {
@@ -234,6 +235,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
 out_unlock:
 	raw_spin_unlock_irqrestore(&latency_lock, flags);
 }
+EXPORT_SYMBOL_GPL(__account_scheduler_latency);
 
 static int lstats_show(struct seq_file *m, void *v)
 {
A forced EDID read takes 22.5ms best-case, and that's per 128-byte block. HDMI screens tend to have 2-3 of those. Multiply that by a few outputs and it's clear that userspace really should never re-probe connector state on its own, but instead trust the kernel to tell it when anything changed. The only exception is a manual reprobe button that the user has to press themselves (for extremely shitty KVM switches that don't wire up hotplug handling properly).
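Back of the envelope, assuming 3 blocks per screen and 3 connected outputs: 22.5 ms * 3 blocks * 3 outputs is roughly 200 ms spent in EDID transfers for a single full reprobe pass, and that's on best-case hardware.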
There have been bugs in the past, but we're slowly fixing them up. To the point that some of the most abused interfaces (e.g. in sysfs) have been changed to only ever return the cached state, due to too much polling by userspace.
But there are other places where we can't pull these tricks, so give userspace the tools to notice this kind of abuse by exposing delays due to EDID reads in latencytop.
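For reference, grabbing the cached EDID from userspace is as simple as reading the connector's sysfs attribute, roughly like below (the connector name is just an example, enumerate /sys/class/drm/ for the real ones; a zero-length read means nothing is cached):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* example path only - pick the connector you actually care about */
	FILE *f = fopen("/sys/class/drm/card0-HDMI-A-1/edid", "rb");
	unsigned char buf[4 * 128];	/* base block plus up to 3 extension blocks */
	size_t len;

	if (!f)
		return EXIT_FAILURE;

	len = fread(buf, 1, sizeof(buf), f);
	fclose(f);

	/* no i2c transfer happens here, only the kernel's cached blob is returned */
	printf("cached EDID: %zu bytes\n", len);
	return EXIT_SUCCESS;
}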
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 drivers/gpu/drm/drm_edid.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index c214f1246cb4..370003e0cc69 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -32,6 +32,7 @@
 #include <linux/hdmi.h>
 #include <linux/i2c.h>
 #include <linux/module.h>
+#include <linux/latencytop.h>
 #include <drm/drmP.h>
 #include <drm/drm_edid.h>
 #include <drm/drm_displayid.h>
@@ -1272,14 +1273,17 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
 	int i, j = 0, valid_extensions = 0;
 	u8 *block, *new;
 	bool print_bad_edid = !connector->bad_edid_counter || (drm_debug & DRM_UT_KMS);
+	u64 before, after;
 
 	if ((block = kmalloc(EDID_LENGTH, GFP_KERNEL)) == NULL)
 		return NULL;
 
+	before = ktime_get_raw_ns();
+
 	/* base block fetch */
 	for (i = 0; i < 4; i++) {
 		if (get_edid_block(data, block, 0, EDID_LENGTH))
-			goto out;
+			goto none;
 		if (drm_edid_block_valid(block, 0, print_bad_edid,
 					 &connector->edid_corrupt))
 			break;
@@ -1293,11 +1297,11 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
 
 	/* if there's no extensions, we're done */
 	if (block[0x7e] == 0)
-		return (struct edid *)block;
+		goto out;
 
 	new = krealloc(block, (block[0x7e] + 1) * EDID_LENGTH, GFP_KERNEL);
 	if (!new)
-		goto out;
+		goto none;
 	block = new;
 
 	for (j = 1; j <= block[0x7e]; j++) {
@@ -1305,7 +1309,7 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
 		if (get_edid_block(data,
 				   block + (valid_extensions + 1) * EDID_LENGTH,
 				   j, EDID_LENGTH))
-			goto out;
+			goto none;
 		if (drm_edid_block_valid(block + (valid_extensions + 1)
 					 * EDID_LENGTH, j,
 					 print_bad_edid,
@@ -1329,11 +1333,11 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
 		block[0x7e] = valid_extensions;
 		new = krealloc(block, (valid_extensions + 1) * EDID_LENGTH, GFP_KERNEL);
 		if (!new)
-			goto out;
+			goto none;
 		block = new;
 	}
 
-	return (struct edid *)block;
+	goto out;
 
 carp:
 	if (print_bad_edid) {
@@ -1342,9 +1346,16 @@ carp:
 	}
 	connector->bad_edid_counter++;
 
-out:
+none:
 	kfree(block);
-	return NULL;
+	block = NULL;
+
+out:
+	after = ktime_get_raw_ns();
+
+	account_latency(DIV_ROUND_UP_ULL(after - before, 1000));
+
+	return (struct edid *)block;
 }
 EXPORT_SYMBOL_GPL(drm_do_get_edid);
On Tue, Dec 01, 2015 at 04:29:27PM +0100, Daniel Vetter wrote:
> Some significant sources of latency aren't simple sleeps but busy-loops or a series of hundreds of small sleeps, simply because the hardware can't do better. Unfortunately latencytop doesn't register these, so they slip under the radar. Hence expose a simplified interface to report additional latencies, and export the underlying function so that modules can use it.
>
> The example I have in mind is EDID reads. The drm subsystem exposes interfaces both to do full probes and to just get at the cached state from the last probe, and often userspace developers don't know about the difference and incur unnecessarily big latencies. Usually the i2c transfer is done with busy-looping, or, if there is a hw engine, it might only be able to transfer a few bytes per sleep/irq cycle. EDID reads take at least 12ms, and with crappy hw they can easily take a few hundred ms.
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Arjan van de Ven <arjan@linux.intel.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> ---
>  include/linux/latencytop.h | 15 +++++++++++++++
>  kernel/latencytop.c        |  2 ++
>  2 files changed, 17 insertions(+)
>
> diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
> index e23121f9d82a..46b69bc35f02 100644
> --- a/include/linux/latencytop.h
> +++ b/include/linux/latencytop.h
> @@ -10,6 +10,9 @@
>  #define _INCLUDE_GUARD_LATENCYTOP_H_
>  
>  #include <linux/compiler.h>
> +
> +#include <asm/current.h>
> +
>  struct task_struct;
>  
>  #ifdef CONFIG_LATENCYTOP
> @@ -35,6 +38,13 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
>  		__account_scheduler_latency(task, usecs, inter);
>  }
>  
> +static inline void
> +account_latency(int usecs)
> +{
> +	if (unlikely(latencytop_enabled))
> +		__account_scheduler_latency(current, usecs, 0);

Just

	account_scheduler_latency(current, usecs, 0);

> +}
And then that can be used for both ifdef paths, i.e. move account_latency() to after the #endif.
-Chris
Some significant sources of latency aren't simple sleeps but busy-loops or a series of hundreds of small sleeps, simply because the hardware can't do better. Unfortunately latencytop doesn't register these, so they slip under the radar. Hence expose a simplified interface to report additional latencies, and export the underlying function so that modules can use it.

The example I have in mind is EDID reads. The drm subsystem exposes interfaces both to do full probes and to just get at the cached state from the last probe, and often userspace developers don't know about the difference and incur unnecessarily big latencies. Usually the i2c transfer is done with busy-looping, or, if there is a hw engine, it might only be able to transfer a few bytes per sleep/irq cycle. EDID reads take at least 12ms, and with crappy hw they can easily take a few hundred ms.
v2: Simplify #ifdefs a bit (Chris).
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 include/linux/latencytop.h | 9 +++++++++
 kernel/latencytop.c        | 2 ++
 2 files changed, 11 insertions(+)
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index e23121f9d82a..6f7c35a0bbfe 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -10,6 +10,9 @@
 #define _INCLUDE_GUARD_LATENCYTOP_H_
 
 #include <linux/compiler.h>
+
+#include <asm/current.h>
+
 struct task_struct;
 
 #ifdef CONFIG_LATENCYTOP
@@ -50,4 +53,10 @@ static inline void clear_all_latency_tracing(struct task_struct *p)
 
 #endif
 
+static inline void
+account_latency(int usecs)
+{
+	account_scheduler_latency(current, usecs, 0);
+}
+
 #endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743a7e..b066a19fc52a 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -64,6 +64,7 @@ static DEFINE_RAW_SPINLOCK(latency_lock);
 static struct latency_record latency_record[MAXLR];
 
 int latencytop_enabled;
+EXPORT_SYMBOL_GPL(latencytop_enabled);
 
 void clear_all_latency_tracing(struct task_struct *p)
 {
@@ -234,6 +235,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
 out_unlock:
 	raw_spin_unlock_irqrestore(&latency_lock, flags);
 }
+EXPORT_SYMBOL_GPL(__account_scheduler_latency);
 
 static int lstats_show(struct seq_file *m, void *v)
 {