A recent request to export kstat_irqs() pointed to a copy of the same in the i915 code, which made me look for further usage of irq descriptors in drivers.
The usage in drivers ranges from creative to broken in all colours.
irqdesc.h clearly says that this is core functionality and the fact C does not allow full encapsulation is not a justification to fiddle with it just because. It took us a lot of effort to make the core functionality provide what drivers need.
If there is a shortcoming, it's not asked too much to talk to the relevant maintainers instead of going off and fiddling with the guts of interrupt descriptors and often enough without understanding lifetime and locking rules.
As people insist on not respecting boundaries, this series cleans up the (ab)use and at the end removes the export of irq_to_desc() to make it at least harder. All legitimate users of this are built in.
While at it I stumbled over some other oddities related to interrupt counting and cleaned them up as well.
The series applies on top of
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
and is also available from git:
git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git genirq
Thanks,
tglx --- arch/alpha/kernel/sys_jensen.c | 2 arch/arm/kernel/smp.c | 2 arch/parisc/kernel/irq.c | 7 arch/s390/kernel/irq.c | 2 arch/x86/kernel/topology.c | 1 arch/arm64/kernel/smp.c | 2 drivers/gpu/drm/i915/display/intel_lpe_audio.c | 4 drivers/gpu/drm/i915/i915_irq.c | 34 +++ drivers/gpu/drm/i915/i915_pmu.c | 18 - drivers/gpu/drm/i915/i915_pmu.h | 8 drivers/mfd/ab8500-debugfs.c | 16 - drivers/net/ethernet/mellanox/mlx4/en_cq.c | 8 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 6 drivers/ntb/msi.c | 4 drivers/pci/controller/mobiveil/pcie-mobiveil-host.c | 8 drivers/pci/controller/pcie-xilinx-nwl.c | 8 drivers/pinctrl/nomadik/pinctrl-nomadik.c | 3 drivers/xen/events/events_base.c | 172 +++++++++++-------- drivers/xen/evtchn.c | 34 --- include/linux/interrupt.h | 1 include/linux/irq.h | 7 include/linux/irqdesc.h | 40 +--- include/linux/kernel_stat.h | 1 kernel/irq/irqdesc.c | 42 ++-- kernel/irq/manage.c | 37 ++++ kernel/irq/proc.c | 5 30 files changed, 263 insertions(+), 222 deletions(-)
This function uses irq_to_desc() and is going to be used by modules to replace the open coded irq_to_desc() (ab)usage. The final goal is to remove the export of irq_to_desc() so driver cannot fiddle with it anymore.
Move it into the core code and fixup the usage sites to include the proper header.
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- arch/alpha/kernel/sys_jensen.c | 2 +- arch/x86/kernel/topology.c | 1 + include/linux/interrupt.h | 1 + include/linux/irqdesc.h | 7 +------ kernel/irq/manage.c | 17 +++++++++++++++++ 5 files changed, 21 insertions(+), 7 deletions(-)
--- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -7,7 +7,7 @@ * * Code supporting the Jensen. */ - +#include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/mm.h> --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -25,6 +25,7 @@ * * Send feedback to colpatch@us.ibm.com */ +#include <linux/interrupt.h> #include <linux/nodemask.h> #include <linux/export.h> #include <linux/mmzone.h> --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -232,6 +232,7 @@ extern void devm_free_irq(struct device # define local_irq_enable_in_hardirq() local_irq_enable() #endif
+bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -179,12 +179,7 @@ int handle_domain_nmi(struct irq_domain /* Test to see if a driver has successfully requested an irq */ static inline int irq_desc_has_action(struct irq_desc *desc) { - return desc->action != NULL; -} - -static inline int irq_has_action(unsigned int irq) -{ - return irq_desc_has_action(irq_to_desc(irq)); + return desc && desc->action != NULL; }
/** --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2752,3 +2752,20 @@ int irq_set_irqchip_state(unsigned int i return err; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); + +/** + * irq_has_action - Check whether an interrupt is requested + * @irq: The linux irq number + * + * Returns: A snapshot of the current state + */ +bool irq_has_action(unsigned int irq) +{ + bool res; + + rcu_read_lock(); + res = irq_desc_has_action(irq_to_desc(irq)); + rcu_read_unlock(); + return res; +} +EXPORT_SYMBOL_GPL(irq_has_action);
These checks are used by modules and prevent the removal of the export of irq_to_desc(). Move the accessor into the core.
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- include/linux/irqdesc.h | 17 +++++------------ kernel/irq/manage.c | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 12 deletions(-)
--- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -223,28 +223,21 @@ irq_set_chip_handler_name_locked(struct data->chip = chip; }
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask); + static inline bool irq_balancing_disabled(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_NO_BALANCING_MASK; + return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK); }
static inline bool irq_is_percpu(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU; + return irq_check_status_bit(irq, IRQ_PER_CPU); }
static inline bool irq_is_percpu_devid(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU_DEVID; + return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); }
static inline void --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2769,3 +2769,23 @@ bool irq_has_action(unsigned int irq) return res; } EXPORT_SYMBOL_GPL(irq_has_action); + +/** + * irq_check_status_bit - Check whether bits in the irq descriptor status are set + * @irq: The linux irq number + * @bitmask: The bitmask to evaluate + * + * Returns: True if one of the bits in @bitmask is set + */ +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) +{ + struct irq_desc *desc; + bool res = false; + + rcu_read_lock(); + desc = irq_to_desc(irq); + if (desc) + res = !!(desc->status_use_accessors & bitmask); + rcu_read_unlock(); + return res; +}
On Thu, Dec 10, 2020 at 08:25:38PM +0100, Thomas Gleixner wrote:
These checks are used by modules and prevent the removal of the export of irq_to_desc(). Move the accessor into the core.
Signed-off-by: Thomas Gleixner tglx@linutronix.de
Yes, but that means that irq_check_status_bit() may be called from modules, but it is not exported, resulting in build errors such as the following.
arm64:allmodconfig:
ERROR: modpost: "irq_check_status_bit" [drivers/perf/arm_spe_pmu.ko] undefined!
Guenter
include/linux/irqdesc.h | 17 +++++------------ kernel/irq/manage.c | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 12 deletions(-)
--- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -223,28 +223,21 @@ irq_set_chip_handler_name_locked(struct data->chip = chip; }
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask);
static inline bool irq_balancing_disabled(unsigned int irq) {
- struct irq_desc *desc;
- desc = irq_to_desc(irq);
- return desc->status_use_accessors & IRQ_NO_BALANCING_MASK;
- return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK);
}
static inline bool irq_is_percpu(unsigned int irq) {
- struct irq_desc *desc;
- desc = irq_to_desc(irq);
- return desc->status_use_accessors & IRQ_PER_CPU;
- return irq_check_status_bit(irq, IRQ_PER_CPU);
}
static inline bool irq_is_percpu_devid(unsigned int irq) {
- struct irq_desc *desc;
- desc = irq_to_desc(irq);
- return desc->status_use_accessors & IRQ_PER_CPU_DEVID;
- return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID);
}
static inline void --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2769,3 +2769,23 @@ bool irq_has_action(unsigned int irq) return res; } EXPORT_SYMBOL_GPL(irq_has_action);
+/**
- irq_check_status_bit - Check whether bits in the irq descriptor status are set
- @irq: The linux irq number
- @bitmask: The bitmask to evaluate
- Returns: True if one of the bits in @bitmask is set
- */
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) +{
- struct irq_desc *desc;
- bool res = false;
- rcu_read_lock();
- desc = irq_to_desc(irq);
- if (desc)
res = !!(desc->status_use_accessors & bitmask);
- rcu_read_unlock();
- return res;
+}
On Sun, Dec 27 2020 at 11:20, Guenter Roeck wrote:
On Thu, Dec 10, 2020 at 08:25:38PM +0100, Thomas Gleixner wrote: Yes, but that means that irq_check_status_bit() may be called from modules, but it is not exported, resulting in build errors such as the following.
arm64:allmodconfig:
ERROR: modpost: "irq_check_status_bit" [drivers/perf/arm_spe_pmu.ko] undefined!
Duh. Yes, that lacks an export obviously.
Thanks,
tglx
irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export.
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- include/linux/irqdesc.h | 10 ++++------ kernel/irq/irqdesc.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-)
--- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -240,16 +240,14 @@ static inline bool irq_is_percpu_devid(u return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); }
+void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class); static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { - struct irq_desc *desc = irq_to_desc(irq); - - if (desc) { - lockdep_set_class(&desc->lock, lock_class); - lockdep_set_class(&desc->request_mutex, request_class); - } + if (IS_ENABLED(CONFIG_LOCKDEP)) + __irq_set_lockdep_class(irq, lock_class, request_class); }
#endif --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -968,3 +968,17 @@ unsigned int kstat_irqs_usr(unsigned int rcu_read_unlock(); return sum; } + +#ifdef CONFIG_LOCKDEP +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, request_class); + } +} +EXPORT_SYMBOL_GPL(irq_set_lockdep_class); +#endif
On Thu, Dec 10, 2020 at 10:14 PM Thomas Gleixner tglx@linutronix.de wrote:
irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export.
...
if (IS_ENABLED(CONFIG_LOCKDEP))
__irq_set_lockdep_class(irq, lock_class, request_class);
Maybe I missed something, but even if the compiler does not warn the use of if IS_ENABLED() with complimentary #ifdef seems inconsistent.
+#ifdef CONFIG_LOCKDEP
...
+EXPORT_SYMBOL_GPL(irq_set_lockdep_class); +#endif
On Fri, Dec 11 2020 at 19:53, Andy Shevchenko wrote:
On Thu, Dec 10, 2020 at 10:14 PM Thomas Gleixner tglx@linutronix.de wrote:
irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export.
...
if (IS_ENABLED(CONFIG_LOCKDEP))
__irq_set_lockdep_class(irq, lock_class, request_class);
You are right. Let me fix that.
On Fri, Dec 11 2020 at 22:08, Thomas Gleixner wrote:
On Fri, Dec 11 2020 at 19:53, Andy Shevchenko wrote:
On Thu, Dec 10, 2020 at 10:14 PM Thomas Gleixner tglx@linutronix.de wrote:
irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export.
...
if (IS_ENABLED(CONFIG_LOCKDEP))
__irq_set_lockdep_class(irq, lock_class, request_class);
You are right. Let me fix that.
No. I have to correct myself. You're wrong.
The inline is evaluated in the compilation units which include that header and because the function declaration is unconditional it is happy.
Now the optimizer stage makes the whole thing a NOOP if CONFIG_LOCKDEP=n and thereby drops the reference to the function which makes it not required for linking.
So in the file where the function is implemented:
#ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(....) { } #endif
The whole block is either discarded because CONFIG_LOCKDEP is not defined or compile if it is defined which makes it available for the linker.
And in the latter case the optimizer keeps the call in the inline (it optimizes the condition away because it's always true).
So in both cases the compiler and the linker are happy and everything works as expected.
It would fail if the header file had the following:
#ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(....); #endif
Because then it would complain about the missing function prototype when it evaluates the inline.
Thanks,
tglx
On Sat, Dec 12, 2020 at 12:07 AM Thomas Gleixner tglx@linutronix.de wrote:
On Fri, Dec 11 2020 at 22:08, Thomas Gleixner wrote:
On Fri, Dec 11 2020 at 19:53, Andy Shevchenko wrote:
On Thu, Dec 10, 2020 at 10:14 PM Thomas Gleixner tglx@linutronix.de wrote:
irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export.
...
if (IS_ENABLED(CONFIG_LOCKDEP))
__irq_set_lockdep_class(irq, lock_class, request_class);
You are right. Let me fix that.
No. I have to correct myself. You're wrong.
The inline is evaluated in the compilation units which include that header and because the function declaration is unconditional it is happy.
Now the optimizer stage makes the whole thing a NOOP if CONFIG_LOCKDEP=n and thereby drops the reference to the function which makes it not required for linking.
So in the file where the function is implemented:
#ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(....) { } #endif
The whole block is either discarded because CONFIG_LOCKDEP is not defined or compile if it is defined which makes it available for the linker.
And in the latter case the optimizer keeps the call in the inline (it optimizes the condition away because it's always true).
So in both cases the compiler and the linker are happy and everything works as expected.
It would fail if the header file had the following:
#ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(....); #endif
Because then it would complain about the missing function prototype when it evaluates the inline.
I understand that (that's why I put "if even no warning") and what I'm talking about is the purpose of IS_ENABLED(). It's usually good for compile testing !CONFIG_FOO cases. But here it seems inconsistent.
The pattern I usually see in the cases like this is
#ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(....); #else static inline void ... {} #endif
and call it directly in the caller.
It's not a big deal, so up to you.
Provide an accessor to the effective interrupt affinity mask. Going to be used to replace open coded fiddling with the irq descriptor.
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- include/linux/irq.h | 7 +++++++ 1 file changed, 7 insertions(+)
--- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -907,6 +907,13 @@ struct cpumask *irq_data_get_effective_a } #endif
+static inline struct cpumask *irq_get_effective_affinity_mask(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + + return d ? irq_data_get_effective_affinity_mask(d) : NULL; +} + unsigned int arch_dynirq_lower_bound(unsigned int from);
int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
Both the per cpu stats and the accumulated count are accessed lockless and can be concurrently modified. That's intentional and the stats are a rough estimate anyway. Annotate them with data_race().
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- kernel/irq/irqdesc.c | 4 ++-- kernel/irq/proc.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-)
--- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -943,10 +943,10 @@ unsigned int kstat_irqs(unsigned int irq if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) - return desc->tot_count; + return data_race(desc->tot_count);
for_each_possible_cpu(cpu) - sum += *per_cpu_ptr(desc->kstat_irqs, cpu); + sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu)); return sum; }
--- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -488,9 +488,10 @@ int show_interrupts(struct seq_file *p, if (!desc || irq_settings_is_hidden(desc)) goto outsparse;
- if (desc->kstat_irqs) + if (desc->kstat_irqs) { for_each_online_cpu(j) - any_count |= *per_cpu_ptr(desc->kstat_irqs, j); + any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j)); + }
if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) goto outsparse;
The SMP variant works perfectly fine on UP as well.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: "James E.J. Bottomley" James.Bottomley@HansenPartnership.com Cc: Helge Deller deller@gmx.de Cc: afzal mohammed afzal.mohd.ma@gmail.com Cc: linux-parisc@vger.kernel.org --- arch/parisc/kernel/irq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-)
--- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -216,12 +216,9 @@ int show_interrupts(struct seq_file *p, if (!action) goto skip; seq_printf(p, "%3d: ", i); -#ifdef CONFIG_SMP + for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#else - seq_printf(p, "%10u ", kstat_irqs(i)); -#endif
seq_printf(p, " %14s", irq_desc_get_chip(desc)->name); #ifndef PARISC_IRQ_CR16_COUNTS
No more users outside the core code.
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- include/linux/kernel_stat.h | 1 - kernel/irq/irqdesc.c | 19 ++++++------------- 2 files changed, 6 insertions(+), 14 deletions(-)
--- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -67,7 +67,6 @@ static inline unsigned int kstat_softirq /* * Number of interrupts per specific IRQ source, since bootup */ -extern unsigned int kstat_irqs(unsigned int irq); extern unsigned int kstat_irqs_usr(unsigned int irq);
/* --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -924,15 +924,7 @@ static bool irq_is_nmi(struct irq_desc * return desc->istate & IRQS_NMI; }
-/** - * kstat_irqs - Get the statistics for an interrupt - * @irq: The interrupt number - * - * Returns the sum of interrupt counts on all cpus since boot for - * @irq. The caller must ensure that the interrupt is not removed - * concurrently. - */ -unsigned int kstat_irqs(unsigned int irq) +static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned int sum = 0; @@ -951,13 +943,14 @@ unsigned int kstat_irqs(unsigned int irq }
/** - * kstat_irqs_usr - Get the statistics for an interrupt + * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. - * Contrary to kstat_irqs() this can be called from any context. - * It uses rcu since a concurrent removal of an interrupt descriptor is - * observing an rcu grace period before delayed_free_desc()/irq_kobj_release(). + * + * It uses rcu to protect the access since a concurrent removal of an + * interrupt descriptor is observing an rcu grace period before + * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) {
Most users of kstat_irqs_cpu() have the irq descriptor already. No point in calling into the core code and looking it up once more.
Use it in per_cpu_count_show() to start with.
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- include/linux/irqdesc.h | 6 ++++++ kernel/irq/irqdesc.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-)
--- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -113,6 +113,12 @@ static inline void irq_unlock_sparse(voi extern struct irq_desc irq_desc[NR_IRQS]; #endif
+static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc, + unsigned int cpu) +{ + return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; +} + static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) { return container_of(data->common, struct irq_desc, irq_common_data); --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -147,12 +147,12 @@ static ssize_t per_cpu_count_show(struct struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - int cpu, irq = desc->irq_data.irq; ssize_t ret = 0; char *p = ""; + int cpu;
for_each_possible_cpu(cpu) { - unsigned int c = kstat_irqs_cpu(irq, cpu); + unsigned int c = irq_desc_kstat_cpu(desc, cpu);
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); p = ",";
The irq descriptor is already there, no need to look it up again.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Marc Zyngier maz@kernel.org Cc: Russell King linux@armlinux.org.uk Cc: linux-arm-kernel@lists.infradead.org --- arch/arm/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -550,7 +550,7 @@ void show_ipi_list(struct seq_file *p, i seq_printf(p, "%*s%u: ", prec - 1, "IPI", i);
for_each_online_cpu(cpu) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
seq_printf(p, " %s\n", ipi_types[i]); }
On Thu, 10 Dec 2020 19:25:45 +0000, Thomas Gleixner tglx@linutronix.de wrote:
The irq descriptor is already there, no need to look it up again.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Marc Zyngier maz@kernel.org Cc: Russell King linux@armlinux.org.uk Cc: linux-arm-kernel@lists.infradead.org
arch/arm/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -550,7 +550,7 @@ void show_ipi_list(struct seq_file *p, i seq_printf(p, "%*s%u: ", prec - 1, "IPI", i);
for_each_online_cpu(cpu)
seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
seq_printf(p, " %s\n", ipi_types[i]); }
Acked-by: Marc Zyngier maz@kernel.org
The irq descriptor is already there, no need to look it up again.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Mark Rutland mark.rutland@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Marc Zyngier maz@kernel.org Cc: linux-arm-kernel@lists.infradead.org --- arch/arm64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -809,7 +809,7 @@ int arch_show_interrupts(struct seq_file seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu)); seq_printf(p, " %s\n", ipi_types[i]); }
On Thu, 10 Dec 2020 19:25:46 +0000, Thomas Gleixner tglx@linutronix.de wrote:
The irq descriptor is already there, no need to look it up again.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Mark Rutland mark.rutland@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Marc Zyngier maz@kernel.org Cc: linux-arm-kernel@lists.infradead.org
arch/arm64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -809,7 +809,7 @@ int arch_show_interrupts(struct seq_file seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu)
seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
seq_printf(p, " %s\n", ipi_types[i]); }seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
Acked-by: Marc Zyngier maz@kernel.org
The irq descriptor is already there, no need to look it up again.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: "James E.J. Bottomley" James.Bottomley@HansenPartnership.com Cc: Helge Deller deller@gmx.de Cc: afzal mohammed afzal.mohd.ma@gmail.com Cc: linux-parisc@vger.kernel.org --- arch/parisc/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -218,7 +218,7 @@ int show_interrupts(struct seq_file *p, seq_printf(p, "%3d: ", i);
for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, j));
seq_printf(p, " %14s", irq_desc_get_chip(desc)->name); #ifndef PARISC_IRQ_CR16_COUNTS
The irq descriptor is already there, no need to look it up again.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Christian Borntraeger borntraeger@de.ibm.com Cc: Heiko Carstens hca@linux.ibm.com Cc: linux-s390@vger.kernel.org --- arch/s390/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -124,7 +124,7 @@ static void show_msi_interrupt(struct se raw_spin_lock_irqsave(&desc->lock, flags); seq_printf(p, "%3d: ", irq); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_irq(desc, cpu));
if (desc->irq_data.chip) seq_printf(p, " %8s", desc->irq_data.chip->name);
On Thu, Dec 10, 2020 at 08:25:48PM +0100, Thomas Gleixner wrote:
The irq descriptor is already there, no need to look it up again.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Christian Borntraeger borntraeger@de.ibm.com Cc: Heiko Carstens hca@linux.ibm.com Cc: linux-s390@vger.kernel.org
arch/s390/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
Acked-by: Heiko Carstens hca@linux.ibm.com
Nothing uses the result and nothing should ever use it in driver code.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Jani Nikula jani.nikula@linux.intel.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: David Airlie airlied@linux.ie Cc: Daniel Vetter daniel@ffwll.ch Cc: Pankaj Bharadiya pankaj.laxminarayan.bharadiya@intel.com Cc: Chris Wilson chris@chris-wilson.co.uk Cc: Wambui Karuga wambui.karugax@gmail.com Cc: intel-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org --- drivers/gpu/drm/i915/display/intel_lpe_audio.c | 4 ---- 1 file changed, 4 deletions(-)
--- a/drivers/gpu/drm/i915/display/intel_lpe_audio.c +++ b/drivers/gpu/drm/i915/display/intel_lpe_audio.c @@ -297,13 +297,9 @@ int intel_lpe_audio_init(struct drm_i915 */ void intel_lpe_audio_teardown(struct drm_i915_private *dev_priv) { - struct irq_desc *desc; - if (!HAS_LPE_AUDIO(dev_priv)) return;
- desc = irq_to_desc(dev_priv->lpe_audio.irq); - lpe_audio_platdev_destroy(dev_priv);
irq_free_desc(dev_priv->lpe_audio.irq);
On Thu, Dec 10, 2020 at 08:25:49PM +0100, Thomas Gleixner wrote:
Nothing uses the result and nothing should ever use it in driver code.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Jani Nikula jani.nikula@linux.intel.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: David Airlie airlied@linux.ie Cc: Daniel Vetter daniel@ffwll.ch Cc: Pankaj Bharadiya pankaj.laxminarayan.bharadiya@intel.com Cc: Chris Wilson chris@chris-wilson.co.uk Cc: Wambui Karuga wambui.karugax@gmail.com Cc: intel-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org
Reviewed-by: Ville Syrjälä ville.syrjala@linux.intel.com
drivers/gpu/drm/i915/display/intel_lpe_audio.c | 4 ---- 1 file changed, 4 deletions(-)
--- a/drivers/gpu/drm/i915/display/intel_lpe_audio.c +++ b/drivers/gpu/drm/i915/display/intel_lpe_audio.c @@ -297,13 +297,9 @@ int intel_lpe_audio_init(struct drm_i915 */ void intel_lpe_audio_teardown(struct drm_i915_private *dev_priv) {
struct irq_desc *desc;
if (!HAS_LPE_AUDIO(dev_priv)) return;
desc = irq_to_desc(dev_priv->lpe_audio.irq);
lpe_audio_platdev_destroy(dev_priv);
irq_free_desc(dev_priv->lpe_audio.irq);
Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Thu, 10 Dec 2020, Ville Syrjälä ville.syrjala@linux.intel.com wrote:
On Thu, Dec 10, 2020 at 08:25:49PM +0100, Thomas Gleixner wrote:
Nothing uses the result and nothing should ever use it in driver code.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Jani Nikula jani.nikula@linux.intel.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: David Airlie airlied@linux.ie Cc: Daniel Vetter daniel@ffwll.ch Cc: Pankaj Bharadiya pankaj.laxminarayan.bharadiya@intel.com Cc: Chris Wilson chris@chris-wilson.co.uk Cc: Wambui Karuga wambui.karugax@gmail.com Cc: intel-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org
Reviewed-by: Ville Syrjälä ville.syrjala@linux.intel.com
Thomas, I presume you want to merge this series as a whole.
Acked-by: Jani Nikula jani.nikula@intel.com
for merging via whichever tree makes most sense. Please let us know if you want us to pick this up via drm-intel instead.
drivers/gpu/drm/i915/display/intel_lpe_audio.c | 4 ---- 1 file changed, 4 deletions(-)
--- a/drivers/gpu/drm/i915/display/intel_lpe_audio.c +++ b/drivers/gpu/drm/i915/display/intel_lpe_audio.c @@ -297,13 +297,9 @@ int intel_lpe_audio_init(struct drm_i915 */ void intel_lpe_audio_teardown(struct drm_i915_private *dev_priv) {
struct irq_desc *desc;
if (!HAS_LPE_AUDIO(dev_priv)) return;
desc = irq_to_desc(dev_priv->lpe_audio.irq);
lpe_audio_platdev_destroy(dev_priv);
irq_free_desc(dev_priv->lpe_audio.irq);
Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Driver code has no business with the internals of the irq descriptor.
Aside of that the count is per interrupt line and therefore takes interrupts from other devices into account which share the interrupt line and are not handled by the graphics driver.
Replace it with a pmu private count which only counts interrupts which originate from the graphics card.
To avoid atomics or heuristics of some sort make the counter field 'unsigned long'. That limits the count to 4e9 on 32bit which is a lot and postprocessing can easily deal with the occasional wraparound.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Tvrtko Ursulin tvrtko.ursulin@linux.intel.com Cc: Jani Nikula jani.nikula@linux.intel.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: David Airlie airlied@linux.ie Cc: Daniel Vetter daniel@ffwll.ch Cc: intel-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org --- drivers/gpu/drm/i915/i915_irq.c | 34 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_pmu.c | 18 +----------------- drivers/gpu/drm/i915/i915_pmu.h | 8 ++++++++ 3 files changed, 43 insertions(+), 17 deletions(-)
--- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -60,6 +60,24 @@ * and related files, but that will be described in separate chapters. */
+/* + * Interrupt statistic for PMU. Increments the counter only if the + * interrupt originated from the the GPU so interrupts from a device which + * shares the interrupt line are not accounted. + */ +static inline void pmu_irq_stats(struct drm_i915_private *priv, + irqreturn_t res) +{ + if (unlikely(res != IRQ_HANDLED)) + return; + + /* + * A clever compiler translates that into INC. A not so clever one + * should at least prevent store tearing. + */ + WRITE_ONCE(priv->pmu.irq_count, priv->pmu.irq_count + 1); +} + typedef bool (*long_pulse_detect_func)(enum hpd_pin pin, u32 val);
static const u32 hpd_ilk[HPD_NUM_PINS] = { @@ -1599,6 +1617,8 @@ static irqreturn_t valleyview_irq_handle valleyview_pipestat_irq_handler(dev_priv, pipe_stats); } while (0);
+ pmu_irq_stats(dev_priv, ret); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret; @@ -1676,6 +1696,8 @@ static irqreturn_t cherryview_irq_handle valleyview_pipestat_irq_handler(dev_priv, pipe_stats); } while (0);
+ pmu_irq_stats(dev_priv, ret); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret; @@ -2103,6 +2125,8 @@ static irqreturn_t ilk_irq_handler(int i if (sde_ier) raw_reg_write(regs, SDEIER, sde_ier);
+ pmu_irq_stats(i915, ret); + /* IRQs are synced during runtime_suspend, we don't require a wakeref */ enable_rpm_wakeref_asserts(&i915->runtime_pm);
@@ -2419,6 +2443,8 @@ static irqreturn_t gen8_irq_handler(int
gen8_master_intr_enable(regs);
+ pmu_irq_stats(dev_priv, IRQ_HANDLED); + return IRQ_HANDLED; }
@@ -2514,6 +2540,8 @@ static __always_inline irqreturn_t
gen11_gu_misc_irq_handler(gt, gu_misc_iir);
+ pmu_irq_stats(i915, IRQ_HANDLED); + return IRQ_HANDLED; }
@@ -3688,6 +3716,8 @@ static irqreturn_t i8xx_irq_handler(int i8xx_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
+ pmu_irq_stats(dev_priv, ret); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret; @@ -3796,6 +3826,8 @@ static irqreturn_t i915_irq_handler(int i915_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
+ pmu_irq_stats(dev_priv, ret); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret; @@ -3941,6 +3973,8 @@ static irqreturn_t i965_irq_handler(int i965_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
+ pmu_irq_stats(dev_priv, IRQ_HANDLED); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret; --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -423,22 +423,6 @@ static enum hrtimer_restart i915_sample( return HRTIMER_RESTART; }
-static u64 count_interrupts(struct drm_i915_private *i915) -{ - /* open-coded kstat_irqs() */ - struct irq_desc *desc = irq_to_desc(i915->drm.pdev->irq); - u64 sum = 0; - int cpu; - - if (!desc || !desc->kstat_irqs) - return 0; - - for_each_possible_cpu(cpu) - sum += *per_cpu_ptr(desc->kstat_irqs, cpu); - - return sum; -} - static void i915_pmu_event_destroy(struct perf_event *event) { struct drm_i915_private *i915 = @@ -581,7 +565,7 @@ static u64 __i915_pmu_event_read(struct USEC_PER_SEC /* to MHz */); break; case I915_PMU_INTERRUPTS: - val = count_interrupts(i915); + val = READ_ONCE(pmu->irq_count); break; case I915_PMU_RC6_RESIDENCY: val = get_rc6(&i915->gt); --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -108,6 +108,14 @@ struct i915_pmu { */ ktime_t sleep_last; /** + * @irq_count: Number of interrupts + * + * Intentionally unsigned long to avoid atomics or heuristics on 32bit. + * 4e9 interrupts are a lot and postprocessing can really deal with an + * occasional wraparound easily. It's 32bit after all. + */ + unsigned long irq_count; + /** * @events_attr_group: Device events attribute group. */ struct attribute_group events_attr_group;
On Thu, 10 Dec 2020, Thomas Gleixner tglx@linutronix.de wrote:
Driver code has no business with the internals of the irq descriptor.
Aside of that the count is per interrupt line and therefore takes interrupts from other devices into account which share the interrupt line and are not handled by the graphics driver.
Replace it with a pmu private count which only counts interrupts which originate from the graphics card.
To avoid atomics or heuristics of some sort make the counter field 'unsigned long'. That limits the count to 4e9 on 32bit which is a lot and postprocessing can easily deal with the occasional wraparound.
I'll let Tvrtko and Chris review the substance here, but assuming they don't object,
Acked-by: Jani Nikula jani.nikula@intel.com
for merging via whichever tree makes most sense.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Tvrtko Ursulin tvrtko.ursulin@linux.intel.com Cc: Jani Nikula jani.nikula@linux.intel.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: David Airlie airlied@linux.ie Cc: Daniel Vetter daniel@ffwll.ch Cc: intel-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org
drivers/gpu/drm/i915/i915_irq.c | 34 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_pmu.c | 18 +----------------- drivers/gpu/drm/i915/i915_pmu.h | 8 ++++++++ 3 files changed, 43 insertions(+), 17 deletions(-)
--- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -60,6 +60,24 @@
- and related files, but that will be described in separate chapters.
*/
+/*
- Interrupt statistic for PMU. Increments the counter only if the
- interrupt originated from the the GPU so interrupts from a device which
- shares the interrupt line are not accounted.
- */
+static inline void pmu_irq_stats(struct drm_i915_private *priv,
irqreturn_t res)
+{
- if (unlikely(res != IRQ_HANDLED))
return;
- /*
* A clever compiler translates that into INC. A not so clever one
* should at least prevent store tearing.
*/
- WRITE_ONCE(priv->pmu.irq_count, priv->pmu.irq_count + 1);
+}
typedef bool (*long_pulse_detect_func)(enum hpd_pin pin, u32 val);
static const u32 hpd_ilk[HPD_NUM_PINS] = { @@ -1599,6 +1617,8 @@ static irqreturn_t valleyview_irq_handle valleyview_pipestat_irq_handler(dev_priv, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, ret);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
@@ -1676,6 +1696,8 @@ static irqreturn_t cherryview_irq_handle valleyview_pipestat_irq_handler(dev_priv, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, ret);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
@@ -2103,6 +2125,8 @@ static irqreturn_t ilk_irq_handler(int i if (sde_ier) raw_reg_write(regs, SDEIER, sde_ier);
- pmu_irq_stats(i915, ret);
- /* IRQs are synced during runtime_suspend, we don't require a wakeref */ enable_rpm_wakeref_asserts(&i915->runtime_pm);
@@ -2419,6 +2443,8 @@ static irqreturn_t gen8_irq_handler(int
gen8_master_intr_enable(regs);
- pmu_irq_stats(dev_priv, IRQ_HANDLED);
- return IRQ_HANDLED;
}
@@ -2514,6 +2540,8 @@ static __always_inline irqreturn_t
gen11_gu_misc_irq_handler(gt, gu_misc_iir);
- pmu_irq_stats(i915, IRQ_HANDLED);
- return IRQ_HANDLED;
}
@@ -3688,6 +3716,8 @@ static irqreturn_t i8xx_irq_handler(int i8xx_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, ret);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
@@ -3796,6 +3826,8 @@ static irqreturn_t i915_irq_handler(int i915_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, ret);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
@@ -3941,6 +3973,8 @@ static irqreturn_t i965_irq_handler(int i965_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, IRQ_HANDLED);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
--- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -423,22 +423,6 @@ static enum hrtimer_restart i915_sample( return HRTIMER_RESTART; }
-static u64 count_interrupts(struct drm_i915_private *i915) -{
- /* open-coded kstat_irqs() */
- struct irq_desc *desc = irq_to_desc(i915->drm.pdev->irq);
- u64 sum = 0;
- int cpu;
- if (!desc || !desc->kstat_irqs)
return 0;
- for_each_possible_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
- return sum;
-}
static void i915_pmu_event_destroy(struct perf_event *event) { struct drm_i915_private *i915 = @@ -581,7 +565,7 @@ static u64 __i915_pmu_event_read(struct USEC_PER_SEC /* to MHz */); break; case I915_PMU_INTERRUPTS:
val = count_interrupts(i915);
case I915_PMU_RC6_RESIDENCY: val = get_rc6(&i915->gt);val = READ_ONCE(pmu->irq_count); break;
--- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -108,6 +108,14 @@ struct i915_pmu { */ ktime_t sleep_last; /**
* @irq_count: Number of interrupts
*
* Intentionally unsigned long to avoid atomics or heuristics on 32bit.
* 4e9 interrupts are a lot and postprocessing can really deal with an
* occasional wraparound easily. It's 32bit after all.
*/
- unsigned long irq_count;
- /**
*/ struct attribute_group events_attr_group;
- @events_attr_group: Device events attribute group.
On 10/12/2020 19:25, Thomas Gleixner wrote:
Driver code has no business with the internals of the irq descriptor.
Aside of that the count is per interrupt line and therefore takes interrupts from other devices into account which share the interrupt line and are not handled by the graphics driver.
Replace it with a pmu private count which only counts interrupts which originate from the graphics card.
To avoid atomics or heuristics of some sort make the counter field 'unsigned long'. That limits the count to 4e9 on 32bit which is a lot and postprocessing can easily deal with the occasional wraparound.
After my failed hasty sketch from last night I had a different one which was kind of heuristics based (re-reading the upper dword and retrying if it changed on 32-bit). But you are right - it is okay to at least start like this today and if later there is a need we can either do that or deal with wrap at PMU read time.
So thanks for dealing with it, some small comments below but overall it is fine.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Tvrtko Ursulin tvrtko.ursulin@linux.intel.com Cc: Jani Nikula jani.nikula@linux.intel.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: David Airlie airlied@linux.ie Cc: Daniel Vetter daniel@ffwll.ch Cc: intel-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org
drivers/gpu/drm/i915/i915_irq.c | 34 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_pmu.c | 18 +----------------- drivers/gpu/drm/i915/i915_pmu.h | 8 ++++++++ 3 files changed, 43 insertions(+), 17 deletions(-)
--- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -60,6 +60,24 @@
- and related files, but that will be described in separate chapters.
*/
+/*
- Interrupt statistic for PMU. Increments the counter only if the
- interrupt originated from the the GPU so interrupts from a device which
- shares the interrupt line are not accounted.
- */
+static inline void pmu_irq_stats(struct drm_i915_private *priv,
We never use priv as a local name, it should be either i915 or dev_priv.
irqreturn_t res)
+{
- if (unlikely(res != IRQ_HANDLED))
return;
- /*
* A clever compiler translates that into INC. A not so clever one
* should at least prevent store tearing.
*/
- WRITE_ONCE(priv->pmu.irq_count, priv->pmu.irq_count + 1);
Curious, probably more educational for me - given x86_32 and x86_64, and the context of it getting called, what is the difference from just doing irq_count++?
+}
typedef bool (*long_pulse_detect_func)(enum hpd_pin pin, u32 val);
static const u32 hpd_ilk[HPD_NUM_PINS] = {
@@ -1599,6 +1617,8 @@ static irqreturn_t valleyview_irq_handle valleyview_pipestat_irq_handler(dev_priv, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, ret);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
@@ -1676,6 +1696,8 @@ static irqreturn_t cherryview_irq_handle valleyview_pipestat_irq_handler(dev_priv, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, ret);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
@@ -2103,6 +2125,8 @@ static irqreturn_t ilk_irq_handler(int i if (sde_ier) raw_reg_write(regs, SDEIER, sde_ier);
- pmu_irq_stats(i915, ret);
- /* IRQs are synced during runtime_suspend, we don't require a wakeref */ enable_rpm_wakeref_asserts(&i915->runtime_pm);
@@ -2419,6 +2443,8 @@ static irqreturn_t gen8_irq_handler(int
gen8_master_intr_enable(regs);
- pmu_irq_stats(dev_priv, IRQ_HANDLED);
- return IRQ_HANDLED; }
@@ -2514,6 +2540,8 @@ static __always_inline irqreturn_t
gen11_gu_misc_irq_handler(gt, gu_misc_iir);
- pmu_irq_stats(i915, IRQ_HANDLED);
- return IRQ_HANDLED; }
@@ -3688,6 +3716,8 @@ static irqreturn_t i8xx_irq_handler(int i8xx_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, ret);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
@@ -3796,6 +3826,8 @@ static irqreturn_t i915_irq_handler(int i915_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, ret);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
@@ -3941,6 +3973,8 @@ static irqreturn_t i965_irq_handler(int i965_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0);
pmu_irq_stats(dev_priv, IRQ_HANDLED);
enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
return ret;
--- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -423,22 +423,6 @@ static enum hrtimer_restart i915_sample( return HRTIMER_RESTART; }
In this file you can also drop the #include <linux/irq.h> line.
-static u64 count_interrupts(struct drm_i915_private *i915) -{
- /* open-coded kstat_irqs() */
- struct irq_desc *desc = irq_to_desc(i915->drm.pdev->irq);
- u64 sum = 0;
- int cpu;
- if (!desc || !desc->kstat_irqs)
return 0;
- for_each_possible_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
- return sum;
-}
- static void i915_pmu_event_destroy(struct perf_event *event) { struct drm_i915_private *i915 =
@@ -581,7 +565,7 @@ static u64 __i915_pmu_event_read(struct USEC_PER_SEC /* to MHz */); break; case I915_PMU_INTERRUPTS:
val = count_interrupts(i915);
val = READ_ONCE(pmu->irq_count);
I guess same curiosity about READ_ONCE like in the increment site.
break; case I915_PMU_RC6_RESIDENCY: val = get_rc6(&i915->gt);
--- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -108,6 +108,14 @@ struct i915_pmu { */ ktime_t sleep_last; /**
* @irq_count: Number of interrupts
*
* Intentionally unsigned long to avoid atomics or heuristics on 32bit.
* 4e9 interrupts are a lot and postprocessing can really deal with an
* occasional wraparound easily. It's 32bit after all.
*/
- unsigned long irq_count;
- /**
*/ struct attribute_group events_attr_group;
- @events_attr_group: Device events attribute group.
Regards,
Tvrtko
On Fri, Dec 11 2020 at 10:13, Tvrtko Ursulin wrote:
On 10/12/2020 19:25, Thomas Gleixner wrote:
Aside of that the count is per interrupt line and therefore takes interrupts from other devices into account which share the interrupt line and are not handled by the graphics driver.
Replace it with a pmu private count which only counts interrupts which originate from the graphics card.
To avoid atomics or heuristics of some sort make the counter field 'unsigned long'. That limits the count to 4e9 on 32bit which is a lot and postprocessing can easily deal with the occasional wraparound.
After my failed hasty sketch from last night I had a different one which was kind of heuristics based (re-reading the upper dword and retrying if it changed on 32-bit).
The problem is that there will be two seperate modifications for the low and high word. Several ways how the compiler can translate this, but the problem is the same for all of them:
CPU 0 CPU 1 load low load high add low, 1 addc high, 0 store low load high --> NMI load low load high and compare store high
You can't catch that. If this really becomes an issue you need a sequence counter around it.
But you are right - it is okay to at least start like this today and if later there is a need we can either do that or deal with wrap at PMU read time.
Right.
+/*
- Interrupt statistic for PMU. Increments the counter only if the
- interrupt originated from the the GPU so interrupts from a device which
- shares the interrupt line are not accounted.
- */
+static inline void pmu_irq_stats(struct drm_i915_private *priv,
We never use priv as a local name, it should be either i915 or dev_priv.
Sure, will fix.
- /*
* A clever compiler translates that into INC. A not so clever one
* should at least prevent store tearing.
*/
- WRITE_ONCE(priv->pmu.irq_count, priv->pmu.irq_count + 1);
Curious, probably more educational for me - given x86_32 and x86_64, and the context of it getting called, what is the difference from just doing irq_count++?
Several reasons:
1) The compiler can pretty much do what it wants with cnt++ including tearing and whatever. https://lwn.net/Articles/816850/ for the full set of insanities.
Not really a problem here, but
2) It's annotating the reader and the writer side and documenting that this is subject to concurrency
3) It will prevent KCSAN to complain about the data race, i.e. concurrent modification while reading.
Thanks,
tglx
--- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -423,22 +423,6 @@ static enum hrtimer_restart i915_sample( return HRTIMER_RESTART; }
In this file you can also drop the #include <linux/irq.h> line.
Indeed.
Thanks,
tglx
From: Thomas Gleixner
Sent: 11 December 2020 12:58
..
After my failed hasty sketch from last night I had a different one which was kind of heuristics based (re-reading the upper dword and retrying if it changed on 32-bit).
The problem is that there will be two seperate modifications for the low and high word. Several ways how the compiler can translate this, but the problem is the same for all of them:
CPU 0 CPU 1 load low load high add low, 1 addc high, 0 store low load high --> NMI load low load high and compare store high
You can't catch that. If this really becomes an issue you need a sequence counter around it.
Or just two copies of the high word. Provided the accesses are sequenced: writer: load high:low add small_value,high:low store high store low store high_copy reader: load high_copy load low load high if (high != high_copy) low = 0;
The read value is always stale, so it probably doesn't matter that the value you have is one that is between the value when you started and that when you finished.
David
- Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On Fri, Dec 11 2020 at 14:19, David Laight wrote:
From: Thomas Gleixner
You can't catch that. If this really becomes an issue you need a sequence counter around it.
Or just two copies of the high word. Provided the accesses are sequenced: writer: load high:low add small_value,high:low store high store low store high_copy reader: load high_copy load low load high if (high != high_copy) low = 0;
And low = 0 is solving what? You need to loop back and retry until it's consistent and then it's nothing else than an open coded sequence count.
Thanks,
tglx
From: Thomas Gleixner
Sent: 11 December 2020 21:11
On Fri, Dec 11 2020 at 14:19, David Laight wrote:
From: Thomas Gleixner
You can't catch that. If this really becomes an issue you need a sequence counter around it.
Or just two copies of the high word. Provided the accesses are sequenced: writer: load high:low add small_value,high:low store high store low store high_copy reader: load high_copy load low load high if (high != high_copy) low = 0;
And low = 0 is solving what? You need to loop back and retry until it's consistent and then it's nothing else than an open coded sequence count.
If it is a counter or timestamp then the high:0 value happened some time between when you started trying to read the value and when you finished trying to read it.
As such it is a perfectly reasonable return value.
David
- Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
Let the core code do the fiddling with irq_desc.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Linus Walleij linus.walleij@linaro.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-gpio@vger.kernel.org --- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
--- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -948,7 +948,6 @@ static void nmk_gpio_dbg_show_one(struct (mode < 0) ? "unknown" : modes[mode]); } else { int irq = chip->to_irq(chip, offset); - struct irq_desc *desc = irq_to_desc(irq); const int pullidx = pull ? 1 : 0; int val; static const char * const pulls[] = { @@ -969,7 +968,7 @@ static void nmk_gpio_dbg_show_one(struct * This races with request_irq(), set_irq_type(), * and set_irq_wake() ... but those are "rare". */ - if (irq > 0 && desc && desc->action) { + if (irq > 0 && irq_has_action(irq)) { char *trigger;
if (nmk_chip->edge_rising & BIT(offset))
On Thu, Dec 10, 2020 at 8:42 PM Thomas Gleixner tglx@linutronix.de wrote:
Let the core code do the fiddling with irq_desc.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Linus Walleij linus.walleij@linaro.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-gpio@vger.kernel.org
Reviewed-by: Linus Walleij linus.walleij@linaro.org
I suppose you will funnel this directly to Torvalds, else tell me and I'll apply it to my tree.
Yours, Linus Walleij
First of all drivers have absolutely no business to dig into the internals of an irq descriptor. That's core code and subject to change. All of this information is readily available to /proc/interrupts in a safe and race free way.
Remove the inspection code which is a blatant violation of subsystem boundaries and racy against concurrent modifications of the interrupt descriptor.
Print the irq line instead so the information can be looked up in a sane way in /proc/interrupts.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Linus Walleij linus.walleij@linaro.org Cc: Lee Jones lee.jones@linaro.org Cc: linux-arm-kernel@lists.infradead.org --- drivers/mfd/ab8500-debugfs.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-)
--- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -1513,24 +1513,14 @@ static int ab8500_interrupts_show(struct { int line;
- seq_puts(s, "name: number: number of: wake:\n"); + seq_puts(s, "name: number: irq: number of: wake:\n");
for (line = 0; line < num_interrupt_lines; line++) { - struct irq_desc *desc = irq_to_desc(line + irq_first); - - seq_printf(s, "%3i: %6i %4i", + seq_printf(s, "%3i: %6i %4i %4i\n", line, + line + irq_first, num_interrupts[line], num_wake_interrupts[line]); - - if (desc && desc->name) - seq_printf(s, "-%-8s", desc->name); - if (desc && desc->action) { - struct irqaction *action = desc->action; - - seq_printf(s, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(s, ", %s", action->name); } seq_putc(s, '\n'); }
On Thu, Dec 10, 2020 at 8:42 PM Thomas Gleixner tglx@linutronix.de wrote:
First of all drivers have absolutely no business to dig into the internals of an irq descriptor. That's core code and subject to change. All of this information is readily available to /proc/interrupts in a safe and race free way.
Remove the inspection code which is a blatant violation of subsystem boundaries and racy against concurrent modifications of the interrupt descriptor.
Print the irq line instead so the information can be looked up in a sane way in /proc/interrupts.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Linus Walleij linus.walleij@linaro.org Cc: Lee Jones lee.jones@linaro.org Cc: linux-arm-kernel@lists.infradead.org
Reviewed-by: Linus Walleij linus.walleij@linaro.org
Yours, Linus Walleij
On Thu, 10 Dec 2020, Thomas Gleixner wrote:
First of all drivers have absolutely no business to dig into the internals of an irq descriptor. That's core code and subject to change. All of this information is readily available to /proc/interrupts in a safe and race free way.
Remove the inspection code which is a blatant violation of subsystem boundaries and racy against concurrent modifications of the interrupt descriptor.
Print the irq line instead so the information can be looked up in a sane way in /proc/interrupts.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Linus Walleij linus.walleij@linaro.org Cc: Lee Jones lee.jones@linaro.org Cc: linux-arm-kernel@lists.infradead.org
drivers/mfd/ab8500-debugfs.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-)
Acked-by: Lee Jones lee.jones@linaro.org
On Thu, Dec 10, 2020 at 9:57 PM Thomas Gleixner tglx@linutronix.de wrote:
First of all drivers have absolutely no business to dig into the internals of an irq descriptor. That's core code and subject to change. All of this information is readily available to /proc/interrupts in a safe and race free way.
Remove the inspection code which is a blatant violation of subsystem boundaries and racy against concurrent modifications of the interrupt descriptor.
Print the irq line instead so the information can be looked up in a sane way in /proc/interrupts.
...
seq_printf(s, "%3i: %6i %4i",
seq_printf(s, "%3i: %6i %4i %4i\n",
Seems different specifiers, I think the intention was something like seq_printf(s, "%3i: %4i %6i %4i\n",
line,
line + irq_first, num_interrupts[line], num_wake_interrupts[line]);
Use the proper core function.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Jon Mason jdmason@kudzu.us Cc: Dave Jiang dave.jiang@intel.com Cc: Allen Hubbe allenbh@gmail.com Cc: linux-ntb@googlegroups.com --- drivers/ntb/msi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-)
--- a/drivers/ntb/msi.c
+++ b/drivers/ntb/msi.c
@@ -282,15 +282,13 @@ int ntbm_msi_request_threaded_irq(struct
				  struct ntb_msi_desc *msi_desc)
 {
	struct msi_desc *entry;
-	struct irq_desc *desc;
	int ret;

	if (!ntb->msi)
		return -EINVAL;

	for_each_pci_msi_entry(entry, ntb->pdev) {
-		desc = irq_to_desc(entry->irq);
-		if (desc->action)
+		if (irq_has_action(entry->irq))
			continue;

		ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler,
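For readers following along, irq_has_action() is the core helper that replaces the open-coded descriptor peek. A minimal sketch of the resulting driver pattern, with hypothetical names (claim_vector_if_free) used purely for illustration:

	#include <linux/device.h>
	#include <linux/interrupt.h>

	/* Hypothetical helper: claim a vector only if nothing uses it yet. */
	static int claim_vector_if_free(struct device *dev, unsigned int irq,
					irq_handler_t thread_fn, void *ctx)
	{
		/*
		 * irq_has_action() reports whether a handler is already
		 * installed without handing out the irq descriptor itself.
		 */
		if (irq_has_action(irq))
			return -EBUSY;

		return devm_request_threaded_irq(dev, irq, NULL, thread_fn,
						 IRQF_ONESHOT, dev_name(dev),
						 ctx);
	}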
On 2020-12-10 12:25 p.m., Thomas Gleixner wrote:
Use the proper core function.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Jon Mason jdmason@kudzu.us Cc: Dave Jiang dave.jiang@intel.com Cc: Allen Hubbe allenbh@gmail.com Cc: linux-ntb@googlegroups.com
Looks good to me.
Reviewed-by: Logan Gunthorpe logang@deltatee.com
drivers/ntb/msi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-)
--- a/drivers/ntb/msi.c
+++ b/drivers/ntb/msi.c
@@ -282,15 +282,13 @@ int ntbm_msi_request_threaded_irq(struct
				  struct ntb_msi_desc *msi_desc)
 {
	struct msi_desc *entry;
-	struct irq_desc *desc;
	int ret;

	if (!ntb->msi)
		return -EINVAL;

	for_each_pci_msi_entry(entry, ntb->pdev) {
-		desc = irq_to_desc(entry->irq);
-		if (desc->action)
+		if (irq_has_action(entry->irq))
			continue;

		ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler,
Going through a full irq descriptor lookup instead of just using the proper helper function which provides direct access is suboptimal.
In fact it _is_ wrong, because the chip callback needs the chip data which is relevant for that chip, while the irq descriptor variant returns the irq chip data of the top-level chip of a hierarchy. It does not matter in this case because the chip is the top-level chip, but that doesn't make it more correct.
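As a hedged illustration of the accessor the patch switches to (struct my_pcie and the register write are placeholders, not the driver's actual logic):

	#include <linux/bits.h>
	#include <linux/io.h>
	#include <linux/irq.h>

	struct my_pcie {
		void __iomem *base;
	};

	static void my_mask_irq(struct irq_data *data)
	{
		/*
		 * Direct access: returns the chip data of the hierarchy
		 * level this irq_data belongs to. The removed pattern,
		 * irq_desc_get_chip_data(irq_to_desc(data->irq)), does a
		 * full descriptor lookup and yields the top-level chip's
		 * data - the same thing only when the chip already is the
		 * top-level chip.
		 */
		struct my_pcie *pcie = irq_data_get_irq_chip_data(data);

		writel(BIT(data->hwirq - 1), pcie->base);
	}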
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Cc: Rob Herring robh@kernel.org Cc: Bjorn Helgaas bhelgaas@google.com Cc: Michal Simek michal.simek@xilinx.com Cc: linux-pci@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org --- drivers/pci/controller/pcie-xilinx-nwl.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-)
--- a/drivers/pci/controller/pcie-xilinx-nwl.c
+++ b/drivers/pci/controller/pcie-xilinx-nwl.c
@@ -379,13 +379,11 @@ static void nwl_pcie_msi_handler_low(str

 static void nwl_mask_leg_irq(struct irq_data *data)
 {
-	struct irq_desc *desc = irq_to_desc(data->irq);
-	struct nwl_pcie *pcie;
+	struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data);
	unsigned long flags;
	u32 mask;
	u32 val;

-	pcie = irq_desc_get_chip_data(desc);
	mask = 1 << (data->hwirq - 1);
	raw_spin_lock_irqsave(&pcie->leg_mask_lock, flags);
	val = nwl_bridge_readl(pcie, MSGF_LEG_MASK);
@@ -395,13 +393,11 @@ static void nwl_mask_leg_irq(struct irq_

 static void nwl_unmask_leg_irq(struct irq_data *data)
 {
-	struct irq_desc *desc = irq_to_desc(data->irq);
-	struct nwl_pcie *pcie;
+	struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data);
	unsigned long flags;
	u32 mask;
	u32 val;

-	pcie = irq_desc_get_chip_data(desc);
	mask = 1 << (data->hwirq - 1);
	raw_spin_lock_irqsave(&pcie->leg_mask_lock, flags);
	val = nwl_bridge_readl(pcie, MSGF_LEG_MASK);
On Thu, Dec 10, 2020 at 1:42 PM Thomas Gleixner tglx@linutronix.de wrote:
Going through a full irq descriptor lookup instead of just using the proper helper function which provides direct access is suboptimal.
In fact it _is_ wrong, because the chip callback needs the chip data which is relevant for that chip, while the irq descriptor variant returns the irq chip data of the top-level chip of a hierarchy. It does not matter in this case because the chip is the top-level chip, but that doesn't make it more correct.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Cc: Rob Herring robh@kernel.org Cc: Bjorn Helgaas bhelgaas@google.com Cc: Michal Simek michal.simek@xilinx.com Cc: linux-pci@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org
drivers/pci/controller/pcie-xilinx-nwl.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-)
Reviewed-by: Rob Herring robh@kernel.org
Going through a full irq descriptor lookup instead of just using the proper helper function which provides direct access is suboptimal.
In fact it _is_ wrong, because the chip callback needs the chip data which is relevant for that chip, while the irq descriptor variant returns the irq chip data of the top-level chip of a hierarchy. It does not matter in this case because the chip is the top-level chip, but that doesn't make it more correct.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Karthikeyan Mitran m.karthikeyan@mobiveil.co.in Cc: Hou Zhiqiang Zhiqiang.Hou@nxp.com Cc: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Cc: Rob Herring robh@kernel.org Cc: Bjorn Helgaas bhelgaas@google.com Cc: linux-pci@vger.kernel.org --- drivers/pci/controller/mobiveil/pcie-mobiveil-host.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-)
--- a/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c
+++ b/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c
@@ -306,13 +306,11 @@ int mobiveil_host_init(struct mobiveil_p

 static void mobiveil_mask_intx_irq(struct irq_data *data)
 {
-	struct irq_desc *desc = irq_to_desc(data->irq);
-	struct mobiveil_pcie *pcie;
+	struct mobiveil_pcie *pcie = irq_data_get_irq_chip_data(data);
	struct mobiveil_root_port *rp;
	unsigned long flags;
	u32 mask, shifted_val;

-	pcie = irq_desc_get_chip_data(desc);
	rp = &pcie->rp;
	mask = 1 << ((data->hwirq + PAB_INTX_START) - 1);
	raw_spin_lock_irqsave(&rp->intx_mask_lock, flags);
@@ -324,13 +322,11 @@ static void mobiveil_mask_intx_irq(struc

 static void mobiveil_unmask_intx_irq(struct irq_data *data)
 {
-	struct irq_desc *desc = irq_to_desc(data->irq);
-	struct mobiveil_pcie *pcie;
+	struct mobiveil_pcie *pcie = irq_data_get_irq_chip_data(data);
	struct mobiveil_root_port *rp;
	unsigned long flags;
	u32 shifted_val, mask;

-	pcie = irq_desc_get_chip_data(desc);
	rp = &pcie->rp;
	mask = 1 << ((data->hwirq + PAB_INTX_START) - 1);
	raw_spin_lock_irqsave(&rp->intx_mask_lock, flags);
On Thu, Dec 10, 2020 at 1:42 PM Thomas Gleixner tglx@linutronix.de wrote:
Going through a full irq descriptor lookup instead of just using the proper helper function which provides direct access is suboptimal.
In fact it _is_ wrong, because the chip callback needs the chip data which is relevant for that chip, while the irq descriptor variant returns the irq chip data of the top-level chip of a hierarchy. It does not matter in this case because the chip is the top-level chip, but that doesn't make it more correct.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Karthikeyan Mitran m.karthikeyan@mobiveil.co.in Cc: Hou Zhiqiang Zhiqiang.Hou@nxp.com Cc: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Cc: Rob Herring robh@kernel.org Cc: Bjorn Helgaas bhelgaas@google.com Cc: linux-pci@vger.kernel.org
drivers/pci/controller/mobiveil/pcie-mobiveil-host.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-)
Reviewed-by: Rob Herring robh@kernel.org
No driver has any business with the internals of an interrupt descriptor. Storing a pointer to it just to use yet another helper at the actual usage site to retrieve the affinity mask is creative at best. Just because C does not allow encapsulation does not mean that the kernel has no limits.
Retrieve a pointer to the affinity mask itself and use that. It's still using an interface which is usually not for random drivers, but definitely less hideous than the previous hack.
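The shape of the conversion, as a sketch with made-up names (my_cq and friends) rather than the actual mlx4 code:

	#include <linux/irq.h>
	#include <linux/smp.h>

	struct my_cq {
		const struct cpumask *aff_mask;	/* cached once at setup */
	};

	static void my_cq_setup(struct my_cq *cq, unsigned int irq)
	{
		/* One call at setup replaces storing the irq descriptor. */
		cq->aff_mask = irq_get_affinity_mask(irq);
	}

	/* Hot path: a plain mask test, no descriptor digging. */
	static bool my_cq_on_local_cpu(struct my_cq *cq)
	{
		return cpumask_test_cpu(smp_processor_id(), cq->aff_mask);
	}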
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Tariq Toukan tariqt@nvidia.com Cc: "David S. Miller" davem@davemloft.net Cc: Jakub Kicinski kuba@kernel.org Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org --- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 8 +++----- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +----- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++- 3 files changed, 6 insertions(+), 11 deletions(-)
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -90,7 +90,7 @@ int mlx4_en_activate_cq(struct mlx4_en_p
			int cq_idx)
 {
	struct mlx4_en_dev *mdev = priv->mdev;
-	int err = 0;
+	int irq, err = 0;
	int timestamp_en = 0;
	bool assigned_eq = false;

@@ -116,10 +116,8 @@ int mlx4_en_activate_cq(struct mlx4_en_p

			assigned_eq = true;
		}
-
-		cq->irq_desc =
-			irq_to_desc(mlx4_eq_get_irq(mdev->dev,
-						    cq->vector));
+		irq = mlx4_eq_get_irq(mdev->dev, cq->vector);
+		cq->aff_mask = irq_get_affinity_mask(irq);
	} else {
		/* For TX we use the same irq per ring we assigned for the RX */
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -959,8 +959,6 @@ int mlx4_en_poll_rx_cq(struct napi_struc

	/* If we used up all the quota - we're probably not done yet... */
	if (done == budget || !clean_complete) {
-		const struct cpumask *aff;
-		struct irq_data *idata;
		int cpu_curr;

		/* in case we got here because of !clean_complete */
@@ -969,10 +967,8 @@ int mlx4_en_poll_rx_cq(struct napi_struc
		INC_PERF_COUNTER(priv->pstats.napi_quota);

		cpu_curr = smp_processor_id();
-		idata = irq_desc_get_irq_data(cq->irq_desc);
-		aff = irq_data_get_affinity_mask(idata);
-
-		if (likely(cpumask_test_cpu(cpu_curr, aff)))
+		if (likely(cpumask_test_cpu(cpu_curr, cq->aff_mask)))
			return budget;

		/* Current cpu is not according to smp_irq_affinity -
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -46,6 +46,7 @@
 #endif
 #include <linux/cpu_rmap.h>
 #include <linux/ptp_clock_kernel.h>
+#include <linux/irq.h>
 #include <net/xdp.h>

 #include <linux/mlx4/device.h>
@@ -380,7 +381,7 @@ struct mlx4_en_cq {
	struct mlx4_cqe *buf;
 #define MLX4_EN_OPCODE_ERROR	0x1e

-	struct irq_desc *irq_desc;
+	const struct cpumask *aff_mask;
 };

 struct mlx4_en_port_profile {
On 12/10/2020 9:25 PM, Thomas Gleixner wrote:
No driver has any business with the internals of an interrupt descriptor. Storing a pointer to it just to use yet another helper at the actual usage site to retrieve the affinity mask is creative at best. Just because C does not allow encapsulation does not mean that the kernel has no limits.
Retrieve a pointer to the affinity mask itself and use that. It's still using an interface which is usually not for random drivers, but definitely less hideous than the previous hack.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Tariq Toukan tariqt@nvidia.com Cc: "David S. Miller" davem@davemloft.net Cc: Jakub Kicinski kuba@kernel.org Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org
drivers/net/ethernet/mellanox/mlx4/en_cq.c | 8 +++----- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +----- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++- 3 files changed, 6 insertions(+), 11 deletions(-)
Reviewed-by: Tariq Toukan tariqt@nvidia.com
Thanks for your patch.
Using the interrupt affinity mask for checking locality is not really working well on architectures which support effective affinity masks.
The affinity mask is either the system wide default or set by user space, but the architecture can or even must reduce the mask to the effective set, which means that checking the affinity mask itself does not really tell about the actual target CPUs.
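To make the distinction concrete, a hedged before/after sketch building on the caching pattern above; it assumes irq_get_effective_affinity_mask() is available (the helper this series relies on), which falls back to the plain affinity mask on architectures without effective-affinity support:

	/*
	 * Before: the requested mask - may list CPUs that are never
	 * actual delivery targets on effective-affinity architectures.
	 */
	cq->aff_mask = irq_get_affinity_mask(irq);

	/*
	 * After: the effective mask - the CPU(s) the architecture
	 * actually programmed for delivery.
	 */
	cq->aff_mask = irq_get_effective_affinity_mask(irq);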
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Tariq Toukan tariqt@nvidia.com Cc: "David S. Miller" davem@davemloft.net Cc: Jakub Kicinski kuba@kernel.org Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org --- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -117,7 +117,7 @@ int mlx4_en_activate_cq(struct mlx4_en_p
			assigned_eq = true;
		}
		irq = mlx4_eq_get_irq(mdev->dev, cq->vector);
-		cq->aff_mask = irq_get_affinity_mask(irq);
+		cq->aff_mask = irq_get_effective_affinity_mask(irq);
	} else {
		/* For TX we use the same irq per ring we assigned for the RX */
On 12/10/2020 9:25 PM, Thomas Gleixner wrote:
Using the interrupt affinity mask for checking locality is not really working well on architectures which support effective affinity masks.
The affinity mask is either the system wide default or set by user space, but the architecture can or even must reduce the mask to the effective set, which means that checking the affinity mask itself does not really tell about the actual target CPUs.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Tariq Toukan tariqt@nvidia.com Cc: "David S. Miller" davem@davemloft.net Cc: Jakub Kicinski kuba@kernel.org Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org
drivers/net/ethernet/mellanox/mlx4/en_cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -117,7 +117,7 @@ int mlx4_en_activate_cq(struct mlx4_en_p
			assigned_eq = true;
		}
		irq = mlx4_eq_get_irq(mdev->dev, cq->vector);
-		cq->aff_mask = irq_get_affinity_mask(irq);
+		cq->aff_mask = irq_get_effective_affinity_mask(irq);
	} else {
		/* For TX we use the same irq per ring we assigned for the RX */
Reviewed-by: Tariq Toukan tariqt@nvidia.com
Thanks.
No driver has any business with the internals of an interrupt descriptor. Storing a pointer to it just to use yet another helper at the actual usage site to retrieve the affinity mask is creative at best. Just because C does not allow encapsulation does not mean that the kernel has no limits.
Retrieve a pointer to the affinity mask itself and use that. It's still using an interface which is usually not for random drivers, but definitely less hideous than the previous hack.
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-)
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -669,7 +669,7 @@ struct mlx5e_channel {
	spinlock_t async_icosq_lock;

	/* data path - accessed per napi poll */
-	struct irq_desc *irq_desc;
+	const struct cpumask *aff_mask;
	struct mlx5e_ch_stats *stats;

	/* control */
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1998,7 +1998,7 @@ static int mlx5e_open_channel(struct mlx
	c->num_tc   = params->num_tc;
	c->xdp      = !!params->xdp_prog;
	c->stats    = &priv->channel_stats[ix].ch;
-	c->irq_desc = irq_to_desc(irq);
+	c->aff_mask = irq_get_affinity_mask(irq);
	c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix);

	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -40,12 +40,8 @@
 static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
 {
	int current_cpu = smp_processor_id();
-	const struct cpumask *aff;
-	struct irq_data *idata;

-	idata = irq_desc_get_irq_data(c->irq_desc);
-	aff = irq_data_get_affinity_mask(idata);
-	return cpumask_test_cpu(current_cpu, aff);
+	return cpumask_test_cpu(current_cpu, c->aff_mask);
 }

 static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
On 12/10/2020 9:25 PM, Thomas Gleixner wrote:
No driver has any business with the internals of an interrupt descriptor. Storing a pointer to it just to use yet another helper at the actual usage site to retrieve the affinity mask is creative at best. Just because C does not allow encapsulation does not mean that the kernel has no limits.
Retrieve a pointer to the affinity mask itself and use that. It's still using an interface which is usually not for random drivers, but definitely less hideous than the previous hack.
Signed-off-by: Thomas Gleixner tglx@linutronix.de
drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-)
Reviewed-by: Tariq Toukan tariqt@nvidia.com
Thanks.
On Thu, 2020-12-10 at 20:25 +0100, Thomas Gleixner wrote:
No driver has any business with the internals of an interrupt descriptor. Storing a pointer to it just to use yet another helper at the actual usage site to retrieve the affinity mask is creative at best. Just because C does not allow encapsulation does not mean that the kernel has no limits.
You can't blame developers for using stuff from include/linux/. Not all developers are the same, and sometimes we don't read between the lines; you can't assume all driver developers are experts on irq API discipline.

Your rules must be expressed programmatically. For instance, you could hide struct irq_desc and irq_to_desc() in kernel/irq/ and remove them from the include/linux/ header files. If you want privacy in your subsystem, don't put all your header files on display under include/linux.
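What Saeed describes is the usual C idiom for enforcing such a boundary: forward-declare the type publicly and keep the full definition in a subsystem-private header. A sketch under that assumption, with illustrative file names and a deliberately simplified layout:

	/* include/linux/interrupt.h - public: the type stays opaque */
	struct irq_desc;
	bool irq_has_action(unsigned int irq);

	/* kernel/irq/internals.h - private: full layout, core code only */
	struct irq_desc {
		struct irqaction *action;	/* hidden from drivers */
		/* ... the rest of the internals ... */
	};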
Retrieve a pointer to the affinity mask itself and use that. It's still using an interface which is usually not for random drivers, but definitely less hideous than the previous hack.
Signed-off-by: Thomas Gleixner tglx@linutronix.de
drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-)
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -669,7 +669,7 @@ struct mlx5e_channel {
	spinlock_t async_icosq_lock;

	/* data path - accessed per napi poll */
-	struct irq_desc *irq_desc;
+	const struct cpumask *aff_mask;
	struct mlx5e_ch_stats *stats;

	/* control */
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1998,7 +1998,7 @@ static int mlx5e_open_channel(struct mlx
	c->num_tc   = params->num_tc;
	c->xdp      = !!params->xdp_prog;
	c->stats    = &priv->channel_stats[ix].ch;
-	c->irq_desc = irq_to_desc(irq);
+	c->aff_mask = irq_get_affinity_mask(irq);
This is fine as long as the affinity mask pointer stays the same for the lifetime of the irq vector.
Assuming that: Acked-by: Saeed Mahameed saeedm@nvidia.com
Using the interrupt affinity mask for checking locality is not really working well on architectures which support effective affinity masks.
The affinity mask is either the system wide default or set by user space, but the architecture can or even must reduce the mask to the effective set, which means that checking the affinity mask itself does not really tell about the actual target CPUs.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Saeed Mahameed saeedm@nvidia.com Cc: Leon Romanovsky leon@kernel.org Cc: "David S. Miller" davem@davemloft.net Cc: Jakub Kicinski kuba@kernel.org Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1998,7 +1998,7 @@ static int mlx5e_open_channel(struct mlx
	c->num_tc   = params->num_tc;
	c->xdp      = !!params->xdp_prog;
	c->stats    = &priv->channel_stats[ix].ch;
-	c->aff_mask = irq_get_affinity_mask(irq);
+	c->aff_mask = irq_get_effective_affinity_mask(irq);
	c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix);

	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
On 12/10/2020 9:25 PM, Thomas Gleixner wrote:
Using the interrupt affinity mask for checking locality is not really working well on architectures which support effective affinity masks.
The affinity mask is either the system wide default or set by user space, but the architecture can or even must reduce the mask to the effective set, which means that checking the affinity mask itself does not really tell about the actual target CPUs.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Saeed Mahameed saeedm@nvidia.com Cc: Leon Romanovsky leon@kernel.org Cc: "David S. Miller" davem@davemloft.net Cc: Jakub Kicinski kuba@kernel.org Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1998,7 +1998,7 @@ static int mlx5e_open_channel(struct mlx
	c->num_tc   = params->num_tc;
	c->xdp      = !!params->xdp_prog;
	c->stats    = &priv->channel_stats[ix].ch;
-	c->aff_mask = irq_get_affinity_mask(irq);
+	c->aff_mask = irq_get_effective_affinity_mask(irq);
	c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix);

	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
Reviewed-by: Tariq Toukan tariqt@nvidia.com
Thanks.
On Thu, 2020-12-10 at 20:25 +0100, Thomas Gleixner wrote:
Using the interrupt affinity mask for checking locality is not really working well on architectures which support effective affinity masks.
The affinity mask is either the system wide default or set by user space, but the architecture can or even must reduce the mask to the effective set, which means that checking the affinity mask itself does not really tell about the actual target CPUs.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Saeed Mahameed saeedm@nvidia.com Cc: Leon Romanovsky leon@kernel.org Cc: "David S. Miller" davem@davemloft.net Cc: Jakub Kicinski kuba@kernel.org Cc: netdev@vger.kernel.org Cc: linux-rdma@vger.kernel.org
Acked-by: Saeed Mahameed saeedm@nvidia.com
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Boris Ostrovsky boris.ostrovsky@oracle.com Cc: Juergen Gross jgross@suse.com Cc: Stefano Stabellini sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org --- drivers/xen/events/events_base.c | 6 ------ 1 file changed, 6 deletions(-)
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1132,12 +1132,6 @@ int bind_evtchn_to_irq(evtchn_port_t evt
 }
 EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);

-int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn)
-{
-	return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip);
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
-
 static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 {
	struct evtchn_bind_ipi bind_ipi;
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Boris Ostrovsky boris.ostrovsky@oracle.com Cc: Juergen Gross jgross@suse.com Cc: Stefano Stabellini sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org
drivers/xen/events/events_base.c | 6 ------ 1 file changed, 6 deletions(-)
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1132,12 +1132,6 @@ int bind_evtchn_to_irq(evtchn_port_t evt
 }
 EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);

-int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn)
-{
-	return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip);
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
include/xen/events.h also needs to be updated (and in the next patch for xen_set_affinity_evtchn() as well).
-boris
On Thu, Dec 10 2020 at 18:19, boris ostrovsky wrote:
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
include/xen/events.h also needs to be updated (and in the next patch for xen_set_affinity_evtchn() as well).
Darn, I lost that.
This function can only ever work when the event channels:

 - are already established,
 - have interrupts assigned to them, and
 - have had their affinity set by user space already,

because any newly set up event channel is forced to be bound to CPU0 and the affinity mask of the interrupt is forced to contain cpumask_of(0).
As the CPU0 enforcement was in place _before_ this was implemented it's entirely unclear how that can ever have worked at all.
Remove it as preparation for doing it properly.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Boris Ostrovsky boris.ostrovsky@oracle.com Cc: Juergen Gross jgross@suse.com Cc: Stefano Stabellini sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org --- drivers/xen/events/events_base.c | 9 --------- drivers/xen/evtchn.c | 34 +--------------------------------- 2 files changed, 1 insertion(+), 42 deletions(-)
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1696,15 +1696,6 @@ static int set_affinity_irq(struct irq_d
	return ret;
 }

-/* To be called with desc->lock held. */
-int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu)
-{
-	struct irq_data *d = irq_desc_get_irq_data(desc);
-
-	return set_affinity_irq(d, cpumask_of(tcpu), false);
-}
-EXPORT_SYMBOL_GPL(xen_set_affinity_evtchn);
-
 static void enable_dynirq(struct irq_data *data)
 {
	evtchn_port_t evtchn = evtchn_from_irq(data->irq);
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -421,36 +421,6 @@ static void evtchn_unbind_from_user(stru
	del_evtchn(u, evtchn);
 }

-static DEFINE_PER_CPU(int, bind_last_selected_cpu);
-
-static void evtchn_bind_interdom_next_vcpu(evtchn_port_t evtchn)
-{
-	unsigned int selected_cpu, irq;
-	struct irq_desc *desc;
-	unsigned long flags;
-
-	irq = irq_from_evtchn(evtchn);
-	desc = irq_to_desc(irq);
-
-	if (!desc)
-		return;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	selected_cpu = this_cpu_read(bind_last_selected_cpu);
-	selected_cpu = cpumask_next_and(selected_cpu,
-			desc->irq_common_data.affinity, cpu_online_mask);
-
-	if (unlikely(selected_cpu >= nr_cpu_ids))
-		selected_cpu = cpumask_first_and(desc->irq_common_data.affinity,
-				cpu_online_mask);
-
-	this_cpu_write(bind_last_selected_cpu, selected_cpu);
-
-	/* unmask expects irqs to be disabled */
-	xen_set_affinity_evtchn(desc, selected_cpu);
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
 static long evtchn_ioctl(struct file *file,
			 unsigned int cmd, unsigned long arg)
 {
@@ -508,10 +478,8 @@ static long evtchn_ioctl(struct file *fi
			break;

		rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
-		if (rc == 0) {
+		if (rc == 0)
			rc = bind_interdomain.local_port;
-			evtchn_bind_interdom_next_vcpu(rc);
-		}
		break;
	}
There is absolutely no reason to mimic the x86 deferred affinity setting. This mechanism is required to handle the hardware-induced issues of IO/APIC and MSI and is not in use when the interrupts are remapped.
XEN does not need this and can simply change the affinity from the calling context. The core code invokes this with the interrupt descriptor lock held so it is fully serialized against any other operation.
Mark the interrupts with IRQ_MOVE_PCNTXT to disable the deferred affinity setting. The conditional mask/unmask operation is already handled in xen_rebind_evtchn_to_cpu().
This makes XEN on x86 use the same mechanics as on e.g. ARM64 where deferred affinity setting is not required and not implemented and the code path in the ack functions is compiled out.
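As a point of reference, opting out of deferred affinity setting is a one-line status flag at irq init time; a minimal sketch (the patch below adds the equivalent call in xen_irq_init()):

	#include <linux/irq.h>

	static void my_irq_init(unsigned int irq)
	{
		/*
		 * IRQ_MOVE_PCNTXT: the core applies affinity changes
		 * immediately from the calling context instead of
		 * deferring them to the next interrupt, which only
		 * IO/APIC and MSI style hardware requires.
		 */
		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
	}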
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Boris Ostrovsky boris.ostrovsky@oracle.com Cc: Juergen Gross jgross@suse.com Cc: Stefano Stabellini sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org --- drivers/xen/events/events_base.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-)
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -628,6 +628,11 @@ static void xen_irq_init(unsigned irq)
	info->refcnt = -1;

	set_info_for_irq(irq, info);
+	/*
+	 * Interrupt affinity setting can be immediate. No point
+	 * in delaying it until an interrupt is handled.
+	 */
+	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);

	INIT_LIST_HEAD(&info->eoi_list);
	list_add_tail(&info->list, &xen_irq_list_head);
@@ -739,18 +744,7 @@ static void eoi_pirq(struct irq_data *da
	if (!VALID_EVTCHN(evtchn))
		return;

-	if (unlikely(irqd_is_setaffinity_pending(data)) &&
-	    likely(!irqd_irq_disabled(data))) {
-		int masked = test_and_set_mask(evtchn);
-
-		clear_evtchn(evtchn);
-
-		irq_move_masked_irq(data);
-
-		if (!masked)
-			unmask_evtchn(evtchn);
-	} else
-		clear_evtchn(evtchn);
+	clear_evtchn(evtchn);

	if (pirq_needs_eoi(data->irq)) {
		rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
@@ -1641,7 +1635,6 @@ void rebind_evtchn_irq(evtchn_port_t evt
	mutex_unlock(&irq_mapping_update_lock);

	bind_evtchn_to_cpu(evtchn, info->cpu);
-	/* This will be deferred until interrupt is processed */
	irq_set_affinity(irq, cpumask_of(info->cpu));

	/* Unmask the event channel. */
@@ -1688,8 +1681,9 @@ static int set_affinity_irq(struct irq_d
			    bool force)
 {
	unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);
-	int ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
+	int ret;

+	ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
	if (!ret)
		irq_data_update_effective_affinity(data, cpumask_of(tcpu));

@@ -1719,18 +1713,7 @@ static void ack_dynirq(struct irq_data *
	if (!VALID_EVTCHN(evtchn))
		return;

-	if (unlikely(irqd_is_setaffinity_pending(data)) &&
-	    likely(!irqd_irq_disabled(data))) {
-		int masked = test_and_set_mask(evtchn);
-
-		clear_evtchn(evtchn);
-
-		irq_move_masked_irq(data);
-
-		if (!masked)
-			unmask_evtchn(evtchn);
-	} else
-		clear_evtchn(evtchn);
+	clear_evtchn(evtchn);
 }

 static void mask_ack_dynirq(struct irq_data *data)
All event channel setups bind the interrupt on CPU0 or the target CPU for percpu interrupts and overwrite the affinity mask with the corresponding cpumask. That does not make sense.
The XEN implementation of irqchip::irq_set_affinity() already picks a single target CPU out of the affinity mask and the actual target is stored in the effective CPU mask, so destroying the user chosen affinity mask which might contain more than one CPU is wrong.
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated. Only keep the enforcement for real percpu interrupts.
On resume the overwrite is not required either because info->cpu and the affinity mask are still the same as at the time of suspend. Same for rebind_evtchn_irq().
This also prepares for proper interrupt spreading.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Boris Ostrovsky boris.ostrovsky@oracle.com Cc: Juergen Gross jgross@suse.com Cc: Stefano Stabellini sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org --- drivers/xen/events/events_base.c | 42 ++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 14 deletions(-)
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -433,15 +433,20 @@ static bool pirq_needs_eoi_flag(unsigned
	return info->u.pirq.flags & PIRQ_NEEDS_EOI;
 }

-static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu)
+static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu,
+			       bool force_affinity)
 {
	int irq = get_evtchn_to_irq(evtchn);
	struct irq_info *info = info_for_irq(irq);

	BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
-	cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
-#endif
+
+	if (IS_ENABLED(CONFIG_SMP) && force_affinity) {
+		cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
+		cpumask_copy(irq_get_effective_affinity_mask(irq),
+			     cpumask_of(cpu));
+	}
+
	xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu);

	info->cpu = cpu;
@@ -788,7 +793,7 @@ static unsigned int __startup_pirq(unsig
		goto err;

	info->evtchn = evtchn;
-	bind_evtchn_to_cpu(evtchn, 0);
+	bind_evtchn_to_cpu(evtchn, 0, false);

	rc = xen_evtchn_port_setup(evtchn);
	if (rc)
@@ -1107,8 +1112,8 @@ static int bind_evtchn_to_irq_chip(evtch
			irq = ret;
			goto out;
		}
-		/* New interdomain events are bound to VCPU 0. */
-		bind_evtchn_to_cpu(evtchn, 0);
+		/* New interdomain events are initially bound to VCPU 0. */
+		bind_evtchn_to_cpu(evtchn, 0, false);
	} else {
		struct irq_info *info = info_for_irq(irq);
		WARN_ON(info == NULL || info->type != IRQT_EVTCHN);
@@ -1156,7 +1161,11 @@ static int bind_ipi_to_irq(unsigned int
			irq = ret;
			goto out;
		}
-		bind_evtchn_to_cpu(evtchn, cpu);
+		/*
+		 * Force the affinity mask to the target CPU so proc shows
+		 * the correct target.
+		 */
+		bind_evtchn_to_cpu(evtchn, cpu, true);
	} else {
		struct irq_info *info = info_for_irq(irq);
		WARN_ON(info == NULL || info->type != IRQT_IPI);
@@ -1269,7 +1278,11 @@ int bind_virq_to_irq(unsigned int virq,
			goto out;
		}

-		bind_evtchn_to_cpu(evtchn, cpu);
+		/*
+		 * Force the affinity mask for percpu interrupts so proc
+		 * shows the correct target.
+		 */
+		bind_evtchn_to_cpu(evtchn, cpu, percpu);
	} else {
		struct irq_info *info = info_for_irq(irq);
		WARN_ON(info == NULL || info->type != IRQT_VIRQ);
@@ -1634,8 +1647,7 @@ void rebind_evtchn_irq(evtchn_port_t evt

	mutex_unlock(&irq_mapping_update_lock);

-	bind_evtchn_to_cpu(evtchn, info->cpu);
-	irq_set_affinity(irq, cpumask_of(info->cpu));
+	bind_evtchn_to_cpu(evtchn, info->cpu, false);

	/* Unmask the event channel. */
	enable_irq(irq);
@@ -1669,7 +1681,7 @@ static int xen_rebind_evtchn_to_cpu(evtc
	 * it, but don't do the xenlinux-level rebind in that case.
	 */
	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
-		bind_evtchn_to_cpu(evtchn, tcpu);
+		bind_evtchn_to_cpu(evtchn, tcpu, false);

	if (!masked)
		unmask_evtchn(evtchn);
@@ -1798,7 +1810,8 @@ static void restore_cpu_virqs(unsigned i

		/* Record the new mapping. */
		(void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
-		bind_evtchn_to_cpu(evtchn, cpu);
+		/* The affinity mask is still valid */
+		bind_evtchn_to_cpu(evtchn, cpu, false);
	}
 }

@@ -1823,7 +1836,8 @@ static void restore_cpu_ipis(unsigned in

		/* Record the new mapping. */
		(void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);
-		bind_evtchn_to_cpu(evtchn, cpu);
+		/* The affinity mask is still valid */
+		bind_evtchn_to_cpu(evtchn, cpu, false);
	}
 }
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
All event channel setups bind the interrupt on CPU0 or the target CPU for percpu interrupts and overwrite the affinity mask with the corresponding cpumask. That does not make sense.
The XEN implementation of irqchip::irq_set_affinity() already picks a single target CPU out of the affinity mask and the actual target is stored in the effective CPU mask, so destroying the user chosen affinity mask which might contain more than one CPU is wrong.
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated.
If that's the case then I wonder whether we need this call at all and instead bind at startup time.
-boris
Only keep the enforcement for real percpu interrupts.
On Thu, Dec 10 2020 at 18:20, boris ostrovsky wrote:
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
All event channel setups bind the interrupt on CPU0 or the target CPU for percpu interrupts and overwrite the affinity mask with the corresponding cpumask. That does not make sense.
The XEN implementation of irqchip::irq_set_affinity() already picks a single target CPU out of the affinity mask and the actual target is stored in the effective CPU mask, so destroying the user chosen affinity mask which might contain more than one CPU is wrong.
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated.
If that's the case then I wonder whether we need this call at all and instead bind at startup time.
I was wondering about that, but my knowledge about the Xen internal requirements is pretty limited. The current set at least survived basic testing by Jürgen.
Thanks,
tglx
On 11.12.20 00:20, boris.ostrovsky@oracle.com wrote:
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
All event channel setups bind the interrupt on CPU0 or the target CPU for percpu interrupts and overwrite the affinity mask with the corresponding cpumask. That does not make sense.
The XEN implementation of irqchip::irq_set_affinity() already picks a single target CPU out of the affinity mask and the actual target is stored in the effective CPU mask, so destroying the user chosen affinity mask which might contain more than one CPU is wrong.
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated.
If that's the case then I wonder whether we need this call at all and instead bind at startup time.
This binding to cpu0 was introduced with commit 97253eeeb792d61ed2 and I have no reason to believe the underlying problem has been eliminated.
Juergen
On 11.12.20 00:20, boris.ostrovsky@oracle.com wrote:
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
All event channel setups bind the interrupt on CPU0 or the target CPU for percpu interrupts and overwrite the affinity mask with the corresponding cpumask. That does not make sense.
The XEN implementation of irqchip::irq_set_affinity() already picks a single target CPU out of the affinity mask and the actual target is stored in the effective CPU mask, so destroying the user chosen affinity mask which might contain more than one CPU is wrong.
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated.
If that's the case then I wonder whether we need this call at all and instead bind at startup time.
After some discussion with Thomas on IRC and xen-devel archaeology the result is: this will be needed especially for systems running on a single vcpu (e.g. small guests), as the .irq_set_affinity() callback won't be called in this case when starting the irq.
Juergen
On Fri, Dec 11 2020 at 13:10, Jürgen Groß wrote:
On 11.12.20 00:20, boris.ostrovsky@oracle.com wrote:
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
All event channel setups bind the interrupt on CPU0 or the target CPU for percpu interrupts and overwrite the affinity mask with the corresponding cpumask. That does not make sense.
The XEN implementation of irqchip::irq_set_affinity() already picks a single target CPU out of the affinity mask and the actual target is stored in the effective CPU mask, so destroying the user chosen affinity mask which might contain more than one CPU is wrong.
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated.
If that's the case then I wonder whether we need this call at all and instead bind at startup time.
After some discussion with Thomas on IRC and xen-devel archaeology the result is: this will be needed especially for systems running on a single vcpu (e.g. small guests), as the .irq_set_affinity() callback won't be called in this case when starting the irq.
That's right, but not limited to ARM. The same problem exists on x86 UP. So yes, the call makes sense, but the changelog is not really useful. Let me add a comment to this.
Thanks,
tglx
On 12/11/20 7:37 AM, Thomas Gleixner wrote:
On Fri, Dec 11 2020 at 13:10, Jürgen Groß wrote:
On 11.12.20 00:20, boris.ostrovsky@oracle.com wrote:
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
All event channel setups bind the interrupt on CPU0 or the target CPU for percpu interrupts and overwrite the affinity mask with the corresponding cpumask. That does not make sense.
The XEN implementation of irqchip::irq_set_affinity() already picks a single target CPU out of the affinity mask and the actual target is stored in the effective CPU mask, so destroying the user chosen affinity mask which might contain more than one CPU is wrong.
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated.
If that's the case then I wonder whether we need this call at all and instead bind at startup time.
After some discussion with Thomas on IRC and xen-devel archaeology the result is: this will be needed especially for systems running on a single vcpu (e.g. small guests), as the .irq_set_affinity() callback won't be called in this case when starting the irq.
On UP are we not then going to end up with an empty affinity mask? Or are we guaranteed to have it set to 1 by interrupt generic code?
This is actually why I brought this up in the first place --- a potential mismatch between the affinity mask and Xen-specific data (e.g. info->cpu and then protocol-specific data in event channel code). Even if they are re-synchronized later, at startup time (for SMP).
I don't see anything that would cause a problem right now but I worry that this inconsistency may come up at some point.
-boris
That's right, but not limited to ARM. The same problem exists on x86 UP. So yes, the call makes sense, but the changelog is not really useful. Let me add a comment to this.
Thanks,
tglx
On Fri, Dec 11 2020 at 09:29, boris ostrovsky wrote:
On 12/11/20 7:37 AM, Thomas Gleixner wrote:
On Fri, Dec 11 2020 at 13:10, Jürgen Groß wrote:
On 11.12.20 00:20, boris.ostrovsky@oracle.com wrote:
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated.
If that's the case then I wonder whether we need this call at all and instead bind at startup time.
After some discussion with Thomas on IRC and xen-devel archaeology the result is: this will be needed especially for systems running on a single vcpu (e.g. small guests), as the .irq_set_affinity() callback won't be called in this case when starting the irq.
On UP are we not then going to end up with an empty affinity mask? Or are we guaranteed to have it set to 1 by interrupt generic code?
An UP kernel never looks at the affinity mask. The chip::irq_set_affinity() callback is not invoked, so the mask is irrelevant.

An SMP kernel on a UP machine sets CPU0 in the mask, so all is good.
This is actually why I brought this up in the first place --- a potential mismatch between the affinity mask and Xen-specific data (e.g. info->cpu and then protocol-specific data in event channel code). Even if they are re-synchronized later, at startup time (for SMP).
Which is not a problem either. The affinity mask is only relevant for setting the affinity, but it's not relevant for delivery and never can be.
I don't see anything that would cause a problem right now but I worry that this inconsistency may come up at some point.
As long as the affinity mask does not become part of the event channel magic, this should never matter.
Look at it from hardware:
interrupt is affine to CPU0
CPU0 runs:
set_affinity(CPU0 -> CPU1)
local_irq_disable()
--> interrupt is raised in hardware and pending on CPU0
irq hardware is reconfigured to be affine to CPU1
local_irq_enable()
--> interrupt is handled on CPU0
the next interrupt will be raised on CPU1
So info->cpu, which is registered via the hypercall, binds the 'hardware delivery': whenever a new affinity is written, the channel is rebound to some other CPU and the next interrupt is raised on that CPU.

It's not any different from the hardware example, at least not as far as I understood the code.
Thanks,
tglx
On 11/12/2020 21:27, Thomas Gleixner wrote:
On Fri, Dec 11 2020 at 09:29, boris ostrovsky wrote:
On 12/11/20 7:37 AM, Thomas Gleixner wrote:
On Fri, Dec 11 2020 at 13:10, Jürgen Groß wrote:
On 11.12.20 00:20, boris.ostrovsky@oracle.com wrote:
On 12/10/20 2:26 PM, Thomas Gleixner wrote:
Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt, the affinity will be assigned out of the affinity mask and the XEN binding will be updated.
If that's the case then I wonder whether we need this call at all and instead bind at startup time.
After some discussion with Thomas on IRC and xen-devel archaeology the result is: this will be needed especially for systems running on a single vcpu (e.g. small guests), as the .irq_set_affinity() callback won't be called in this case when starting the irq.
On UP are we not then going to end up with an empty affinity mask? Or are we guaranteed to have it set to 1 by interrupt generic code?
An UP kernel never looks at the affinity mask. The chip::irq_set_affinity() callback is not invoked, so the mask is irrelevant.

An SMP kernel on a UP machine sets CPU0 in the mask, so all is good.
This is actually why I brought this up in the first place --- a potential mismatch between the affinity mask and Xen-specific data (e.g. info->cpu and then protocol-specific data in event channel code). Even if they are re-synchronized later, at startup time (for SMP).
Which is not a problem either. The affinity mask is only relevant for setting the affinity, but it's not relevant for delivery and never can be.
I don't see anything that would cause a problem right now but I worry that this inconsistency may come up at some point.
As long as the affinity mask does not become part of the event channel magic, this should never matter.
Look at it from hardware:
interrupt is affine to CPU0
CPU0 runs:
set_affinity(CPU0 -> CPU1)
local_irq_disable()
--> interrupt is raised in hardware and pending on CPU0
irq hardware is reconfigured to be affine to CPU1
local_irq_enable()
--> interrupt is handled on CPU0
the next interrupt will be raised on CPU1
So info->cpu, which is registered via the hypercall, binds the 'hardware delivery': whenever a new affinity is written, the channel is rebound to some other CPU and the next interrupt is raised on that CPU.

It's not any different from the hardware example, at least not as far as I understood the code.
Xen's event channels do have a couple of quirks.
Binding an event channel always results in one spurious event being delivered. This is to cover notifications which can get lost during the bidirectional setup, or re-setups in certain configurations.
Binding an interdomain or pirq event channel always defaults to vCPU0. There is no way to atomically set the affinity while binding. I believe the API predates SMP guest support in Xen, and no one has fixed it up since.
As a consequence, the guest will observe the event raised on vCPU0 as part of setting up the event, even if it attempts to set a different affinity immediately afterwards. A little bit of care needs to be taken when binding an event channel on vCPUs other than 0, to ensure that the callback is safe with respect to any remaining state needing initialisation.
Beyond this, there is nothing magic I'm aware of.
We have seen soft lockups before in certain scenarios, simply due to the quantity of events hitting vCPU0 before irqbalance gets around to spreading the load. This is why there is an attempt to round-robin the userspace event channel affinities by default, but I still don't see why this would need custom affinity logic itself.
Thanks,
~Andrew
Andrew,
On Fri, Dec 11 2020 at 22:21, Andrew Cooper wrote:
On 11/12/2020 21:27, Thomas Gleixner wrote:
It's not any different from the hardware example at least not as far as I understood the code.
Xen's event channels do have a couple of quirks.
Why am I not surprised?
Binding an event channel always results in one spurious event being delivered. This is to cover notifications which can get lost during the bidirectional setup, or re-setups in certain configurations.
Binding an interdomain or pirq event channel always defaults to vCPU0. There is no way to atomically set the affinity while binding. I believe the API predates SMP guest support in Xen, and noone has fixed it up since.
That's fine. I'm not changing that.
What I'm changing is the unwanted and unnecessary overwriting of the actual affinity mask.
We have a similar issue on real hardware where we can only target _one_ CPU and not all CPUs in the affinity mask. So we can still preserve the (user) requested mask and just affine it to one CPU, which is reflected in the effective affinity mask. This is the right thing to do for two reasons:
1) It allows proper interrupt distribution
2) It does not break (user) requested affinity when the effective target CPU goes offline and the affinity mask still contains online CPUs. If you overwrite it, you lose track of the requested broader mask.
As a consequence, the guest will observe the event raised on vCPU0 as part of setting up the event, even if it attempts to set a different affinity immediately afterwards. A little bit of care needs to be taken when binding an event channel on vCPUs other than 0, to ensure that the callback is safe with respect to any remaining state needing initialisation.
That's preserved for all non-percpu interrupts. The percpu variants of VIRQ and IPIs were already bound to vCPU != 0 before this change.
Beyond this, there is nothing magic I'm aware of.
We have seen soft lockups before in certain scenarios, simply due to the quantity of events hitting vCPU0 before irqbalance gets around to spreading the load. This is why there is an attempt to round-robin the userspace event channel affinities by default, but I still don't see why this would need custom affinity logic itself.
It's just that the previous attempt makes no sense for the reasons I outlined in the changelog. So now, with the new spreading mechanics, you get the distribution for all cases:

1) Post-setup, using and respecting the default affinity mask, which can be set as a kernel command line parameter.

2) Runtime (user) requested affinity change with a mask which contains more than one vCPU. The previous logic always chose the first one in the mask.

So assume user space affines 4 irqs to CPUs 0-3 and 4 irqs to CPUs 4-7; then 4 irqs end up on CPU0 and 4 on CPU4.

The new algorithm, which is similar to what we have on x86 (minus the vector space limitation), picks the CPU which has the least number of channels affine to it at that moment. If e.g. all 8 CPUs have the same number of vectors before that change, then in the example above the first 4 are spread across CPUs 0-3 and the second 4 across CPUs 4-7.
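A stripped-down sketch of that selection logic, with a plain array standing in for the per-CPU channel counts (hypothetical names; the actual patch later in the thread does this with atomic counters and consistency warnings):

	#include <linux/cpumask.h>
	#include <linux/kernel.h>

	static unsigned int least_loaded_cpu(const struct cpumask *dest,
					     const unsigned int *nchans)
	{
		unsigned int cpu, best = cpumask_first(dest);
		unsigned int min = UINT_MAX;

		/* Consider only online CPUs within the requested mask. */
		for_each_cpu_and(cpu, dest, cpu_online_mask) {
			if (nchans[cpu] < min) {
				min = nchans[cpu];
				best = cpu;
			}
		}
		return best;
	}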
Thanks,
tglx
To prepare for interrupt spreading, reduce the storage size of irq_info::spurious_cnt to u8 so that the flag required by the spreading logic will not increase the storage size.
Protect the usage site against overruns.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Boris Ostrovsky boris.ostrovsky@oracle.com Cc: Juergen Gross jgross@suse.com Cc: Stefano Stabellini sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org --- drivers/xen/events/events_base.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -95,7 +95,7 @@ struct irq_info {
	struct list_head list;
	struct list_head eoi_list;
	short refcnt;
-	short spurious_cnt;
+	u8 spurious_cnt;
	enum xen_irq_type type;	/* type */
	unsigned irq;
	evtchn_port_t evtchn;	/* event channel */
@@ -528,8 +528,10 @@ static void xen_irq_lateeoi_locked(struc
		return;

	if (spurious) {
-		if ((1 << info->spurious_cnt) < (HZ << 2))
-			info->spurious_cnt++;
+		if ((1 << info->spurious_cnt) < (HZ << 2)) {
+			if (info->spurious_cnt != 0xFF)
+				info->spurious_cnt++;
+		}
		if (info->spurious_cnt > 1) {
			delay = 1 << (info->spurious_cnt - 2);
			if (delay > HZ)
Keep track of the assignments of event channels to CPUs and select, from the affinity mask which is handed to irq_chip::irq_set_affinity() by the core code, the online CPU with the least assigned channels.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: Boris Ostrovsky boris.ostrovsky@oracle.com Cc: Juergen Gross jgross@suse.com Cc: Stefano Stabellini sstabellini@kernel.org Cc: xen-devel@lists.xenproject.org --- drivers/xen/events/events_base.c | 72 ++++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 8 deletions(-)
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -96,6 +96,7 @@ struct irq_info {
	struct list_head eoi_list;
	short refcnt;
	u8 spurious_cnt;
+	u8 is_accounted;
	enum xen_irq_type type;	/* type */
	unsigned irq;
	evtchn_port_t evtchn;	/* event channel */
@@ -161,6 +162,9 @@ static DEFINE_PER_CPU(int [NR_VIRQS], vi
 /* IRQ <-> IPI mapping */
 static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};

+/* Event channel distribution data */
+static atomic_t channels_on_cpu[NR_CPUS];
+
 static int **evtchn_to_irq;
 #ifdef CONFIG_X86
 static unsigned long *pirq_eoi_map;
@@ -257,6 +261,32 @@ static void set_info_for_irq(unsigned in
	irq_set_chip_data(irq, info);
 }

+/* Per CPU channel accounting */
+static void channels_on_cpu_dec(struct irq_info *info)
+{
+	if (!info->is_accounted)
+		return;
+
+	info->is_accounted = 0;
+
+	if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids))
+		return;
+
+	WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], -1 , 0));
+}
+
+static void channels_on_cpu_inc(struct irq_info *info)
+{
+	if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids))
+		return;
+
+	if (WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], 1,
+					    INT_MAX)))
+		return;
+
+	info->is_accounted = 1;
+}
+
 /* Constructors for packed IRQ information. */
 static int xen_irq_info_common_setup(struct irq_info *info,
				     unsigned irq,
@@ -339,6 +369,7 @@ static void xen_irq_info_cleanup(struct
 {
	set_evtchn_to_irq(info->evtchn, -1);
	info->evtchn = 0;
+	channels_on_cpu_dec(info);
 }

 /*
@@ -449,7 +480,9 @@ static void bind_evtchn_to_cpu(evtchn_po

	xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu);

+	channels_on_cpu_dec(info);
	info->cpu = cpu;
+	channels_on_cpu_inc(info);
 }

 /**
@@ -622,11 +655,6 @@ static void xen_irq_init(unsigned irq)
 {
	struct irq_info *info;

-#ifdef CONFIG_SMP
-	/* By default all event channels notify CPU#0. */
-	cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0));
-#endif
-
	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (info == NULL)
		panic("Unable to allocate metadata for IRQ%d\n", irq);
@@ -1691,10 +1719,34 @@ static int xen_rebind_evtchn_to_cpu(evtc
	return 0;
 }

+/*
+ * Find the CPU within @dest mask which has the least number of channels
+ * assigned. This is not precise as the per cpu counts can be modified
+ * concurrently.
+ */
+static unsigned int select_target_cpu(const struct cpumask *dest)
+{
+	unsigned int cpu, best_cpu = UINT_MAX, minch = UINT_MAX;
+
+	for_each_cpu_and(cpu, dest, cpu_online_mask) {
+		unsigned int curch = atomic_read(&channels_on_cpu[cpu]);
+
+		if (curch < minch) {
+			minch = curch;
+			best_cpu = cpu;
+		}
+	}
+
+	/* If this happens accounting is screwed up */
+	if (WARN_ON_ONCE(best_cpu == UINT_MAX))
+		best_cpu = cpumask_first(dest);
+	return best_cpu;
+}
+
 static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
			    bool force)
 {
-	unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);
+	unsigned int tcpu = select_target_cpu(dest);
	int ret;

	ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
@@ -1922,8 +1974,12 @@ void xen_irq_resume(void)
	xen_evtchn_resume();

	/* No IRQ <-> event-channel mappings. */
-	list_for_each_entry(info, &xen_irq_list_head, list)
-		info->evtchn = 0; /* zap event-channel binding */
+	list_for_each_entry(info, &xen_irq_list_head, list) {
+		/* Zap event-channel binding */
+		info->evtchn = 0;
+		/* Adjust accounting */
+		channels_on_cpu_dec(info);
+	}

	clear_evtchn_to_irq_all();
No more (ab)use in modules, finally. Remove the export so that no new ones can show up.
Signed-off-by: Thomas Gleixner tglx@linutronix.de --- kernel/irq/irqdesc.c | 1 - 1 file changed, 1 deletion(-)
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -352,7 +352,6 @@ struct irq_desc *irq_to_desc(unsigned in
 {
	return radix_tree_lookup(&irq_desc_tree, irq);
 }
-EXPORT_SYMBOL(irq_to_desc);

 static void delete_irq_desc(unsigned int irq)
 {