for_each_bit() macro family uses find_bit() functions, so it's better to have for_each_bit() and find_bit() functions in the same header.
This series puts for_each_bit() to a proper place and optimizes its usage over the kernel.
The series is based on this: https://lore.kernel.org/linux-arch/20210612123639.329047-1-yury.norov@gmail....
The full series can be found here: https://github.com/norov/linux/commits/bm-final
Yury Norov (3): include/linux: move for_each_bit() macros from bitops.h to find.h find: micro-optimize for_each_{set,clear}_bit() Replace for_each_*_bit_from() with for_each_*_bit() where appropriate
arch/x86/kernel/apic/vector.c | 4 ++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 4 ++-- drivers/hwmon/ltc2992.c | 3 +-- include/linux/bitops.h | 34 --------------------------- include/linux/find.h | 34 +++++++++++++++++++++++++++ 5 files changed, 39 insertions(+), 40 deletions(-)
for_each_bit() macros depend on find_bit() machinery, and so the proper place for them is the find.h header.
Signed-off-by: Yury Norov yury.norov@gmail.com --- include/linux/bitops.h | 34 ---------------------------------- include/linux/find.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 26bf15e6cd35..31ae1ae1a974 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -31,40 +31,6 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include <asm/bitops.h>
-#define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -/* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_from(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -#define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/* same as for_each_clear_bit() but use bit as value to start with */ -#define for_each_clear_bit_from(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/** - * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits - * @start: bit offset to start search and to store the current iteration offset - * @clump: location to store copy of current 8-bit clump - * @bits: bitmap address to base the search on - * @size: bitmap size in number of bits - */ -#define for_each_set_clump8(start, clump, bits, size) \ - for ((start) = find_first_clump8(&(clump), (bits), (size)); \ - (start) < (size); \ - (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) - static inline int get_bitmask_order(unsigned int count) { int order; diff --git a/include/linux/find.h b/include/linux/find.h index 6048f8c97418..4500e8ab93e2 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -279,4 +279,38 @@ unsigned long find_next_bit_le(const void *addr, unsigned #error "Please fix <asm/byteorder.h>" #endif
+#define for_each_set_bit(bit, addr, size) \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_from(bit, addr, size) \ + for ((bit) = find_next_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +#define for_each_clear_bit(bit, addr, size) \ + for ((bit) = find_first_zero_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/* same as for_each_clear_bit() but use bit as value to start with */ +#define for_each_clear_bit_from(bit, addr, size) \ + for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/** + * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits + * @start: bit offset to start search and to store the current iteration offset + * @clump: location to store copy of current 8-bit clump + * @bits: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_clump8(start, clump, bits, size) \ + for ((start) = find_first_clump8(&(clump), (bits), (size)); \ + (start) < (size); \ + (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) + #endif /*__LINUX_FIND_H_ */
On Fri, Jun 18, 2021 at 12:57:33PM -0700, Yury Norov wrote:
for_each_bit() macros depend on find_bit() machinery, and so the proper place for them is the find.h header.
Fine with me. Reviewed-by: Andy Shevchenko andriy.shevchenko@linux.intel.com
Signed-off-by: Yury Norov yury.norov@gmail.com
include/linux/bitops.h | 34 ---------------------------------- include/linux/find.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 26bf15e6cd35..31ae1ae1a974 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -31,40 +31,6 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include <asm/bitops.h>
-#define for_each_set_bit(bit, addr, size) \
- for ((bit) = find_first_bit((addr), (size)); \
(bit) < (size); \
(bit) = find_next_bit((addr), (size), (bit) + 1))
-/* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_from(bit, addr, size) \
- for ((bit) = find_next_bit((addr), (size), (bit)); \
(bit) < (size); \
(bit) = find_next_bit((addr), (size), (bit) + 1))
-#define for_each_clear_bit(bit, addr, size) \
- for ((bit) = find_first_zero_bit((addr), (size)); \
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
-/* same as for_each_clear_bit() but use bit as value to start with */ -#define for_each_clear_bit_from(bit, addr, size) \
- for ((bit) = find_next_zero_bit((addr), (size), (bit)); \
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
-/**
- for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
- @start: bit offset to start search and to store the current iteration offset
- @clump: location to store copy of current 8-bit clump
- @bits: bitmap address to base the search on
- @size: bitmap size in number of bits
- */
-#define for_each_set_clump8(start, clump, bits, size) \
- for ((start) = find_first_clump8(&(clump), (bits), (size)); \
(start) < (size); \
(start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))
static inline int get_bitmask_order(unsigned int count) { int order; diff --git a/include/linux/find.h b/include/linux/find.h index 6048f8c97418..4500e8ab93e2 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -279,4 +279,38 @@ unsigned long find_next_bit_le(const void *addr, unsigned #error "Please fix <asm/byteorder.h>" #endif
+#define for_each_set_bit(bit, addr, size) \
- for ((bit) = find_first_bit((addr), (size)); \
(bit) < (size); \
(bit) = find_next_bit((addr), (size), (bit) + 1))
+/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_from(bit, addr, size) \
- for ((bit) = find_next_bit((addr), (size), (bit)); \
(bit) < (size); \
(bit) = find_next_bit((addr), (size), (bit) + 1))
+#define for_each_clear_bit(bit, addr, size) \
- for ((bit) = find_first_zero_bit((addr), (size)); \
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
+/* same as for_each_clear_bit() but use bit as value to start with */ +#define for_each_clear_bit_from(bit, addr, size) \
- for ((bit) = find_next_zero_bit((addr), (size), (bit)); \
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
+/**
- for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
- @start: bit offset to start search and to store the current iteration offset
- @clump: location to store copy of current 8-bit clump
- @bits: bitmap address to base the search on
- @size: bitmap size in number of bits
- */
+#define for_each_set_clump8(start, clump, bits, size) \
- for ((start) = find_first_clump8(&(clump), (bits), (size)); \
(start) < (size); \
(start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))
#endif /*__LINUX_FIND_H_ */
2.30.2
The macros iterate through all set/clear bits in a bitmap. They search for the first bit using find_first_bit(), and for the remaining bits using find_next_bit().
Since find_next_bit() is called shortly after find_first_bit(), we can save a few lines of I-cache by not using find_first_bit().
Signed-off-by: Yury Norov yury.norov@gmail.com --- include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/find.h b/include/linux/find.h index 4500e8ab93e2..ae9ed52b52b8 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif
#define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ + for ((bit) = find_next_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1))
@@ -291,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) = find_next_bit((addr), (size), (bit) + 1))
#define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ + for ((bit) = find_next_zero_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1))
On Fri, Jun 18, 2021 at 12:57:34PM -0700, Yury Norov wrote:
The macros iterate thru all set/clear bits in a bitmap. They search a first bit using find_first_bit(), and the rest bits using find_next_bit().
Since find_next_bit() is called shortly after find_first_bit(), we can save few lines of I-cache by not using find_first_bit().
Any number available?
On Fri, 18 Jun 2021 20:57:34 +0100, Yury Norov yury.norov@gmail.com wrote:
The macros iterate thru all set/clear bits in a bitmap. They search a first bit using find_first_bit(), and the rest bits using find_next_bit().
Since find_next_bit() is called shortly after find_first_bit(), we can save few lines of I-cache by not using find_first_bit().
Really?
Signed-off-by: Yury Norov yury.norov@gmail.com
include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/find.h b/include/linux/find.h index 4500e8ab93e2..ae9ed52b52b8 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif
#define for_each_set_bit(bit, addr, size) \
- for ((bit) = find_first_bit((addr), (size)); \
- for ((bit) = find_next_bit((addr), (size), 0); \
On which architecture do you observe a gain? Only 32bit ARM and m68k implement their own version of find_first_bit(), and everyone else uses the canonical implementation:
#ifndef find_first_bit #define find_first_bit(addr, size) find_next_bit((addr), (size), 0) #endif
These architectures explicitly have different implementations for find_first_bit() and find_next_bit() because they can do better (whether that is true or not is another debate). I don't think you should remove this optimisation until it has been measured on these two architectures.
Thanks,
M.
On Sat, Jun 19, 2021 at 05:24:15PM +0100, Marc Zyngier wrote:
On Fri, 18 Jun 2021 20:57:34 +0100, Yury Norov yury.norov@gmail.com wrote:
The macros iterate thru all set/clear bits in a bitmap. They search a first bit using find_first_bit(), and the rest bits using find_next_bit().
Since find_next_bit() is called shortly after find_first_bit(), we can save few lines of I-cache by not using find_first_bit().
Really?
Signed-off-by: Yury Norov yury.norov@gmail.com
include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/find.h b/include/linux/find.h index 4500e8ab93e2..ae9ed52b52b8 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif
#define for_each_set_bit(bit, addr, size) \
- for ((bit) = find_first_bit((addr), (size)); \
- for ((bit) = find_next_bit((addr), (size), 0); \
On which architecture do you observe a gain? Only 32bit ARM and m68k implement their own version of find_first_bit(), and everyone else uses the canonical implementation:
And those who enable GENERIC_FIND_FIRST_BIT - x86, arm64, arc, mips and s390.
#ifndef find_first_bit #define find_first_bit(addr, size) find_next_bit((addr), (size), 0) #endif
These architectures explicitly have different implementations for find_first_bit() and find_next_bit() because they can do better (whether that is true or not is another debate). I don't think you should remove this optimisation until it has been measured on these two architectures.
This patch is based on a series that enables a separate implementation of find_first_bit() for all architectures; according to my tests, find_first* is ~ twice as fast as find_next* on arm64 and x86.
https://lore.kernel.org/lkml/20210612123639.329047-1-yury.norov@gmail.com/T/...
After applying the series, I noticed that my small kernel module that calls for_each_set_bit() is now using find_first_bit() to just find one bit, and find_next_bit() for all others. I think it's better to always use find_next_bit() in this case to minimize the chance of cache miss. But if it's not that obvious, I'll try to write some test.
On Sat, Jun 19, 2021 at 10:28:07AM -0700, Yury Norov wrote:
On Sat, Jun 19, 2021 at 05:24:15PM +0100, Marc Zyngier wrote:
On Fri, 18 Jun 2021 20:57:34 +0100, Yury Norov yury.norov@gmail.com wrote:
The macros iterate thru all set/clear bits in a bitmap. They search a first bit using find_first_bit(), and the rest bits using find_next_bit().
Since find_next_bit() is called shortly after find_first_bit(), we can save few lines of I-cache by not using find_first_bit().
Really?
Signed-off-by: Yury Norov yury.norov@gmail.com
include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/find.h b/include/linux/find.h index 4500e8ab93e2..ae9ed52b52b8 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif
#define for_each_set_bit(bit, addr, size) \
- for ((bit) = find_first_bit((addr), (size)); \
- for ((bit) = find_next_bit((addr), (size), 0); \
On which architecture do you observe a gain? Only 32bit ARM and m68k implement their own version of find_first_bit(), and everyone else uses the canonical implementation:
And those who enable GENERIC_FIND_FIRST_BIT - x86, arm64, arc, mips and s390.
#ifndef find_first_bit #define find_first_bit(addr, size) find_next_bit((addr), (size), 0) #endif
These architectures explicitly have different implementations for find_first_bit() and find_next_bit() because they can do better (whether that is true or not is another debate). I don't think you should remove this optimisation until it has been measured on these two architectures.
This patch is based on a series that enables a separate implementation of find_first_bit() for all architectures; according to my tests, find_first* is ~ twice as fast as find_next* on arm64 and x86.
https://lore.kernel.org/lkml/20210612123639.329047-1-yury.norov@gmail.com/T/...
After applying the series, I noticed that my small kernel module that calls for_each_set_bit() is now using find_first_bit() to just find one bit, and find_next_bit() for all others. I think it's better to always use find_next_bit() in this case to minimize the chance of cache miss. But if it's not that obvious, I'll try to write some test.
This test measures the difference between for_each_set_bit() and for_each_set_bit_from().
diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c index 5637c5711db9..1f37e99090b0 100644 --- a/lib/find_bit_benchmark.c +++ b/lib/find_bit_benchmark.c @@ -111,6 +111,59 @@ static int __init test_find_next_and_bit(const void *bitmap, return 0; }
+#ifdef CONFIG_X86_64 +#define flush_cache_all() wbinvd() +#endif + +static int __init test_for_each_set_bit(int flags) +{ +#ifdef flush_cache_all + DECLARE_BITMAP(bm, BITS_PER_LONG * 2); + unsigned long i, cnt = 0; + ktime_t time; + + bm[0] = 1; bm[1] = 0; + + time = ktime_get(); + while (cnt < 1000) { + if (flags) + flush_cache_all(); + for_each_set_bit(i, bm, BITS_PER_LONG * 2) + cnt++; + } + + time = ktime_get() - time; + + pr_err("for_each_set_bit: %18llu ns, %6ld iterations\n", time, cnt); +#endif + return 0; +} + +static int __init test_for_each_set_bit_from(int flags) +{ +#ifdef flush_cache_all + DECLARE_BITMAP(bm, BITS_PER_LONG * 2); + unsigned long i, cnt = 0; + ktime_t time; + + bm[0] = 1; bm[1] = 0; + + time = ktime_get(); + while (cnt < 1000) { + if (flags) + flush_cache_all(); + i = 0; + for_each_set_bit_from(i, bm, BITS_PER_LONG * 2) + cnt++; + } + + time = ktime_get() - time; + + pr_err("for_each_set_bit_from:%16llu ns, %6ld iterations\n", time, cnt); +#endif + return 0; +} + static int __init find_bit_test(void) { unsigned long nbits = BITMAP_LEN / SPARSE; @@ -147,6 +200,16 @@ static int __init find_bit_test(void) test_find_first_bit(bitmap, BITMAP_LEN); test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN);
+ pr_err("\nStart testing for_each_bit()\n"); + + test_for_each_set_bit(0); + test_for_each_set_bit_from(0); + + pr_err("\nStart testing for_each_bit() with cache flushing\n"); + + test_for_each_set_bit(1); + test_for_each_set_bit_from(1); + /* * Everything is OK. Return error just to let user run benchmark * again without annoying rmmod.
Here on each iteration: - for_each_set_bit() calls find_first_bit() once, and find_next_bit() once. - for_each_set_bit_from() calls find_next_bit() twice.
On my AMD Ryzen 7 4700U, the result is like this:
Start testing for_each_bit() for_each_set_bit: 15296 ns, 1000 iterations for_each_set_bit_from: 15225 ns, 1000 iterations
Start testing for_each_bit() with cache flushing for_each_set_bit: 547626 ns, 1000 iterations for_each_set_bit_from: 497899 ns, 1000 iterations
for_each_set_bit_from() is ~10% faster than for_each_set_bit() in case of cold caches, and no significant difference was observed if flush_cache_all() is not called.
So, it looks reasonable to switch for_each_set_bit() to use find_next_bit() only.
Thanks, Yury
A couple of kernel functions call for_each_*_bit_from() with start bit equal to 0. Replace them with for_each_*_bit().
No functional changes, but might improve on readability.
Signed-off-by: Yury Norov yury.norov@gmail.com --- arch/x86/kernel/apic/vector.c | 4 ++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 4 ++-- drivers/hwmon/ltc2992.c | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fb67ed5e7e6a..d099ef226f55 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -760,9 +760,9 @@ void __init lapic_update_legacy_vectors(void)
void __init lapic_assign_system_vectors(void) { - unsigned int i, vector = 0; + unsigned int i, vector;
- for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + for_each_set_bit(vector, system_vectors, NR_VECTORS) irq_matrix_assign_system(vector_matrix, vector, false);
if (nr_legacy_irqs() > 1) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 4102bcea3341..42ce3287d3be 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1032,7 +1032,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) { - unsigned int i = 0; + unsigned int i;
dev_err(gpu->dev, "recover hung GPU!\n");
@@ -1045,7 +1045,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
/* complete all events, the GPU won't do it after the reset */ spin_lock(&gpu->event_spinlock); - for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS) + for_each_set_bit(i, gpu->event_bitmap, ETNA_NR_EVENTS) complete(&gpu->event_free); bitmap_zero(gpu->event_bitmap, ETNA_NR_EVENTS); spin_unlock(&gpu->event_spinlock); diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 2a4bed0ab226..7352d2b3c756 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -248,8 +248,7 @@ static int ltc2992_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask
gpio_status = reg;
- gpio_nr = 0; - for_each_set_bit_from(gpio_nr, mask, LTC2992_GPIO_NR) { + for_each_set_bit(gpio_nr, mask, LTC2992_GPIO_NR) { if (test_bit(LTC2992_GPIO_BIT(gpio_nr), &gpio_status)) set_bit(gpio_nr, bits); }
On Fri, Jun 18, 2021 at 12:57:35PM -0700, Yury Norov wrote:
A couple of kernel functions call for_each_*_bit_from() with start bit equal to 0. Replace them with for_each_*_bit().
No functional changes, but might improve on readability.
...
--- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -248,8 +248,7 @@ static int ltc2992_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask
gpio_status = reg;
- gpio_nr = 0;
- for_each_set_bit_from(gpio_nr, mask, LTC2992_GPIO_NR) {
- for_each_set_bit(gpio_nr, mask, LTC2992_GPIO_NR) { if (test_bit(LTC2992_GPIO_BIT(gpio_nr), &gpio_status)) set_bit(gpio_nr, bits); }
I would replace the entire loop by bitmap_replace() call.
Something like bitmap_replace(bits, bits, &gpio_status, mask, LTC2992_GPIO_NR);
(Good to split sometimes :-)
On Sat, Jun 19, 2021 at 01:49:58PM +0300, Andy Shevchenko wrote:
On Fri, Jun 18, 2021 at 12:57:35PM -0700, Yury Norov wrote:
A couple of kernel functions call for_each_*_bit_from() with start bit equal to 0. Replace them with for_each_*_bit().
No functional changes, but might improve on readability.
...
--- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -248,8 +248,7 @@ static int ltc2992_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask
gpio_status = reg;
- gpio_nr = 0;
- for_each_set_bit_from(gpio_nr, mask, LTC2992_GPIO_NR) {
- for_each_set_bit(gpio_nr, mask, LTC2992_GPIO_NR) { if (test_bit(LTC2992_GPIO_BIT(gpio_nr), &gpio_status)) set_bit(gpio_nr, bits); }
I would replace the entire loop by bitmap_replace() call.
Something like bitmap_replace(bits, bits, &gpio_status, mask, LTC2992_GPIO_NR);
Okay, it wouldn't work directly because it involves the LTC2992_GPIO_BIT() macro. So it's rather some kind of bitmap_remap().
On Fri, Jun 18, 2021 at 12:57:35PM -0700, Yury Norov wrote:
A couple of kernel functions call for_each_*_bit_from() with start bit equal to 0. Replace them with for_each_*_bit().
No functional changes, but might improve on readability.
Signed-off-by: Yury Norov yury.norov@gmail.com
arch/x86/kernel/apic/vector.c | 4 ++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 4 ++-- drivers/hwmon/ltc2992.c | 3 +--
This should be three different patches, one per subsystem.
Guenter
3 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fb67ed5e7e6a..d099ef226f55 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -760,9 +760,9 @@ void __init lapic_update_legacy_vectors(void)
void __init lapic_assign_system_vectors(void) {
- unsigned int i, vector = 0;
- unsigned int i, vector;
- for_each_set_bit_from(vector, system_vectors, NR_VECTORS)
for_each_set_bit(vector, system_vectors, NR_VECTORS) irq_matrix_assign_system(vector_matrix, vector, false);
if (nr_legacy_irqs() > 1)
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 4102bcea3341..42ce3287d3be 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1032,7 +1032,7 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) {
- unsigned int i = 0;
unsigned int i;
dev_err(gpu->dev, "recover hung GPU!\n");
@@ -1045,7 +1045,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
/* complete all events, the GPU won't do it after the reset */ spin_lock(&gpu->event_spinlock);
- for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS)
- for_each_set_bit(i, gpu->event_bitmap, ETNA_NR_EVENTS) complete(&gpu->event_free); bitmap_zero(gpu->event_bitmap, ETNA_NR_EVENTS); spin_unlock(&gpu->event_spinlock);
diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 2a4bed0ab226..7352d2b3c756 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -248,8 +248,7 @@ static int ltc2992_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask
gpio_status = reg;
- gpio_nr = 0;
- for_each_set_bit_from(gpio_nr, mask, LTC2992_GPIO_NR) {
- for_each_set_bit(gpio_nr, mask, LTC2992_GPIO_NR) { if (test_bit(LTC2992_GPIO_BIT(gpio_nr), &gpio_status)) set_bit(gpio_nr, bits); }
On Mon, Jun 21, 2021 at 01:17:11PM -0700, Guenter Roeck wrote:
On Fri, Jun 18, 2021 at 12:57:35PM -0700, Yury Norov wrote:
A couple of kernel functions call for_each_*_bit_from() with start bit equal to 0. Replace them with for_each_*_bit().
No functional changes, but might improve on readability.
Signed-off-by: Yury Norov yury.norov@gmail.com
arch/x86/kernel/apic/vector.c | 4 ++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 4 ++-- drivers/hwmon/ltc2992.c | 3 +--
This should be three different patches, one per subsystem.
It was discussed recently. https://lore.kernel.org/linux-arch/20210614180706.1e8564854bfed648dd4c039b@l...
Ping?
On Fri, Jun 18, 2021 at 12:57:32PM -0700, Yury Norov wrote:
for_each_bit() macro family uses find_bit() functions, so it's better to have for_each_bit() and find_bit() functions in the same header.
This series puts for_each_bit() to a proper place and optimizes its usage over the kernel.
The series is based on this: https://lore.kernel.org/linux-arch/20210612123639.329047-1-yury.norov@gmail....
The full series can be found here: https://github.com/norov/linux/commits/bm-final
Yury Norov (3): include/linux: move for_each_bit() macros from bitops.h to find.h find: micro-optimize for_each_{set,clear}_bit() Replace for_each_*_bit_from() with for_each_*_bit() where appropriate
arch/x86/kernel/apic/vector.c | 4 ++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 4 ++-- drivers/hwmon/ltc2992.c | 3 +-- include/linux/bitops.h | 34 --------------------------- include/linux/find.h | 34 +++++++++++++++++++++++++++ 5 files changed, 39 insertions(+), 40 deletions(-)
-- 2.30.2
dri-devel@lists.freedesktop.org