Greetings,
I met $subject in master-rt post drm merge, but taking the config (attached) to virgin v4.12-10624-g9967468c0a10, it's reproducible.
KERNEL: vmlinux-4.12.0.g9967468-preempt.gz DUMPFILE: vmcore CPUS: 8 DATE: Tue Jul 11 18:55:28 2017 UPTIME: 00:02:03 LOAD AVERAGE: 3.43, 1.39, 0.52 TASKS: 467 NODENAME: homer RELEASE: 4.12.0.g9967468-preempt VERSION: #155 SMP PREEMPT Tue Jul 11 18:18:11 CEST 2017 MACHINE: x86_64 (3591 Mhz) MEMORY: 16 GB PANIC: "BUG: unable to handle kernel paging request at ffffffffa022990f" PID: 4658 COMMAND: "kworker/u16:26" TASK: ffff8803c6068f80 [THREAD_INFO: ffff8803c6068f80] CPU: 7 STATE: TASK_RUNNING (PANIC)
crash> bt PID: 4658 TASK: ffff8803c6068f80 CPU: 7 COMMAND: "kworker/u16:26" #0 [ffffc900039f76a0] machine_kexec at ffffffff810481fc #1 [ffffc900039f76f0] __crash_kexec at ffffffff81109e3a #2 [ffffc900039f77b0] crash_kexec at ffffffff8110adc9 #3 [ffffc900039f77c8] oops_end at ffffffff8101d059 #4 [ffffc900039f77e8] no_context at ffffffff81055ce5 #5 [ffffc900039f7838] do_page_fault at ffffffff81056c5b #6 [ffffc900039f7860] page_fault at ffffffff81690a88 [exception RIP: report_bug+93] RIP: ffffffff8167227d RSP: ffffc900039f7918 RFLAGS: 00010002 RAX: ffffffffa0229905 RBX: ffffffffa020af0f RCX: 0000000000000001 RDX: 0000000000000907 RSI: ffffffffa020af11 RDI: ffffffffffff98f6 RBP: ffffc900039f7a58 R8: 0000000000000001 R9: 00000000000003fc R10: ffffffff81a01906 R11: ffff8803f84711f8 R12: ffffffffa02231fb R13: 0000000000000260 R14: 0000000000000004 R15: 0000000000000006 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffffc900039f7910] report_bug at ffffffff81672248 #8 [ffffc900039f7938] fixup_bug at ffffffff8101af85 #9 [ffffc900039f7950] do_trap at ffffffff8101b0d9 #10 [ffffc900039f79a0] do_error_trap at ffffffff8101b190 #11 [ffffc900039f7a50] invalid_op at ffffffff8169063e [exception RIP: drm_calc_vbltimestamp_from_scanoutpos+335] RIP: ffffffffa020af0f RSP: ffffc900039f7b00 RFLAGS: 00010086 RAX: ffffffffa04fa100 RBX: ffff8803f9550800 RCX: 0000000000000001 RDX: ffffffffa0228a58 RSI: 0000000000000001 RDI: ffffffffa022321b RBP: ffffc900039f7b80 R8: 0000000000000000 R9: ffffffffa020adc0 R10: ffffffffa048a1b0 R11: ffff8803f84711f8 R12: 0000000000000001 R13: ffff8803f8471000 R14: ffffc900039f7b94 R15: ffffc900039f7bd0 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #12 [ffffc900039f7b18] gf119_head_vblank_put at ffffffffa04422f9 [nouveau] #13 [ffffc900039f7b88] drm_get_last_vbltimestamp at ffffffffa020ad91 [drm] #14 [ffffc900039f7ba8] drm_update_vblank_count at ffffffffa020b3e1 [drm] #15 [ffffc900039f7c10] drm_vblank_disable_and_save at ffffffffa020bbe9 [drm] #16 
[ffffc900039f7c40] drm_crtc_vblank_off at ffffffffa020c3c0 [drm] #17 [ffffc900039f7cb0] nouveau_display_fini at ffffffffa048a4d6 [nouveau] #18 [ffffc900039f7ce0] nouveau_display_suspend at ffffffffa048ac4f [nouveau] #19 [ffffc900039f7d00] nouveau_do_suspend at ffffffffa047e5ec [nouveau] #20 [ffffc900039f7d38] nouveau_pmops_suspend at ffffffffa047e77d [nouveau] #21 [ffffc900039f7d50] pci_pm_suspend at ffffffff813b1ff0 #22 [ffffc900039f7d80] dpm_run_callback at ffffffff814c4dbd #23 [ffffc900039f7db8] __device_suspend at ffffffff814c5a61 #24 [ffffc900039f7e30] async_suspend at ffffffff814c5cfa #25 [ffffc900039f7e48] async_run_entry_fn at ffffffff81091683 #26 [ffffc900039f7e70] process_one_work at ffffffff810882bc #27 [ffffc900039f7eb0] worker_thread at ffffffff8108854a #28 [ffffc900039f7f10] kthread at ffffffff8108e387 #29 [ffffc900039f7f50] ret_from_fork at ffffffff8168fa85 crash> gdb list *drm_calc_vbltimestamp_from_scanoutpos+335 0xffffffffa020af0f is in drm_calc_vbltimestamp_from_scanoutpos (drivers/gpu/drm/drm_vblank.c:608). 603 /* If mode timing undefined, just return as no-op: 604 * Happens during initial modesetting of a crtc. 605 */ 606 if (mode->crtc_clock == 0) { 607 DRM_DEBUG("crtc %u: Noop due to uninitialized mode.\n", pipe); 608 WARN_ON_ONCE(drm_drv_uses_atomic_modeset(dev)); 609 610 return false; 611 } 612 crash> gdb list *report_bug+93 0xffffffff8167227d is in report_bug (lib/bug.c:177). 172 return BUG_TRAP_TYPE_WARN; 173 174 /* 175 * Since this is the only store, concurrency is not an issue. 176 */ 177 bug->flags |= BUGFLAG_DONE; 178 } 179 } 180 181 if (warning) { crash>
Some details that may be useful in analysis of the bug:
1. lspci -nn -d 10de: 2. What displays, if any, you have plugged into the NVIDIA board when this happens? 3. Any boot parameters, esp relating to ACPI, PM, or related?
Cheers,
-ilia
On Tue, Jul 11, 2017 at 1:32 PM, Mike Galbraith efault@gmx.de wrote:
Greetings,
I met $subject in master-rt post drm merge, but taking the config (attached) to virgin v4.12-10624-g9967468c0a10, it's reproducible.
KERNEL: vmlinux-4.12.0.g9967468-preempt.gz DUMPFILE: vmcore CPUS: 8 DATE: Tue Jul 11 18:55:28 2017 UPTIME: 00:02:03
LOAD AVERAGE: 3.43, 1.39, 0.52 TASKS: 467 NODENAME: homer RELEASE: 4.12.0.g9967468-preempt VERSION: #155 SMP PREEMPT Tue Jul 11 18:18:11 CEST 2017 MACHINE: x86_64 (3591 Mhz) MEMORY: 16 GB PANIC: "BUG: unable to handle kernel paging request at ffffffffa022990f" PID: 4658 COMMAND: "kworker/u16:26" TASK: ffff8803c6068f80 [THREAD_INFO: ffff8803c6068f80] CPU: 7 STATE: TASK_RUNNING (PANIC)
crash> bt PID: 4658 TASK: ffff8803c6068f80 CPU: 7 COMMAND: "kworker/u16:26" #0 [ffffc900039f76a0] machine_kexec at ffffffff810481fc #1 [ffffc900039f76f0] __crash_kexec at ffffffff81109e3a #2 [ffffc900039f77b0] crash_kexec at ffffffff8110adc9 #3 [ffffc900039f77c8] oops_end at ffffffff8101d059 #4 [ffffc900039f77e8] no_context at ffffffff81055ce5 #5 [ffffc900039f7838] do_page_fault at ffffffff81056c5b #6 [ffffc900039f7860] page_fault at ffffffff81690a88 [exception RIP: report_bug+93] RIP: ffffffff8167227d RSP: ffffc900039f7918 RFLAGS: 00010002 RAX: ffffffffa0229905 RBX: ffffffffa020af0f RCX: 0000000000000001 RDX: 0000000000000907 RSI: ffffffffa020af11 RDI: ffffffffffff98f6 RBP: ffffc900039f7a58 R8: 0000000000000001 R9: 00000000000003fc R10: ffffffff81a01906 R11: ffff8803f84711f8 R12: ffffffffa02231fb R13: 0000000000000260 R14: 0000000000000004 R15: 0000000000000006 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffffc900039f7910] report_bug at ffffffff81672248 #8 [ffffc900039f7938] fixup_bug at ffffffff8101af85 #9 [ffffc900039f7950] do_trap at ffffffff8101b0d9 #10 [ffffc900039f79a0] do_error_trap at ffffffff8101b190 #11 [ffffc900039f7a50] invalid_op at ffffffff8169063e [exception RIP: drm_calc_vbltimestamp_from_scanoutpos+335] RIP: ffffffffa020af0f RSP: ffffc900039f7b00 RFLAGS: 00010086 RAX: ffffffffa04fa100 RBX: ffff8803f9550800 RCX: 0000000000000001 RDX: ffffffffa0228a58 RSI: 0000000000000001 RDI: ffffffffa022321b RBP: ffffc900039f7b80 R8: 0000000000000000 R9: ffffffffa020adc0 R10: ffffffffa048a1b0 R11: ffff8803f84711f8 R12: 0000000000000001 R13: ffff8803f8471000 R14: ffffc900039f7b94 R15: ffffc900039f7bd0 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #12 [ffffc900039f7b18] gf119_head_vblank_put at ffffffffa04422f9 [nouveau] #13 [ffffc900039f7b88] drm_get_last_vbltimestamp at ffffffffa020ad91 [drm] #14 [ffffc900039f7ba8] drm_update_vblank_count at ffffffffa020b3e1 [drm] #15 [ffffc900039f7c10] drm_vblank_disable_and_save at ffffffffa020bbe9 [drm] #16 
[ffffc900039f7c40] drm_crtc_vblank_off at ffffffffa020c3c0 [drm] #17 [ffffc900039f7cb0] nouveau_display_fini at ffffffffa048a4d6 [nouveau] #18 [ffffc900039f7ce0] nouveau_display_suspend at ffffffffa048ac4f [nouveau] #19 [ffffc900039f7d00] nouveau_do_suspend at ffffffffa047e5ec [nouveau] #20 [ffffc900039f7d38] nouveau_pmops_suspend at ffffffffa047e77d [nouveau] #21 [ffffc900039f7d50] pci_pm_suspend at ffffffff813b1ff0 #22 [ffffc900039f7d80] dpm_run_callback at ffffffff814c4dbd #23 [ffffc900039f7db8] __device_suspend at ffffffff814c5a61 #24 [ffffc900039f7e30] async_suspend at ffffffff814c5cfa #25 [ffffc900039f7e48] async_run_entry_fn at ffffffff81091683 #26 [ffffc900039f7e70] process_one_work at ffffffff810882bc #27 [ffffc900039f7eb0] worker_thread at ffffffff8108854a #28 [ffffc900039f7f10] kthread at ffffffff8108e387 #29 [ffffc900039f7f50] ret_from_fork at ffffffff8168fa85 crash> gdb list *drm_calc_vbltimestamp_from_scanoutpos+335 0xffffffffa020af0f is in drm_calc_vbltimestamp_from_scanoutpos (drivers/gpu/drm/drm_vblank.c:608). 603 /* If mode timing undefined, just return as no-op: 604 * Happens during initial modesetting of a crtc. 605 */ 606 if (mode->crtc_clock == 0) { 607 DRM_DEBUG("crtc %u: Noop due to uninitialized mode.\n", pipe); 608 WARN_ON_ONCE(drm_drv_uses_atomic_modeset(dev)); 609 610 return false; 611 } 612 crash> gdb list *report_bug+93 0xffffffff8167227d is in report_bug (lib/bug.c:177). 172 return BUG_TRAP_TYPE_WARN; 173 174 /* 175 * Since this is the only store, concurrency is not an issue. 176 */ 177 bug->flags |= BUGFLAG_DONE; 178 } 179 } 180 181 if (warning) { crash>
On Tue, 2017-07-11 at 13:51 -0400, Ilia Mirkin wrote:
Some details that may be useful in analysis of the bug:
- lspci -nn -d 10de:
01:00.0 VGA compatible controller [0300]: NVIDIA Corporation GM204 [GeForce GTX 980] [10de:13c0] (rev a1) 01:00.1 Audio device [0403]: NVIDIA Corporation GM204 High Definition Audio Controller [10de:0fbb] (rev a1)
- What displays, if any, you have plugged into the NVIDIA board when
this happens?
A Philips 273V, via DVI.
- Any boot parameters, esp relating to ACPI, PM, or related?
None for those, what's there that will be unfamiliar to you are for patches that aren't applied.
nortsched hpc_cpusets skew_tick=1 ftrace_dump_on_oops audit=0 nodelayacct cgroup_disable=memory rtkthreads=1 rtworkqueues=2 panic=60 ignore_loglevel crashkernel=256M,high
-Mike
On Tue, Jul 11, 2017 at 2:08 PM, Mike Galbraith efault@gmx.de wrote:
On Tue, 2017-07-11 at 13:51 -0400, Ilia Mirkin wrote:
Some details that may be useful in analysis of the bug:
- lspci -nn -d 10de:
01:00.0 VGA compatible controller [0300]: NVIDIA Corporation GM204 [GeForce GTX 980] [10de:13c0] (rev a1) 01:00.1 Audio device [0403]: NVIDIA Corporation GM204 High Definition Audio Controller [10de:0fbb] (rev a1)
- What displays, if any, you have plugged into the NVIDIA board when
this happens?
A Philips 273V, via DVI.
- Any boot parameters, esp relating to ACPI, PM, or related?
None for those, what's there that will be unfamiliar to you are for patches that aren't applied.
nortsched hpc_cpusets skew_tick=1 ftrace_dump_on_oops audit=0 nodelayacct cgroup_disable=memory rtkthreads=1 rtworkqueues=2 panic=60 ignore_loglevel crashkernel=256M,high
OK, thanks. So in other words, a fairly standard desktop with a PCIe board plugged in. No funny business. (Laptops can create a ton of additional weirdness, which I assumed you had since you were talking about STR.)
My best guess is that gf119_head_vblank_put either has a bogus head id (should be in the 0..3 range) which causes it to do an out-of-bounds read on MMIO space, or that the MMIO mapping has already been removed by the time nouveau_display_suspend runs. Adding Ben Skeggs for additional insight.
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Cheers,
-ilia
On Tue, 2017-07-11 at 14:22 -0400, Ilia Mirkin wrote:
OK, thanks. So in other words, a fairly standard desktop with a PCIe board plugged in. No funny business. (Laptops can create a ton of additional weirdness, which I assumed you had since you were talking about STR.)
Yup, garden variety deskside box.
My best guess is that gf119_head_vblank_put either has a bogus head id (should be in the 0..3 range) which causes it to do an out-of-bounds read on MMIO space, or that the MMIO mapping has already been removed by the time nouveau_display_suspend runs. Adding Ben Skeggs for additional insight.
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Vacation -> back to work happens in the very early AM, so bisection will have to wait a bit.
-Mike
On Tue, 2017-07-11 at 20:53 +0200, Mike Galbraith wrote:
On Tue, 2017-07-11 at 14:22 -0400, Ilia Mirkin wrote:
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Vacation -> back to work happens in the very early AM, so bisection will have to wait a bit.
Hm, my backup workstation (old GeForce 8600 GT box) has the same issue, so perhaps I can bisect it as I work on backlog (multitasking: screw up multiple tasks concurrently).
-Mike
On Tue, 2017-07-11 at 14:22 -0400, Ilia Mirkin wrote:
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Bisection seemingly went fine, but the result is odd.
e98c58e55f68f8785aebfab1f8c9a03d8de0afe1 is the first bad commit
-Mike
On Wed, 2017-07-12 at 11:55 +0200, Mike Galbraith wrote:
On Tue, 2017-07-11 at 14:22 -0400, Ilia Mirkin wrote:
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Bisection seemingly went fine, but the result is odd.
e98c58e55f68f8785aebfab1f8c9a03d8de0afe1 is the first bad commit
But it really really is bad. Looking at gitk fork in the road leading to it...
52d9d38c183b drm/sti:fix spelling mistake: "compoment" -> "component" - good e4e818cc2d7c drm: make drm_panel.h self-contained - good 9cf8f5802f39 drm: add missing declaration to drm_blend.h - good
Before the git highway splits, all is well. The lane with commits works fine at both ends, but e98c58e55f68 is busted. Merge artifact?
-Mike
On Wed, Jul 12, 2017 at 7:25 AM, Mike Galbraith efault@gmx.de wrote:
On Wed, 2017-07-12 at 11:55 +0200, Mike Galbraith wrote:
On Tue, 2017-07-11 at 14:22 -0400, Ilia Mirkin wrote:
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Bisection seemingly went fine, but the result is odd.
e98c58e55f68f8785aebfab1f8c9a03d8de0afe1 is the first bad commit
But it really really is bad. Looking at gitk fork in the road leading to it...
52d9d38c183b drm/sti:fix spelling mistake: "compoment" -> "component" - good e4e818cc2d7c drm: make drm_panel.h self-contained - good 9cf8f5802f39 drm: add missing declaration to drm_blend.h - good
Before the git highway splits, all is well. The lane with commits works fine at both ends, but e98c58e55f68 is busted. Merge artifact?
Hmmm... that tree does not appear to have gotten a v4.12 backmerge at any point. The last backmerge from Linus as far as I can tell was v4.11-rc7. Could be an interaction with some out-of-tree change.
On Wed, 2017-07-12 at 07:37 -0400, Ilia Mirkin wrote:
On Wed, Jul 12, 2017 at 7:25 AM, Mike Galbraith efault@gmx.de wrote:
On Wed, 2017-07-12 at 11:55 +0200, Mike Galbraith wrote:
On Tue, 2017-07-11 at 14:22 -0400, Ilia Mirkin wrote:
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Bisection seemingly went fine, but the result is odd.
e98c58e55f68f8785aebfab1f8c9a03d8de0afe1 is the first bad commit
But it really really is bad. Looking at gitk fork in the road leading to it...
52d9d38c183b drm/sti:fix spelling mistake: "compoment" -> "component" - good e4e818cc2d7c drm: make drm_panel.h self-contained - good 9cf8f5802f39 drm: add missing declaration to drm_blend.h - good
Before the git highway splits, all is well. The lane with commits works fine at both ends, but e98c58e55f68 is busted. Merge artifact?
Hmmm... that tree does not appear to have gotten a v4.12 backmerge at any point. The last backmerge from Linus as far as I can tell was v4.11-rc7. Could be an interaction with some out-of-tree change.
FWIW, checking out the fingered commit then..
git log --oneline 52d9d38c183b..e98c58e55f68|grep nouveau and reverting the lot helped not at all.
Checking out 6b7781b42dc9 and reverting the fingered commit did. Given the nouveau bits reverted are mostly the vblank changes, CC to Daniel, maybe he'll know why both GTX 980 and GeForce 8600 GT get all upset.
Either I'm damn lucky, both of my nvidia equipped boxen going boom 100% repeatably, or there are a lot of folks out there who haven't yet tried suspend with our latest/greatest kernel. I suspect the latter.
-Mike
On 7/12/17 7:19 PM, Mike Galbraith wrote:
On Wed, 2017-07-12 at 07:37 -0400, Ilia Mirkin wrote:
On Wed, Jul 12, 2017 at 7:25 AM, Mike Galbraith efault@gmx.de wrote:
On Wed, 2017-07-12 at 11:55 +0200, Mike Galbraith wrote:
On Tue, 2017-07-11 at 14:22 -0400, Ilia Mirkin wrote:
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Bisection seemingly went fine, but the result is odd.
e98c58e55f68f8785aebfab1f8c9a03d8de0afe1 is the first bad commit
But it really really is bad. Looking at gitk fork in the road leading to it...
52d9d38c183b drm/sti:fix spelling mistake: "compoment" -> "component" - good e4e818cc2d7c drm: make drm_panel.h self-contained - good 9cf8f5802f39 drm: add missing declaration to drm_blend.h - good
Before the git highway splits, all is well. The lane with commits works fine at both ends, but e98c58e55f68 is busted. Merge artifact?
Hmmm... that tree does not appear to have gotten a v4.12 backmerge at any point. The last backmerge from Linus as far as I can tell was v4.11-rc7. Could be an interaction with some out-of-tree change.
FWIW, checking out the fingered commit then..
git log --oneline 52d9d38c183b..e98c58e55f68|grep nouveau and reverting the lot helped not at all.
Checking out 6b7781b42dc9 and reverting the fingered commit did. Given the nouveau bits reverted are mostly the vblank changes, CC to Daniel, maybe he'll know why both GTX 980 and GeForce 8600 GT get all upset.
Either I'm damn lucky, both of my nvidia equipped boxen going boom 100% repeatably, or there are a lot of folks out there who haven't yet tried suspend with our latest/greatest kernel. I suspect the latter.
-Mike
I should have had a look at my inbox; it would have saved me a lot of work bisecting. Yet I come to the same conclusion:
# first bad commit: [e98c58e55f68f8785aebfab1f8c9a03d8de0afe1] Merge tag 'drm-misc-next-2017-05-16' of git://anongit.freedesktop.org/git/drm-misc into drm-next
I suspect it is some vblank change as it shows up in every trace I have seen while bisecting, but that is just a wild guess...
Greetings,
Tobias
On Wed, 2017-07-12 at 07:37 -0400, Ilia Mirkin wrote:
On Wed, Jul 12, 2017 at 7:25 AM, Mike Galbraith efault@gmx.de wrote:
On Wed, 2017-07-12 at 11:55 +0200, Mike Galbraith wrote:
On Tue, 2017-07-11 at 14:22 -0400, Ilia Mirkin wrote:
Some display stuff did change for 4.13 for GM20x+ boards. If it's not too much trouble, a bisect would be pretty useful.
Bisection seemingly went fine, but the result is odd.
e98c58e55f68f8785aebfab1f8c9a03d8de0afe1 is the first bad commit
But it really really is bad. Looking at gitk fork in the road leading to it...
52d9d38c183b drm/sti:fix spelling mistake: "compoment" -> "component" - good e4e818cc2d7c drm: make drm_panel.h self-contained - good 9cf8f5802f39 drm: add missing declaration to drm_blend.h - good
Before the git highway splits, all is well. The lane with commits works fine at both ends, but e98c58e55f68 is busted. Merge artifact?
Hmmm... that tree does not appear to have gotten a v4.12 backmerge at any point. The last backmerge from Linus as far as I can tell was v4.11-rc7. Could be an interaction with some out-of-tree change.
Ok, a network outage gave me time to go hunting. Indeed it is a bad interaction with the tree DRM merged into. All DRM did was to slip a WARN_ON_ONCE() that nouveau triggers into a kernel module where such things no longer warn, they blow the box out of the water. I made a dinky testcase module (attached), and bisected to the real root....
19d436268dde95389c616bb3819da73f0a8b28a8 is the first bad commit commit 19d436268dde95389c616bb3819da73f0a8b28a8 Author: Peter Zijlstra peterz@infradead.org Date: Sat Feb 25 08:56:53 2017 +0100
debug: Add _ONCE() logic to report_bug()
Josh suggested moving the _ONCE logic inside the trap handler, using a bit in the bug_entry::flags field, avoiding the need for the extra variable.
Sadly this only works for WARN_ON_ONCE(), since the others have printk() statements prior to triggering the trap.
Still, this saves a fair amount of text and some data:
text data filename 10682460 4530992 defconfig-build/vmlinux.orig 10665111 4530096 defconfig-build/vmlinux.patched
Suggested-by: Josh Poimboeuf jpoimboe@redhat.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Andy Lutomirski luto@kernel.org Cc: Arnd Bergmann arnd@arndb.de Cc: Borislav Petkov bp@alien8.de Cc: Brian Gerst brgerst@gmail.com Cc: Denys Vlasenko dvlasenk@redhat.com Cc: H. Peter Anvin hpa@zytor.com Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Signed-off-by: Ingo Molnar mingo@kernel.org
:040000 040000 9f47f66ec4c234f6ee8e2a09e991c95fe47cf2c1 3e92aa9e77b39ed075ae2c3bdf041d92ef898f62 M arch :040000 040000 34f70b73d40c82533dd7df9b289106be69e2fa8d dd5d7248694a36b3e170f2dca5d9c4121535a990 M include :040000 040000 f6e627b0d378f0a00d2987fdd0c7b215306e6e3c b360d4ee2579744cce530184d7dab13493f73ee0 M lib
On Fri, 2017-07-14 at 15:36 +0200, Mike Galbraith wrote:
All DRM did was to slip a WARN_ON_ONCE() that nouveau triggers into a kernel module where such things no longer warn, they blow the box out of the water.
BTW, turn that irksome WARN_ON_ONCE() in drivers/gpu/drm/drm_vblank.c into a WARN_ONCE(), and all is peachy, you get the warning, box lives.
--- drivers/gpu/drm/drm_vblank.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
--- a/drivers/gpu/drm/drm_vblank.c +++ b/drivers/gpu/drm/drm_vblank.c @@ -605,7 +605,8 @@ bool drm_calc_vbltimestamp_from_scanoutp */ if (mode->crtc_clock == 0) { DRM_DEBUG("crtc %u: Noop due to uninitialized mode.\n", pipe); - WARN_ON_ONCE(drm_drv_uses_atomic_modeset(dev)); + WARN_ONCE(drm_drv_uses_atomic_modeset(dev), "%s: report me.\n", + dev->driver->name);
return false; }
On 7/14/17 3:41 PM, Mike Galbraith wrote:
On Fri, 2017-07-14 at 15:36 +0200, Mike Galbraith wrote:
All DRM did was to slip a WARN_ON_ONCE() that nouveau triggers into a kernel module where such things no longer warn, they blow the box out of the water.
BTW, turn that irksome WARN_ON_ONCE() in drivers/gpu/drm/drm_vblank.c into a WARN_ONCE(), and all is peachy, you get the warning, box lives.
drivers/gpu/drm/drm_vblank.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
--- a/drivers/gpu/drm/drm_vblank.c +++ b/drivers/gpu/drm/drm_vblank.c @@ -605,7 +605,8 @@ bool drm_calc_vbltimestamp_from_scanoutp */ if (mode->crtc_clock == 0) { DRM_DEBUG("crtc %u: Noop due to uninitialized mode.\n", pipe);
WARN_ON_ONCE(drm_drv_uses_atomic_modeset(dev));
WARN_ONCE(drm_drv_uses_atomic_modeset(dev), "%s: report me.\n",
dev->driver->name);
return false; }
Hey,
confirmed this helps saving the box, but we still have to find the root cause! Backtrace with the above fix applied (and the one which came in with the latest drm-fixes merge)!
[1] https://hastebin.com/uyoqifijed.http
Thanks,
Tobias
Yeah, we shouldn't let the machine die. Are there more WARN_ON_ONCE usages we could convert to WARN_ONCE?
Reviewed-By: Karol Herbst karolherbst@gmail.com
On Fri, Jul 14, 2017 at 5:05 PM, Tobias Klausmann tobias.johannes.klausmann@mni.thm.de wrote:
On 7/14/17 3:41 PM, Mike Galbraith wrote:
On Fri, 2017-07-14 at 15:36 +0200, Mike Galbraith wrote:
All DRM did was to slip a WARN_ON_ONCE() that nouveau triggers into a kernel module where such things no longer warn, they blow the box out of the water.
BTW, turn that irksome WARN_ON_ONCE() in drivers/gpu/drm/drm_vblank.c into a WARN_ONCE(), and all is peachy, you get the warning, box lives.
drivers/gpu/drm/drm_vblank.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
--- a/drivers/gpu/drm/drm_vblank.c +++ b/drivers/gpu/drm/drm_vblank.c @@ -605,7 +605,8 @@ bool drm_calc_vbltimestamp_from_scanoutp */ if (mode->crtc_clock == 0) { DRM_DEBUG("crtc %u: Noop due to uninitialized mode.\n", pipe);
WARN_ON_ONCE(drm_drv_uses_atomic_modeset(dev));
WARN_ONCE(drm_drv_uses_atomic_modeset(dev), "%s: report
me.\n",
dev->driver->name); return false; }
Hey,
confirmed this helps saving the box, but we still have to find the root cause! Backtrace with the above fix applied (and the one which came in with the latest drm-fixes merge)!
[1] https://hastebin.com/uyoqifijed.http
Thanks,
Tobias Reviewed-By: Karol Herbst karolherbst@gmail.com _______________________________________________ Nouveau mailing list Nouveau@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/nouveau
On Fri, 2017-07-14 at 17:10 +0200, Karol Herbst wrote:
Yeah, we shouldn't let the machine die. Are there more WARN_ON_ONCE usage we could convert to WARN_ONCE?
Shooting the messenger is generally considered uncool :)
-Mike
On Fri, Jul 14, 2017 at 11:15 AM, Mike Galbraith efault@gmx.de wrote:
On Fri, 2017-07-14 at 17:10 +0200, Karol Herbst wrote:
Yeah, we shouldn't let the machine die. Are there more WARN_ON_ONCE usage we could convert to WARN_ONCE?
Shooting the messenger is generally considered uncool :)
That's never stopped it from being a popular practice...
The conversion is a nice catch, but I'd like to have a bit more context, see below!
With a better description:
Tobias Klausmann tobias.johannes.klausmann@mni.thm.de
On 7/14/17 5:10 PM, Karol Herbst wrote:
Yeah, we shouldn't let the machine die. Are there more WARN_ON_ONCE usage we could convert to WARN_ONCE?
Reviewed-By: Karol Herbst karolherbst@gmail.com
On Fri, Jul 14, 2017 at 5:05 PM, Tobias Klausmann tobias.johannes.klausmann@mni.thm.de wrote:
On 7/14/17 3:41 PM, Mike Galbraith wrote:
On Fri, 2017-07-14 at 15:36 +0200, Mike Galbraith wrote:
All DRM did was to slip a WARN_ON_ONCE() that nouveau triggers into a kernel module where such things no longer warn, they blow the box out of the water.
BTW, turn that irksome WARN_ON_ONCE() in drivers/gpu/drm/drm_vblank.c into a WARN_ONCE(), and all is peachy, you get the warning, box lives.
drivers/gpu/drm/drm_vblank.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
--- a/drivers/gpu/drm/drm_vblank.c +++ b/drivers/gpu/drm/drm_vblank.c @@ -605,7 +605,8 @@ bool drm_calc_vbltimestamp_from_scanoutp */ if (mode->crtc_clock == 0) { DRM_DEBUG("crtc %u: Noop due to uninitialized mode.\n", pipe);
WARN_ON_ONCE(drm_drv_uses_atomic_modeset(dev));
WARN_ONCE(drm_drv_uses_atomic_modeset(dev), "%s: report
me.\n",
"report me" seems a bit odd, maybe just uninitialized mode?
dev->driver->name); return false; }
Hey,
confirmed this helps saving the box, but we still have to find the root cause! Backtrace with the above fix applied (and the one which came in with the latest drm-fixes merge)!
[1] https://hastebin.com/uyoqifijed.http
Thanks,
Tobias Reviewed-By: Karol Herbst karolherbst@gmail.com _______________________________________________ Nouveau mailing list Nouveau@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/nouveau
On Fri, Jul 14, 2017 at 11:19 AM, Tobias Klausmann tobias.johannes.klausmann@mni.thm.de wrote:
The conversion is a nice catch, but i'd like to have a bit more context, see below!
With a better description:
Tobias Klausmann tobias.johannes.klausmann@mni.thm.de
I don't think it was meant as a serious patch. WARN_ON_ONCE should work. The fix isn't to remove all instances of WARN_ON_ONCE. The fix is to fix WARN_ON_ONCE.
On Fri, Jul 14, 2017 at 11:20:01AM -0400, Ilia Mirkin wrote:
On Fri, Jul 14, 2017 at 11:19 AM, Tobias Klausmann tobias.johannes.klausmann@mni.thm.de wrote:
The conversion is a nice catch, but i'd like to have a bit more context, see below!
With a better description:
Tobias Klausmann tobias.johannes.klausmann@mni.thm.de
I don't think it was meant as a serious patch. WARN_ON_ONCE should work. The fix isn't to remove all instances of WARN_ON_ONCE. The fix is to fix WARN_ON_ONCE.
Quite so. Clearly I buggered it for modules; that really wasn't the plan.
On Fri, 2017-07-14 at 17:05 +0200, Tobias Klausmann wrote:
On 7/14/17 3:41 PM, Mike Galbraith wrote:
On Fri, 2017-07-14 at 15:36 +0200, Mike Galbraith wrote:
All DRM did was to slip a WARN_ON_ONCE() that nouveau triggers into a kernel module where such things no longer warn, they blow the box out of the water.
BTW, turn that irksome WARN_ON_ONCE() in drivers/gpu/drm/drm_vblank.c into a WARN_ONCE(), and all is peachy, you get the warning, box lives.
drivers/gpu/drm/drm_vblank.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
--- a/drivers/gpu/drm/drm_vblank.c +++ b/drivers/gpu/drm/drm_vblank.c @@ -605,7 +605,8 @@ bool drm_calc_vbltimestamp_from_scanoutp */ if (mode->crtc_clock == 0) { DRM_DEBUG("crtc %u: Noop due to uninitialized mode.\n", pipe);
WARN_ON_ONCE(drm_drv_uses_atomic_modeset(dev));
WARN_ONCE(drm_drv_uses_atomic_modeset(dev), "%s: report me.\n",
dev->driver->name);
return false; }
Hey,
confirmed this helps saving the box, but we still have to find the root cause! Backtrace with the above fix applied (and the one which came in with the latest drm-fixes merge)!
Yeah, I'll be reporting some extra whining from my 8600 GT backup box.
-Mike
On Fri, Jul 14, 2017 at 03:36:08PM +0200, Mike Galbraith wrote:
Ok, a network outage gave me time to go hunting. Indeed it is a bad interaction with the tree DRM merged into. All DRM did was to slip a WARN_ON_ONCE() that nouveau triggers into a kernel module where such things no longer warn, they blow the box out of the water. I made a dinky testcase module (attached), and bisected to the real root....
19d436268dde95389c616bb3819da73f0a8b28a8 is the first bad commit commit 19d436268dde95389c616bb3819da73f0a8b28a8 Author: Peter Zijlstra peterz@infradead.org Date: Sat Feb 25 08:56:53 2017 +0100
debug: Add _ONCE() logic to report_bug()
Urgh, is for some mysterious reason the __bug_table section of modules ending up in RO memory?
I forever get lost in that link magic :/
On Fri, 2017-07-14 at 17:50 +0200, Peter Zijlstra wrote:
On Fri, Jul 14, 2017 at 03:36:08PM +0200, Mike Galbraith wrote:
Ok, a network outage gave me time to go hunting. Indeed it is a bad interaction with the tree DRM merged into. All DRM did was to slip a WARN_ON_ONCE() that nouveau triggers into a kernel module where such things no longer warn, they blow the box out of the water. I made a dinky testcase module (attached), and bisected to the real root....
19d436268dde95389c616bb3819da73f0a8b28a8 is the first bad commit commit 19d436268dde95389c616bb3819da73f0a8b28a8 Author: Peter Zijlstra peterz@infradead.org Date: Sat Feb 25 08:56:53 2017 +0100
debug: Add _ONCE() logic to report_bug()
Urgh, is for some mysterious reason the __bug_table section of modules ending up in RO memory?
I forever get lost in that link magic :/
+1
drm.ko 20 __bug_table 00000630 0000000000000000 0000000000000000 0004bff3 2**0 CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA vmlinux 15 __bug_table 0000ba84 ffffffff81af26c0 0000000001af26c0 00cf26c0 2**0 CONTENTS, ALLOC, LOAD, READONLY, DATA
Danged if I know... um um RELOC business mucks things up?
-Mike
On Fri, Jul 14, 2017 at 05:58:18PM +0200, Mike Galbraith wrote:
On Fri, 2017-07-14 at 17:50 +0200, Peter Zijlstra wrote:
Urgh, is for some mysterious reason the __bug_table section of modules ending up in RO memory?
I forever get lost in that link magic :/
+1
drm.ko 20 __bug_table 00000630 0000000000000000 0000000000000000 0004bff3 2**0 CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA vmlinux 15 __bug_table 0000ba84 ffffffff81af26c0 0000000001af26c0 00cf26c0 2**0 CONTENTS, ALLOC, LOAD, READONLY, DATA
Danged if I know... um um RELOC business mucks things up?
Argh, it shouldn't be READONLY for vmlinux either, but apparently that is working for mysterious reasons.
Some architectures were in fact complaining that I broke that, and hence patch:
b5effd3815cc ("debug: Fix __bug_table[] in arch linker scripts")
I think we need professional help with this linking stuff, but who to ask?
On Fri, 2017-07-14 at 18:10 +0200, Peter Zijlstra wrote:
On Fri, Jul 14, 2017 at 05:58:18PM +0200, Mike Galbraith wrote:
On Fri, 2017-07-14 at 17:50 +0200, Peter Zijlstra wrote:
Urgh, is for some mysterious reason the __bug_table section of modules ending up in RO memory?
I forever get lost in that link magic :/
+1
drm.ko 20 __bug_table 00000630 0000000000000000 0000000000000000 0004bff3 2**0 CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA vmlinux 15 __bug_table 0000ba84 ffffffff81af26c0 0000000001af26c0 00cf26c0 2**0 CONTENTS, ALLOC, LOAD, READONLY, DATA
Danged if I know... um um RELOC business mucks things up?
Argh, it shouldn't be READONLY for vmlinux either, but apparently that is working for mysterious reasons.
Some architectures were in fact complaining that I broke that, and hence patch:
b5effd3815cc ("debug: Fix __bug_table[] in arch linker scripts")
I think we need professional help with this linking stuff, but who to ask?
Andy Lutomirski?
On Fri, Jul 14, 2017 at 06:33:01PM +0200, Mike Galbraith wrote:
On Fri, 2017-07-14 at 18:10 +0200, Peter Zijlstra wrote:
On Fri, Jul 14, 2017 at 05:58:18PM +0200, Mike Galbraith wrote:
On Fri, 2017-07-14 at 17:50 +0200, Peter Zijlstra wrote:
Urgh, is for some mysterious reason the __bug_table section of modules ending up in RO memory?
I forever get lost in that link magic :/
+1
drm.ko 20 __bug_table 00000630 0000000000000000 0000000000000000 0004bff3 2**0 CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA vmlinux 15 __bug_table 0000ba84 ffffffff81af26c0 0000000001af26c0 00cf26c0 2**0 CONTENTS, ALLOC, LOAD, READONLY, DATA
Danged if I know... um um RELOC business mucks things up?
Argh, it shouldn't be READONLY for vmlinux either, but apparently that is working for mysterious reasons.
Some architectures were in fact complaining that I broke that, and hence patch:
b5effd3815cc ("debug: Fix __bug_table[] in arch linker scripts")
I think we need professional help with this linking stuff, but who to ask?
Andy Lutomirski?
Does this fix it?
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 39e702d..aa6b202 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -35,7 +35,7 @@ #define _BUG_FLAGS(ins, flags) \ do { \ asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"a\"\n" \ + ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ "\t.word %c1" "\t# bug_entry::line\n" \ @@ -52,7 +52,7 @@ do { \ #define _BUG_FLAGS(ins, flags) \ do { \ asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"a\"\n" \ + ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t.word %c0" "\t# bug_entry::flags\n" \ "\t.org 2b+%c1\n" \
On Fri, 2017-07-14 at 14:42 -0500, Josh Poimboeuf wrote:
Does this fix it?
Yup, both READONLY __bug_table and "extra stern" warning are gone.
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 39e702d..aa6b202 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -35,7 +35,7 @@ #define _BUG_FLAGS(ins, flags) \ do { \ asm volatile("1:\t" ins "\n" \
".pushsection __bug_table,\"a\"\n" \
".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ "\t.word %c1" "\t# bug_entry::line\n" \
@@ -52,7 +52,7 @@ do { \ #define _BUG_FLAGS(ins, flags) \ do { \ asm volatile("1:\t" ins "\n" \
".pushsection __bug_table,\"a\"\n" \
".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t.word %c0" "\t# bug_entry::flags\n" \ "\t.org 2b+%c1\n" \
+++ Peter Zijlstra [14/07/17 18:10 +0200]:
On Fri, Jul 14, 2017 at 05:58:18PM +0200, Mike Galbraith wrote:
On Fri, 2017-07-14 at 17:50 +0200, Peter Zijlstra wrote:
Urgh, is for some mysterious reason the __bug_table section of modules ending up in RO memory?
I forever get lost in that link magic :/
+1
drm.ko 20 __bug_table 00000630 0000000000000000 0000000000000000 0004bff3 2**0 CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA vmlinux 15 __bug_table 0000ba84 ffffffff81af26c0 0000000001af26c0 00cf26c0 2**0 CONTENTS, ALLOC, LOAD, READONLY, DATA
Danged if I know... um um RELOC business mucks things up?
Argh, it shouldn't be READONLY for vmlinux either, but apparently that is working for mysterious reasons.
If I'm not mistaken, this works because __bug_table falls outside of the RO range as specified in the vmlinux linker script (using x86_64 as the example, that means _text - __end_rodata_hpage_align). mark_rodata_ro() only sets ro memory protections for pages within this range, so __bug_table remains rw in memory despite its Elf section flags. Interestingly enough, my .rodata section is set 'WA' (rw) in vmlinux on my f25 system, so that leads me to think that Elf section flags in vmlinux don't seem to matter much when it comes to setting ro/nx protections..
However, in the module loader it's a different story; we do set page protections strictly according to the section flags, so since __bug_table only has SHF_ALLOC set, it assumes it's a readonly section and gets treated as such. So I would think that Josh's patch would fix this issue.
Jessica
dri-devel@lists.freedesktop.org