RC6 should always work on IVB, and should work on SNB whenever IO remapping is disabled. Make the default value for the parameter turn it on in these cases. Setting the value to either 0 or 1 will force the specified behavior.
Signed-off-by: Keith Packard keithp@keithp.com --- drivers/gpu/drm/i915/i915_drv.c | 4 ++-- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/intel_display.c | 26 +++++++++++++++++++++++--- 3 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 15bfa91..cf5c1bd 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -63,10 +63,10 @@ module_param_named(semaphores, i915_semaphores, int, 0600); MODULE_PARM_DESC(semaphores, "Use semaphores for inter-ring sync (default: false)");
-unsigned int i915_enable_rc6 __read_mostly = 0; +int i915_enable_rc6 __read_mostly = -1; module_param_named(i915_enable_rc6, i915_enable_rc6, int, 0600); MODULE_PARM_DESC(i915_enable_rc6, - "Enable power-saving render C-state 6 (default: true)"); + "Enable power-saving render C-state 6 (default: -1 (false when VT-d enabled)");
int i915_enable_fbc __read_mostly = -1; module_param_named(i915_enable_fbc, i915_enable_fbc, int, 0600); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 4a9c1b9..172b022 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1002,7 +1002,7 @@ extern unsigned int i915_semaphores __read_mostly; extern unsigned int i915_lvds_downclock __read_mostly; extern int i915_panel_use_ssc __read_mostly; extern int i915_vbt_sdvo_panel_type __read_mostly; -extern unsigned int i915_enable_rc6 __read_mostly; +extern int i915_enable_rc6 __read_mostly; extern int i915_enable_fbc __read_mostly; extern bool i915_enable_hangcheck __read_mostly;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index e77a863..13fd4c0 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -38,8 +38,11 @@ #include "i915_drv.h" #include "i915_trace.h" #include "drm_dp_helper.h" - #include "drm_crtc_helper.h" +#ifdef CONFIG_INTEL_IOMMU +#include <linux/dma_remapping.h> +#include <linux/dmar.h> +#endif
#define HAS_eDP (intel_pipe_has_type(crtc, INTEL_OUTPUT_EDP))
@@ -7887,6 +7890,23 @@ void intel_init_emon(struct drm_device *dev) dev_priv->corr = (lcfuse & LCFUSE_HIV_MASK); }
+static bool intel_enable_rc6(struct drm_device *dev) +{ + if (i915_enable_rc6 >= 0) + return i915_enable_rc6; + if (INTEL_INFO(dev)->gen >= 7) + return 1; +#ifdef CONFIG_INTEL_IOMMU + /* + * Only enable RC6 if all dma remapping is disabled, or if + * there's no iommu present in the machine. + */ + if (INTEL_INFO(dev)->gen == 6) + return no_iommu || dmar_disabled; +#endif + return 0; +} + void gen6_enable_rps(struct drm_i915_private *dev_priv) { u32 rp_state_cap = I915_READ(GEN6_RP_STATE_CAP); @@ -7923,7 +7943,7 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_RC6p_THRESHOLD, 100000); I915_WRITE(GEN6_RC6pp_THRESHOLD, 64000); /* unused */
- if (i915_enable_rc6) + if (intel_enable_rc6(dev_priv->dev)) rc6_mask = GEN6_RC_CTL_RC6p_ENABLE | GEN6_RC_CTL_RC6_ENABLE;
@@ -8372,7 +8392,7 @@ void ironlake_enable_rc6(struct drm_device *dev) /* rc6 disabled by default due to repeated reports of hanging during * boot and resume. */ - if (!i915_enable_rc6) + if (!intel_enable_rc6(dev)) return;
mutex_lock(&dev->struct_mutex);
On 11/18/2011 10:41 PM, Keith Packard wrote:
RC6 should always work on IVB, and should work on SNB whenever IO remapping is disabled. Make the default value for the parameter turn it on in these cases. Setting the value to either 0 or 1 will force the specified behavior.
Signed-off-by: Keith Packard keithp@keithp.com
drivers/gpu/drm/i915/i915_drv.c | 4 ++-- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/intel_display.c | 26 +++++++++++++++++++++++--- 3 files changed, 26 insertions(+), 6 deletions(-)
Reviewed-by: Kenneth Graunke kenneth@whitecape.org
On Sat, Nov 19, 2011 at 04:41, Keith Packard keithp@keithp.com wrote:
RC6 should always work on IVB, and should work on SNB whenever IO remapping is disabled. Make the default value for the parameter turn it on in these cases. Setting the value to either 0 or 1 will force the specified behavior.
Signed-off-by: Keith Packard keithp@keithp.com
Reviewed-by: Eugeni Dodonov eugeni.dodonov@intel.com
On Sat, Nov 19, 2011 at 04:41, Keith Packard keithp@keithp.com wrote:
+static bool intel_enable_rc6(struct drm_device *dev) +{
- if (i915_enable_rc6 >= 0)
- return i915_enable_rc6;
- if (INTEL_INFO(dev)->gen >= 7)
- return 1;
+#ifdef CONFIG_INTEL_IOMMU
- /*
- * Only enable RC6 if all dma remapping is disabled, or if
- * there's no iommu present in the machine.
- */
- if (INTEL_INFO(dev)->gen == 6)
- return no_iommu || dmar_disabled;
+#endif
- return 0;
+}
Just one question I caught on 2nd read. Shouldn't we have #else within this #ifdef block, to return 1? Otherwise, if CONFIG_INTEL_IOMMU is not defined, we'll always disable rc6.
Or we just always intend to disable rc6 in case CONFIG_INTEL_IOMMU is not there?
-- Eugeni Dodonov
On Sat, 19 Nov 2011 07:25:09 -0200, Eugeni Dodonov eugeni@dodonov.net wrote:
Just one question I caught on 2nd read. Shouldn't we have #else within this #ifdef block, to return 1? Otherwise, if CONFIG_INTEL_IOMMU is not defined, we'll always disable rc6.
Oops! Thanks for catching this. Here's a new version of that function (the rest of the patch is the same). This one has explicit conditions for Ironlake and Sandybridge (when CONFIG_INTEL_IOMMU is set), allowing the Ivybridge and Sandybridge-without-IOMMU cases to take the default path. This will also cause all future chips to enable rc6 by default.
+static bool intel_enable_rc6(struct drm_device *dev) +{ + /* + * Respect the kernel parameter if it is set + */ + if (i915_enable_rc6 >= 0) + return i915_enable_rc6; + + /* + * Disable RC6 on Ironlake + */ + if (INTEL_INFO(dev)->gen == 5) + return 0; + +#ifdef CONFIG_INTEL_IOMMU + /* + * Enable rc6 on Sandybridge if DMA remapping is disabled + */ + if (INTEL_INFO(dev)->gen == 6) + return no_iommu || dmar_disabled; +#endif + return 1; +} +
On Sat, Nov 19, 2011 at 16:32, Keith Packard keithp@keithp.com wrote:
On Sat, 19 Nov 2011 07:25:09 -0200, Eugeni Dodonov eugeni@dodonov.net wrote:
Just one question I caught on 2nd read. Shouldn't we have #else within this #ifdef block, to return 1? Otherwise, if CONFIG_INTEL_IOMMU is not defined, we'll always disable rc6.
Oops! Thanks for catching this. Here's a new version of that function (the rest of the patch is the same). This one has explicit conditions for Ironlake and Sandybridge (when CONFIG_INTEL_IOMMU is set), allowing the Ivybridge and Sandybridge-without-IOMMU cases to take the default path. This will also cause all future chips to enable rc6 by default.
Great, I think it catches all cases now.
Reviewed-by: Eugeni Dodonov eugeni.dodonov@intel.com
Thanks!
On Fri, Nov 18, 2011 at 10:41:29PM -0800, Keith Packard wrote:
- /*
* Only enable RC6 if all dma remapping is disabled, or if
* there's no iommu present in the machine.
*/
- if (INTEL_INFO(dev)->gen == 6)
return no_iommu || dmar_disabled;
So the user has to choose between 5W of power saving or having dmar? And we default to giving them dmar? I think that's going to come as a surprise to people.
On Tue, Nov 22, 2011 at 18:15, Matthew Garrett mjg@redhat.com wrote:
On Fri, Nov 18, 2011 at 10:41:29PM -0800, Keith Packard wrote:
/*
* Only enable RC6 if all dma remapping is disabled, or if
* there's no iommu present in the machine.
*/
if (INTEL_INFO(dev)->gen == 6)
return no_iommu || dmar_disabled;
So the user has to choose between 5W of power saving or having dmar? And we default to giving them dmar?
From the latest investigations, it is either this, or random gpu hangs and
crashes when both are enabled :(.
When the root cause will be discovered, we should allow both of course. But so far, all the attempts on this weren't successful.
On Tue, Nov 22, 2011 at 06:46:21PM -0200, Eugeni Dodonov wrote:
On Tue, Nov 22, 2011 at 18:15, Matthew Garrett mjg@redhat.com wrote:
On Fri, Nov 18, 2011 at 10:41:29PM -0800, Keith Packard wrote:
/*
* Only enable RC6 if all dma remapping is disabled, or if
* there's no iommu present in the machine.
*/
if (INTEL_INFO(dev)->gen == 6)
return no_iommu || dmar_disabled;
So the user has to choose between 5W of power saving or having dmar? And we default to giving them dmar?
From the latest investigations, it is either this, or random gpu hangs and crashes when both are enabled :(.
So blacklist dmar on sandybridge. The power saving is going to be more important for most users.
On Tue, 22 Nov 2011 20:15:31 +0000, Matthew Garrett mjg@redhat.com wrote:
So the user has to choose between 5W of power saving or having dmar? And we default to giving them dmar? I think that's going to come as a surprise to people.
You'd have to go into the BIOS to turn this on for most machines at least?
But, yeah, it seems like we should be turning DMAR off unless explicitly requested; I can't understand how you'd ever need this running native on the hardware. Not exactly an area I care about deeply; I've always worked hard to make sure all virtualization garbage is disabled on every machine I use.
On Tue, Nov 22, 2011 at 07:31:34PM -0800, Keith Packard wrote:
On Tue, 22 Nov 2011 20:15:31 +0000, Matthew Garrett mjg@redhat.com wrote:
So the user has to choose between 5W of power saving or having dmar? And we default to giving them dmar? I think that's going to come as a surprise to people.
You'd have to go into the BIOS to turn this on for most machines at least?
But, yeah, it seems like we should be turning DMAR off unless explicitly requested; I can't understand how you'd ever need this running native on the hardware. Not exactly an area I care about deeply; I've always worked hard to make sure all virtualization garbage is disabled on every machine I use.
Problem is that we need to disable dmar on the entire box, afaics. And I assume that a bunch of people abusing desktop boards as servers will call "regression" on that. -Daniel
On Wed, 2011-11-23 at 11:26 +0100, Daniel Vetter wrote:
On Tue, Nov 22, 2011 at 07:31:34PM -0800, Keith Packard wrote:
On Tue, 22 Nov 2011 20:15:31 +0000, Matthew Garrett mjg@redhat.com wrote:
So the user has to choose between 5W of power saving or having dmar? And we default to giving them dmar? I think that's going to come as a surprise to people.
You'd have to go into the BIOS to turn this on for most machines at least?
But, yeah, it seems like we should be turning DMAR off unless explicitly requested; I can't understand how you'd ever need this running native on the hardware. Not exactly an area I care about deeply; I've always worked hard to make sure all virtualization garbage is disabled on every machine I use.
Problem is that we need to disable dmar on the entire box, afaics. And I assume that a bunch of people abusing desktop boards as servers will call "regression" on that.
Hm, do you really have to disable it for the entire box, or just the graphics?
Do we have a coherent erratum from Intel for the issues mentioned above with DMAR+gfx+RC6?
Keith, do you know if a sighting has been filed and the hardware folks are working on it?
Rajesh, are you familiar with this issue?
On Wed, Nov 23, 2011 at 02:01:54PM +0000, David Woodhouse wrote:
On Wed, 2011-11-23 at 11:26 +0100, Daniel Vetter wrote:
On Tue, Nov 22, 2011 at 07:31:34PM -0800, Keith Packard wrote:
On Tue, 22 Nov 2011 20:15:31 +0000, Matthew Garrett mjg@redhat.com wrote:
So the user has to choose between 5W of power saving or having dmar? And we default to giving them dmar? I think that's going to come as a surprise to people.
You'd have to go into the BIOS to turn this on for most machines at least?
But, yeah, it seems like we should be turning DMAR off unless explicitly requested; I can't understand how you'd ever need this running native on the hardware. Not exactly an area I care about deeply; I've always worked hard to make sure all virtualization garbage is disabled on every machine I use.
Problem is that we need to disable dmar on the entire box, afaics. And I assume that a bunch of people abusing desktop boards as servers will call "regression" on that.
Hm, do you really have to disable it for the entire box, or just the graphics?
At least for the dmar+gfx+semaphores hang I can reproduce, just disabling dmar with intel_iommu=igfx_off is not good enough and iirc the same holds for the dmar+rc6 hangs reported.
Do we have a coherent erratum from Intel for the issues mentioned above with DMAR+gfx+RC6?
Afaik no errata applies to our dmar related troubles on snb. I've hoped that ppgtt would magically fix this, and it seems to help quite a bit for the semaphore hangs (but not everywhere). Couldn't yet look more into this. -Daniel
On Wed, 2011-11-23 at 15:39 +0100, Daniel Vetter wrote:
At least for the dmar+gfx+semaphores hang I can reproduce, just disabling dmar with intel_iommu=igfx_off is not good enough and iirc the same holds for the dmar+rc6 hangs reported.
Um... let me restate that for clarity (and partly for Rajesh's benefit).
The DMAR associated with the integrated graphics is *disabled*. Turned off. Not active. Ever.
You have a problem when you enable the *other* DMAR units in the system, which should not be affecting the graphics device in any way.
When you do this, you see 'hangs' with semaphores and RC6. Is there a better description of these 'hangs' somewhere? Is the hardware completely locked?
These hangs go away when you disable the DMAR units. Again, that is the *other* DMAR units in the system that have nothing to do with graphics.
While I'm getting quite used to DMAR-related errata, this one does make me stop and think 'wtf?'. It just seems so incongruous that disabling an *unrelated* IOMMU would make the problem go away, and it makes me wonder if it's actually a timing-related issue which is always there, but something about the use of DMAR for network/disk/etc. makes it more likely to trigger?
We definitely need the hardware folks to get to the bottom of this one.
On Wed, Nov 23, 2011 at 03:03:43PM +0000, David Woodhouse wrote:
On Wed, 2011-11-23 at 15:39 +0100, Daniel Vetter wrote:
At least for the dmar+gfx+semaphores hang I can reproduce, just disabling dmar with intel_iommu=igfx_off is not good enough and iirc the same holds for the dmar+rc6 hangs reported.
Um... let me restate that for clarity (and partly for Rajesh's benefit).
The DMAR associated with the integrated graphics is *disabled*. Turned off. Not active. Ever.
You have a problem when you enable the *other* DMAR units in the system, which should not be affecting the graphics device in any way.
When you do this, you see 'hangs' with semaphores and RC6. Is there a better description of these 'hangs' somewhere? Is the hardware completely locked?
These hangs go away when you disable the DMAR units. Again, that is the *other* DMAR units in the system that have nothing to do with graphics.
While I'm getting quite used to DMAR-related errata, this one does make me stop and think 'wtf?'. It just seems so incongruous that disabling an *unrelated* IOMMU would make the problem go away, and it makes me wonder if it's actually a timing-related issue which is always there, but something about the use of DMAR for network/disk/etc. makes it more likely to trigger?
We definitely need the hardware folks to get to the bottom of this one.
Ok, let me document the recipe I use to hang my box here. It's about the dmar+semaphores hang I can reproduce, so might be slightly different in the actual cause than the dmar+rc6 bug (for that one we only have bug reports talking about hard freezing requiring power cycling).
- Grab a GT2+ mobile snb (both my and the only other reporters machine fits this, so maybe it matters). pci rev 09 (i.e. first production silicon). - Install fc15 with the kde4 spin. I can't reproduce it with any other userspace than kde4. - Grab latest d-i-f from Keith and latest userspace graphics code (to avoid hitting any other snb hangs we've tracked down meanwhile). - Compile kernel with dmar and enable VT-d in the bios. - Login into the systems with gdm, the machine usually dies within a few seconds (while kde4 loads). If that's not good enough, a few minutes of light desktop usage will kill it. - Wait 2 minutes for the stuck-in-atomic detection logic to kick in and grab the backtrace over netconsole. Notice that the kernel is stuck trying to flush the dmar tlb cache (that's how I managed to track it down to a dmar interaction). Backtrace almost identical to the dmar issue on ilk. I've lost the backtrace, if you want I can regrab it.
Things I've tried that don't work around the issue: - Disable dmar for the igfx with intel_iommu=igfx_off - Apply the ilk workaround (i.e. synchronous dmar tlb flushes + gpu idling while flushing).
Things that work: - Disabling semaphores. - Disabling dmar in either the bios or on the cmdline with intel_iommu=off
All reporters that tried confirmed that igfx_off is not good enough, only fully disabling dmar (for both the semaphores and the rc6 related hangs).
Things that look interesting: - ppgtt support (i.e. using per-proces pagetables on the gfx instead of the global gtt) seems to paper over the issue for the original reporter of the semaphore related hangs. Unfortunately not for me, gpu still hangs (but doesn't take down the entire system with it). I've not yet investigated this one closely. Fyi, the windows driver uses ppgtt unconditionally on snb. Also, ppgtt seems to have no effect for at least one report of dmar related rc6 hangs.
Cheers, Daniel
On Wed, 2011-11-23 at 16:31 +0100, Daniel Vetter wrote:
Things I've tried that don't work around the issue:
- Disable dmar for the igfx with intel_iommu=igfx_off
- Apply the ilk workaround (i.e. synchronous dmar tlb flushes + gpu
idling while flushing).
Well, the ILK workaround was only ensuring that the gfx was idle while the *graphics* IOTLB was flushed. We are still flushing the IOTLB for *other* IOMMUs at arbitrary times.
Might be interesting to go for *deferred* IOTLB flushes (i.e. the normal mode without the workaround or 'intel_iommu=strict' on the command line), *BUT* to make sure the batched flush (the flush_unmaps() call) can *only* happen when the GPU is quiescent. You could do that with a dirty hack for testing, and if *that* fixes the issue.... well, $DEITY knows, someone still needs a good kicking, but at least we'll have learned something :)
On Wed, Nov 23, 2011 at 04:31:54PM +0100, Daniel Vetter wrote:
- Wait 2 minutes for the stuck-in-atomic detection logic to kick in and grab the backtrace over netconsole. Notice that the kernel is stuck trying to flush the dmar tlb cache (that's how I managed to track it down to a dmar interaction). Backtrace almost identical to the dmar issue on ilk. I've lost the backtrace, if you want I can regrab it.
Ok, I've recaptured the last words from my dying machine:
Listening for netconsole messages [ 136.897673] ------------[ cut here ]------------ [ 136.897694] WARNING: at kernel/watchdog.c:241 watchdog_overflow_callback+0x9b/0xa6() [ 136.897701] Hardware name: HP EliteBook 8460p [ 136.897707] Watchdog detected hard LOCKUP on cpu 0 [ 136.897713] Modules linked in: sunrpc 8021q garp stp llc cpufreq_ondemand acpi_cpufreq freq_table mperf snd_hda_codec_hdmi snd_hda_codec_idt arc4 iwlwifi mac80211 hp_wmi sparse_keymap ppdev uvcvideo videodev v4l2_compat_ioctl32 btusb microcode snd_hda_intel snd_hda_codec snd_hwdep snd_seq bluetooth snd_seq_device iTCO_wdt snd_pcm snd_timer snd cfg80211 serio_raw iTCO_vendor_support joydev xhci_hcd rfkill e1000e soundcore snd_page_alloc parport_pc parport tpm_infineon wmi intel_ips ipv6 firewire_ohci firewire_core crc_itu_t sdhci_pci sdhci mmc_core i915 drm_kms_helper drm i2c_algo_bit i2c_core video [last unloaded: scsi_wait_scan] [ 136.897967] Pid: 0, comm: swapper Not tainted 3.2.0-rc2+ #162 [ 136.897972] Call Trace: [ 136.897978] <NMI> [<ffffffff8105679a>] warn_slowpath_common+0x83/0x9b [ 136.897998] [<ffffffff81056855>] warn_slowpath_fmt+0x46/0x48 [ 136.898007] [<ffffffff810152b1>] ? native_sched_clock+0x34/0x36 [ 136.898016] [<ffffffff810ad68b>] watchdog_overflow_callback+0x9b/0xa6 [ 136.898026] [<ffffffff810d78c3>] __perf_event_overflow+0x100/0x17f [ 136.898035] [<ffffffff810d5f53>] ? perf_event_update_userpage+0xf/0xa2 [ 136.898045] [<ffffffff8101be7b>] ? x86_perf_event_set_period+0x107/0x113 [ 136.898053] [<ffffffff810d7efc>] perf_event_overflow+0x14/0x16 [ 136.898062] [<ffffffff8101f4bc>] intel_pmu_handle_irq+0x211/0x271 [ 136.898073] [<ffffffff81476b65>] perf_event_nmi_handler+0x19/0x1b [ 136.898082] [<ffffffff814764f7>] nmi_handle+0x42/0x67 [ 136.898091] [<ffffffff814765a8>] do_nmi+0x8c/0x26b [ 136.898099] [<ffffffff81475db0>] nmi+0x20/0x30 [ 136.898109] [<ffffffff81083ccc>] ? do_raw_spin_lock+0x1/0x25 [ 136.898115] <<EOE>> <IRQ> [<ffffffff8147547e>] ? _raw_spin_lock+0xe/0x10 [ 136.898135] [<ffffffff813b712b>] qi_submit_sync+0x30d/0x312 [ 136.898143] [<ffffffff813b7222>] qi_flush_iotlb+0x7a/0x7c [ 136.898152] [<ffffffff813b918f>] flush_unmaps+0x72/0x131 [ 136.898161] [<ffffffff813b926d>] flush_unmaps_timeout+0x1f/0x32 [ 136.898169] [<ffffffff81062d9d>] run_timer_softirq+0x19b/0x280 [ 136.898177] [<ffffffff81014e05>] ? paravirt_read_tsc+0x9/0xd [ 136.898186] [<ffffffff813b924e>] ? flush_unmaps+0x131/0x131 [ 136.898195] [<ffffffff8105c477>] __do_softirq+0xc9/0x1b5 [ 136.898203] [<ffffffff81014e05>] ? paravirt_read_tsc+0x9/0xd [ 136.898212] [<ffffffff8147de6c>] call_softirq+0x1c/0x30 [ 136.898222] [<ffffffff81010add>] do_softirq+0x46/0x81 [ 136.898230] [<ffffffff8105c73f>] irq_exit+0x57/0xb1 [ 136.898238] [<ffffffff8147e7e1>] smp_apic_timer_interrupt+0x7c/0x8a [ 136.898251] [<ffffffff8147c6de>] apic_timer_interrupt+0x6e/0x80 [ 136.898256] <EOI> [<ffffffff81014e05>] ? paravirt_read_tsc+0x9/0xd [ 136.898271] [<ffffffff812766b3>] ? intel_idle+0xef/0x120 [ 136.898279] [<ffffffff81276695>] ? intel_idle+0xd1/0x120 [ 136.898289] [<ffffffff8139f10b>] cpuidle_idle_call+0xe2/0x181 [ 136.898297] [<ffffffff8100e2ed>] cpu_idle+0xa9/0xf0 [ 136.898306] [<ffffffff81456a1e>] rest_init+0x72/0x74 [ 136.898316] [<ffffffff81aceb71>] start_kernel+0x3b0/0x3bd [ 136.898324] [<ffffffff81ace2c4>] x86_64_start_reservations+0xaf/0xb3 [ 136.898332] [<ffffffff81ace140>] ? early_idt_handlers+0x140/0x140 [ 136.898340] [<ffffffff81ace3ca>] x86_64_start_kernel+0x102/0x111 [ 136.898348] ---[ end trace 2d22d2d9c3bfe5c8 ]--- [ 161.821354] ------------[ cut here ]------------ [ 161.821365] WARNING: at kernel/watchdog.c:241 watchdog_overflow_callback+0x9b/0xa6() [ 161.821370] Hardware name: HP EliteBook 8460p [ 161.821376] Watchdog detected hard LOCKUP on cpu 1 [ 161.821381] Modules linked in: sunrpc 8021q garp stp llc cpufreq_ondemand acpi_cpufreq freq_table mperf snd_hda_codec_hdmi snd_hda_codec_idt arc4 iwlwifi mac80211 hp_wmi sparse_keymap ppdev uvcvideo videodev v4l2_compat_ioctl32 btusb microcode snd_hda_intel snd_hda_codec snd_hwdep snd_seq bluetooth snd_seq_device iTCO_wdt snd_pcm snd_timer snd cfg80211 serio_raw iTCO_vendor_support joydev xhci_hcd rfkill e1000e soundcore snd_page_alloc parport_pc parport tpm_infineon wmi intel_ips ipv6 firewire_ohci firewire_core crc_itu_t sdhci_pci sdhci mmc_core i915 drm_kms_helper drm i2c_algo_bit i2c_core video [last unloaded: scsi_wait_scan] [ 161.821609] Pid: 1134, comm: X Tainted: G W 3.2.0-rc2+ #162 [ 161.821615] Call Trace: [ 161.821619] <NMI> [<ffffffff8105679a>] warn_slowpath_common+0x83/0x9b [ 161.821633] [<ffffffff81056855>] warn_slowpath_fmt+0x46/0x48 [ 161.821640] [<ffffffff810152b1>] ? native_sched_clock+0x34/0x36 [ 161.821648] [<ffffffff810ad68b>] watchdog_overflow_callback+0x9b/0xa6 [ 161.821655] [<ffffffff810d78c3>] __perf_event_overflow+0x100/0x17f [ 161.821663] [<ffffffff810d5f53>] ? perf_event_update_userpage+0xf/0xa2 [ 161.821669] [<ffffffff8101be7b>] ? x86_perf_event_set_period+0x107/0x113 [ 161.821677] [<ffffffff810d7efc>] perf_event_overflow+0x14/0x16 [ 161.821684] [<ffffffff8101f4bc>] intel_pmu_handle_irq+0x211/0x271 [ 161.821692] [<ffffffff81476b65>] perf_event_nmi_handler+0x19/0x1b [ 161.821700] [<ffffffff814764f7>] nmi_handle+0x42/0x67 [ 161.821708] [<ffffffff814765a8>] do_nmi+0x8c/0x26b [ 161.821715] [<ffffffff81475db0>] nmi+0x20/0x30 [ 161.821723] [<ffffffff814754aa>] ? _raw_spin_lock_irqsave+0x2a/0x2f [ 161.821728] <<EOE>> [<ffffffff813b988f>] add_unmap+0x21/0xb8 [ 161.821744] [<ffffffff813bad66>] intel_unmap_sg+0x101/0x110 [ 161.821753] [<ffffffff811341bc>] ? __pollwait+0xcc/0xcc [ 161.821761] [<ffffffff812df78f>] intel_gtt_unmap_memory+0x50/0x70 [ 161.821784] [<ffffffffa007dad1>] i915_gem_gtt_unbind_object+0x9c/0xc7 [i915] [ 161.821805] [<ffffffffa0079377>] i915_gem_object_unbind+0x111/0x1cb [i915] [ 161.821822] [<ffffffffa0079453>] i915_gem_free_object_tail+0x22/0xd3 [i915] [ 161.821839] [<ffffffffa007b58c>] i915_gem_free_object+0x46/0x4b [i915] [ 161.821856] [<ffffffffa001ffa1>] ? drm_gem_handle_create+0xcb/0xcb [drm] [ 161.821870] [<ffffffffa001ffcc>] drm_gem_object_free+0x2b/0x2d [drm] [ 161.821877] [<ffffffff8123057b>] kref_put+0x43/0x4d [ 161.821890] [<ffffffffa001fca5>] drm_gem_object_unreference_unlocked+0x33/0x40 [drm] [ 161.821904] [<ffffffffa001fdf8>] drm_gem_object_handle_unreference_unlocked.part.1+0x27/0x2c [drm] [ 161.821918] [<ffffffffa001fec8>] drm_gem_handle_delete+0x84/0x92 [drm] [ 161.821933] [<ffffffffa0020305>] drm_gem_close_ioctl+0x28/0x2a [drm] [ 161.821946] [<ffffffffa001e7ae>] drm_ioctl+0x2c8/0x3a5 [drm] [ 161.821958] [<ffffffffa00202dd>] ? drm_gem_destroy+0x43/0x43 [drm] [ 161.821966] [<ffffffff810749a6>] ? __hrtimer_start_range_ns+0x2cd/0x2ed [ 161.821974] [<ffffffff811337f4>] do_vfs_ioctl+0x45d/0x49e [ 161.821982] [<ffffffff81124f86>] ? fsnotify_access+0x5f/0x67 [ 161.821988] [<ffffffff8113388b>] sys_ioctl+0x56/0x7b [ 161.821995] [<ffffffff8147bc02>] system_call_fastpath+0x16/0x1b [ 161.822002] ---[ end trace 2d22d2d9c3bfe5c9 ]---
On Wed, Nov 23, 2011 at 03:03:43PM +0000, David Woodhouse wrote:
On Wed, 2011-11-23 at 15:39 +0100, Daniel Vetter wrote:
At least for the dmar+gfx+semaphores hang I can reproduce, just disabling dmar with intel_iommu=igfx_off is not good enough and iirc the same holds for the dmar+rc6 hangs reported.
Um... let me restate that for clarity (and partly for Rajesh's benefit).
The DMAR associated with the integrated graphics is *disabled*. Turned off. Not active. Ever.
Hm, that comment confuses me a bit. I've always thought that igfx_off only instantiates a identity mapping and leaves the dmar unit on. Is that wrong? -Daniel
On Wed, 2011-11-23 at 16:41 +0100, Daniel Vetter wrote:
Hm, that comment confuses me a bit. I've always thought that igfx_off only instantiates a identity mapping and leaves the dmar unit on. Is that wrong?
It's completely off. If a DMAR unit has *only* graphics devices behind it (as the one for the built-in graphics does, because it's a whole boatload of speshul case for integration with the GTT), we just don't enable it at all. See the second for_each_drhd_unit() loop in init_no_remapping_devices().
On Wed, Nov 23, 2011 at 03:43:05PM +0000, David Woodhouse wrote:
On Wed, 2011-11-23 at 16:41 +0100, Daniel Vetter wrote:
Hm, that comment confuses me a bit. I've always thought that igfx_off only instantiates a identity mapping and leaves the dmar unit on. Is that wrong?
It's completely off. If a DMAR unit has *only* graphics devices behind it (as the one for the built-in graphics does, because it's a whole boatload of speshul case for integration with the GTT), we just don't enable it at all. See the second for_each_drhd_unit() loop in init_no_remapping_devices().
That explanation confused me even more. So I've rechecked with intel_iommu=igfx_off and already thought I've made a decent fool of myself because I couldn't readily hang it. Turns out it's just much harder to hang with that. So I think moving around the tlb flushing for other dmar nodes to align with the idled igfx isn't a great solution, simply because I can't reliably tell whether it fixes anything. -Daniel
Those are needed by the rc6 and semaphores support in i915 driver.
In i915 driver, we do not enable either rc6 or semaphores on SNB when dmar is enabled. So we use those variables to check the io remapping and iommu status.
CC: Keith Packard keithp@keithp.com CC: Daniel Vetter daniel.vetter@ffwll.ch Signed-off-by: Eugeni Dodonov eugeni.dodonov@intel.com --- arch/x86/kernel/pci-dma.c | 1 + drivers/iommu/intel-iommu.c | 1 + 2 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 80dc793..4c9191e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -33,6 +33,7 @@ int force_iommu __read_mostly = 0; int iommu_merge __read_mostly = 0;
int no_iommu __read_mostly; +EXPORT_SYMBOL_GPL(no_iommu); /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0;
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index c0c7820..dfe5fd3 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -404,6 +404,7 @@ int dmar_disabled = 0; #else int dmar_disabled = 1; #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/ +EXPORT_SYMBOL_GPL(dmar_disabled);
static int dmar_map_gfx = 1; static int dmar_forcedac;
On Wed, 2011-11-23 at 16:42 -0200, Eugeni Dodonov wrote:
Those are needed by the rc6 and semaphores support in i915 driver.
In i915 driver, we do not enable either rc6 or semaphores on SNB when dmar is enabled. So we use those variables to check the io remapping and iommu status.
This is horrid. In the general case, drivers have no business knowing this. We need to properly identify what is wrong with this hardware, and put in a quirk for it — perhaps refusing to enable the IOMMU at all on the broken chipsets.
On Wed, 23 Nov 2011 20:36:00 +0000, David Woodhouse dwmw2@infradead.org wrote:
This is horrid. In the general case, drivers have no business knowing this. We need to properly identify what is wrong with this hardware, and put in a quirk for it — perhaps refusing to enable the IOMMU at all on the broken chipsets.
That'd be fine with me, right now we're disabling RC6 and semaphores -- semaphores aren't all that important, although they do improve performance a bit. RC6 is important, saving a huge amount of power.
I'd also be OK with requiring special options to enable DMAR and having that also disable RC6/semaphores, if you'd rather expose that. In either case, we need something that works and avoids hanging machines.
Ok, here's a "final" patch set to enable RC6 where possible on SNB and IVB machines.
The first patch creates a new variable, intel_iommu_enabled, that is exported by the intel iommu code and set when that code has successfully initialized itself. The old plan of using no_iommu || dmar_disabled would work -- those variables are set only by kernel parameters and don't reflect what the system is actually doing about virtualization.
The second patch uses that value on SNB to tell whether RC6 can be enabled by default. On IVB, RC6 is always enabled.
Of course, in all cases, you can override the RC6 setting with the i915 module parameter.
For those of you who have experienced the delights of RC6 crashing your machines, please test as this will be heading to 3.2 unless you find something wrong with it.
-keith
From: Eugeni Dodonov eugeni.dodonov@intel.com
In i915 driver, we do not enable either rc6 or semaphores on SNB when dmar is enabled. The new 'intel_iommu_enabled' variable signals when the iommu code is in operation.
Cc: Ted Phelps phelps@gnusto.com Cc: Peter pab1612@gmail.com Cc: Lukas Hejtmanek xhejtman@fi.muni.cz Cc: Andrew Lutomirski luto@mit.edu CC: Daniel Vetter daniel.vetter@ffwll.ch Cc: Eugeni Dodonov eugeni.dodonov@intel.com Signed-off-by: Keith Packard keithp@keithp.com --- drivers/iommu/intel-iommu.c | 5 +++++ include/linux/dma_remapping.h | 2 ++ 2 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index c0c7820..8dc19b8 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -405,6 +405,9 @@ int dmar_disabled = 0; int dmar_disabled = 1; #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
+int intel_iommu_enabled = 0; +EXPORT_SYMBOL_GPL(intel_iommu_enabled); + static int dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; @@ -3647,6 +3650,8 @@ int __init intel_iommu_init(void)
bus_register_notifier(&pci_bus_type, &device_nb);
+ intel_iommu_enabled = 1; + return 0; }
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index ef90cbd..57c9a8a 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -31,6 +31,7 @@ extern void free_dmar_iommu(struct intel_iommu *iommu); extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; +extern int intel_iommu_enabled; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) { @@ -44,6 +45,7 @@ static inline void free_dmar_iommu(struct intel_iommu *iommu) { } #define dmar_disabled (1) +#define intel_iommu_enabled (0) #endif
RC6 should always work on IVB, and should work on SNB whenever IO remapping is disabled. RC6 never works on Ironlake. Make the default value for the parameter follow these guidelines. Setting the value to either 0 or 1 will force the specified behavior.
Signed-off-by: Keith Packard keithp@keithp.com Reviewed-by: Kenneth Graunke kenneth@whitecape.org Reviewed-by: Eugeni Dodonov eugeni.dodonov@intel.com Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=38567 Cc: Ted Phelps phelps@gnusto.com Cc: Peter pab1612@gmail.com Cc: Lukas Hejtmanek xhejtman@fi.muni.cz Cc: Andrew Lutomirski luto@mit.edu --- drivers/gpu/drm/i915/i915_drv.c | 4 ++-- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/intel_display.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 28836fe..47a42eb 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -63,10 +63,10 @@ module_param_named(semaphores, i915_semaphores, int, 0600); MODULE_PARM_DESC(semaphores, "Use semaphores for inter-ring sync (default: false)");
-unsigned int i915_enable_rc6 __read_mostly = 0; +int i915_enable_rc6 __read_mostly = -1; module_param_named(i915_enable_rc6, i915_enable_rc6, int, 0600); MODULE_PARM_DESC(i915_enable_rc6, - "Enable power-saving render C-state 6 (default: true)"); + "Enable power-saving render C-state 6 (default: -1 (use per-chip default)");
int i915_enable_fbc __read_mostly = -1; module_param_named(i915_enable_fbc, i915_enable_fbc, int, 0600); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 1421118..bb82ef8 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1008,7 +1008,7 @@ extern unsigned int i915_semaphores __read_mostly; extern unsigned int i915_lvds_downclock __read_mostly; extern int i915_panel_use_ssc __read_mostly; extern int i915_vbt_sdvo_panel_type __read_mostly; -extern unsigned int i915_enable_rc6 __read_mostly; +extern int i915_enable_rc6 __read_mostly; extern int i915_enable_fbc __read_mostly; extern bool i915_enable_hangcheck __read_mostly;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index a03ebf6..3097104 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -38,8 +38,8 @@ #include "i915_drv.h" #include "i915_trace.h" #include "drm_dp_helper.h" - #include "drm_crtc_helper.h" +#include <linux/dma_remapping.h>
#define HAS_eDP (intel_pipe_has_type(crtc, INTEL_OUTPUT_EDP))
@@ -7928,6 +7928,33 @@ void intel_init_emon(struct drm_device *dev) dev_priv->corr = (lcfuse & LCFUSE_HIV_MASK); }
+static bool intel_enable_rc6(struct drm_device *dev) +{ + /* + * Respect the kernel parameter if it is set + */ + if (i915_enable_rc6 >= 0) + return i915_enable_rc6; + + /* + * Disable RC6 on Ironlake + */ + if (INTEL_INFO(dev)->gen == 5) + return 0; + + /* + * Enable rc6 on Sandybridge if DMA remapping is disabled + */ + if (INTEL_INFO(dev)->gen == 6) { + DRM_DEBUG_DRIVER("Sandybridge: intel_iommu_enabled %s -- RC6 %sabled\n", + intel_iommu_enabled ? "true" : "false", + !intel_iommu_enabled ? "en" : "dis"); + return !intel_iommu_enabled; + } + DRM_DEBUG_DRIVER("RC6 enabled\n"); + return 1; +} + void gen6_enable_rps(struct drm_i915_private *dev_priv) { u32 rp_state_cap = I915_READ(GEN6_RP_STATE_CAP); @@ -7964,7 +7991,7 @@ void gen6_enable_rps(struct drm_i915_private *dev_priv) I915_WRITE(GEN6_RC6p_THRESHOLD, 100000); I915_WRITE(GEN6_RC6pp_THRESHOLD, 64000); /* unused */
- if (i915_enable_rc6) + if (intel_enable_rc6(dev_priv->dev)) rc6_mask = GEN6_RC_CTL_RC6p_ENABLE | GEN6_RC_CTL_RC6_ENABLE;
@@ -8413,7 +8440,7 @@ void ironlake_enable_rc6(struct drm_device *dev) /* rc6 disabled by default due to repeated reports of hanging during * boot and resume. */ - if (!i915_enable_rc6) + if (!intel_enable_rc6(dev)) return;
mutex_lock(&dev->struct_mutex);
On Fri, Dec 09, 2011 at 03:53:49PM -0800, Keith Packard wrote:
RC6 should always work on IVB, and should work on SNB whenever IO remapping is disabled. RC6 never works on Ironlake. Make the default value for the parameter follow these guidelines. Setting the value to either 0 or 1 will force the specified behavior.
I still don't like this. We're in a situation where we clearly have to disable one feature or the other on SNB - but we're disabling the one that's going to be useful to a large number of people, and leaving the niche feature enabled. If we're going to merge this then let's turn off iommu on SNB by default.
dri-devel@lists.freedesktop.org