Re: nouveau shuts the machine down with v3.9-rc1 (temperature (72 C) hit the 'shutdown' threshold).

5 Mar 2013

On Tue, Mar 05, 2013 at 12:13:57PM +0100, Martin Peres wrote:
...
On 04/03/2013 22:41, Konrad Rzeszutek Wilk wrote:
...
Please CC me in case you would also like me to test them with the
mdelay patch.
Hi Konrad,
Marcin suggested another explanation for the issue you are seeing,
and it made me look at the code again.
I don't have enough nv4x hardware to test all the conditions, but with the
attached patches you may get saner
behaviour than a computer that shuts down whenever you turn it on
(like a "most useless machine ever").
The most important patch is the 8th one.
Please try applying them on top of your 3.9-rc1 kernel and send me
back your kernel logs + sensors output.
I also added a debug patch on top of this to twiddle with the values:

diff --git a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c
index 92f3fca..a5a8abe 100644
--- a/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c
+++ b/drivers/gpu/drm/nouveau/core/subdev/therm/nv40.c
@@ -31,22 +31,28 @@ struct nv40_therm_priv {
enum nv40_sensor_style { INVALID_STYLE = -1, OLD_STYLE = 0, NEW_STYLE = 1 };
+extern int hack_old_style;
+extern int hack_mdelay;
 static enum nv40_sensor_style
 nv40_is_older_style_sensor(struct nouveau_therm *therm)
 {
    struct nouveau_device *device = nv_device(therm);
+	if (hack_old_style) {
+		if (device->chipset == 0x4c)
+			return OLD_STYLE;
+	}
    switch (device->chipset) {
    case 0x43:
    case 0x44:
    case 0x4a:
    case 0x47:
-	case 0x4c:
    	return OLD_STYLE;
case 0x46:
    case 0x49:
    case 0x4b:
+	case 0x4c:
    case 0x4e:
    case 0x67:
    case 0x68:
@@ -66,11 +72,17 @@ nv40_sensor_setup(struct nouveau_therm *therm)
    if (style == NEW_STYLE) {
    	nv_mask(therm, 0x15b8, 0x80000000, 0);
    	nv_wr32(therm, 0x15b0, 0x80003fff);
-		mdelay(20); /* wait for the temperature to stabilize */
+		if (hack_mdelay)
+			mdelay(hack_mdelay);
+		else
+			mdelay(20); /* wait for the temperature to stabilize */
    	return nv_rd32(therm, 0x15b4) & 0x3fff;
    } else if (style == OLD_STYLE) {
    	nv_wr32(therm, 0x15b0, 0xff);
-		mdelay(20); /* wait for the temperature to stabilize */
+		if (hack_mdelay)
+			mdelay(hack_mdelay);
+		else
+			mdelay(20); /* wait for the temperature to stabilize */
    	return nv_rd32(therm, 0x15b4) & 0xff;
    } else
    	return -ENODEV;
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index d109936..d51bf21 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -69,6 +69,12 @@ MODULE_PARM_DESC(modeset, "enable driver (default: auto, "
 int nouveau_modeset = -1;
 module_param_named(modeset, nouveau_modeset, int, 0400);
+int hack_mdelay = 0;
+module_param_named(mdelay, hack_mdelay, int, 0400);
+
+int hack_old_style = 1;
+module_param_named(old_style, hack_old_style, int, 0400);
+
 static struct drm_driver driver;
static int
With that I am still getting the issues (even with an insane delay of 100 seconds).
Here is the serial log with various runs.

    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

Re: nouveau shuts the machine down with v3.9-rc1 (temperature (72 C) hit the 'shutdown' threshold).