Skip to content

Commit b75efe8

Browse files
Evan Quan and alexdeucher
authored and committed
drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation
An intentional delay is added when a soft CTF is triggered. The GPU temperature is then double-checked before any further action is taken. This avoids unintended shutdowns caused by momentary temperature fluctuations. Signed-off-by: Evan Quan <[email protected]> Reviewed-by: Lijo Lazar <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 064329c commit b75efe8

File tree

8 files changed

+102
-32
lines changed

8 files changed

+102
-32
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,9 @@ extern int amdgpu_user_partt_mode;
286286
#define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
287287
#define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
288288

289+
/* Extra time delay (in ms) used to filter out momentary temperature fluctuations */
290+
#define AMDGPU_SWCTF_EXTRA_DELAY 50
291+
289292
struct amdgpu_xcp_mgr;
290293
struct amdgpu_device;
291294
struct amdgpu_irq_src;

drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <linux/gfp.h>
2727
#include <linux/slab.h>
2828
#include <linux/firmware.h>
29+
#include <linux/reboot.h>
2930
#include "amd_shared.h"
3031
#include "amd_powerplay.h"
3132
#include "power_state.h"
@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
9192
return 0;
9293
}
9394

95+
static void pp_swctf_delayed_work_handler(struct work_struct *work)
96+
{
97+
struct pp_hwmgr *hwmgr =
98+
container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
99+
struct amdgpu_device *adev = hwmgr->adev;
100+
struct amdgpu_dpm_thermal *range =
101+
&adev->pm.dpm.thermal;
102+
uint32_t gpu_temperature, size;
103+
int ret;
104+
105+
/*
106+
* If the hotspot/edge temperature is confirmed as below SW CTF setting point
107+
* after the delay enforced, nothing will be done.
108+
* Otherwise, a graceful shutdown will be performed to prevent further damage.
109+
*/
110+
if (range->sw_ctf_threshold &&
111+
hwmgr->hwmgr_func->read_sensor) {
112+
ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
113+
AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
114+
&gpu_temperature,
115+
&size);
116+
/*
117+
* For some legacy ASICs, hotspot temperature retrieving might be not
118+
* supported. Check the edge temperature instead then.
119+
*/
120+
if (ret == -EOPNOTSUPP)
121+
ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
122+
AMDGPU_PP_SENSOR_EDGE_TEMP,
123+
&gpu_temperature,
124+
&size);
125+
if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
126+
return;
127+
}
128+
129+
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
130+
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
131+
orderly_poweroff(true);
132+
}
133+
94134
static int pp_sw_init(void *handle)
95135
{
96136
struct amdgpu_device *adev = handle;
@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)
101141

102142
pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");
103143

144+
if (!ret)
145+
INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
146+
pp_swctf_delayed_work_handler);
147+
104148
return ret;
105149
}
106150

@@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)
135179
struct amdgpu_device *adev = handle;
136180
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
137181

182+
cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
183+
138184
hwmgr_hw_fini(hwmgr);
139185

140186
return 0;
@@ -221,6 +267,8 @@ static int pp_suspend(void *handle)
221267
struct amdgpu_device *adev = handle;
222268
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
223269

270+
cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
271+
224272
return hwmgr_suspend(hwmgr);
225273
}
226274

drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev,
603603
struct amdgpu_irq_src *source,
604604
struct amdgpu_iv_entry *entry)
605605
{
606+
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
606607
uint32_t client_id = entry->client_id;
607608
uint32_t src_id = entry->src_id;
608609

609610
if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {
610611
if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {
611-
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
612-
/*
613-
* SW CTF just occurred.
614-
* Try to do a graceful shutdown to prevent further damage.
615-
*/
616-
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
617-
orderly_poweroff(true);
618-
} else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
612+
schedule_delayed_work(&hwmgr->swctf_delayed_work,
613+
msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
614+
} else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {
619615
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
620-
else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
616+
} else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
621617
dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
622618
/*
623619
* HW CTF just occurred. Shutdown to prevent further damage.
@@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev,
626622
orderly_poweroff(true);
627623
}
628624
} else if (client_id == SOC15_IH_CLIENTID_THM) {
629-
if (src_id == 0) {
630-
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
631-
/*
632-
* SW CTF just occurred.
633-
* Try to do a graceful shutdown to prevent further damage.
634-
*/
635-
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
636-
orderly_poweroff(true);
637-
} else
625+
if (src_id == 0)
626+
schedule_delayed_work(&hwmgr->swctf_delayed_work,
627+
msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
628+
else
638629
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
639630
} else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
640631
dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");

drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,8 @@ struct pp_hwmgr {
811811
bool gfxoff_state_changed_by_workload;
812812
uint32_t pstate_sclk_peak;
813813
uint32_t pstate_mclk_peak;
814+
815+
struct delayed_work swctf_delayed_work;
814816
};
815817

816818
int hwmgr_early_init(struct pp_hwmgr *hwmgr);

drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
#include <linux/firmware.h>
2626
#include <linux/pci.h>
27+
#include <linux/reboot.h>
2728

2829
#include "amdgpu.h"
2930
#include "amdgpu_smu.h"
@@ -1078,6 +1079,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)
10781079
smu->ppt_funcs->interrupt_work(smu);
10791080
}
10801081

1082+
static void smu_swctf_delayed_work_handler(struct work_struct *work)
1083+
{
1084+
struct smu_context *smu =
1085+
container_of(work, struct smu_context, swctf_delayed_work.work);
1086+
struct smu_temperature_range *range =
1087+
&smu->thermal_range;
1088+
struct amdgpu_device *adev = smu->adev;
1089+
uint32_t hotspot_tmp, size;
1090+
1091+
/*
1092+
* If the hotspot temperature is confirmed as below SW CTF setting point
1093+
* after the delay enforced, nothing will be done.
1094+
* Otherwise, a graceful shutdown will be performed to prevent further damage.
1095+
*/
1096+
if (range->software_shutdown_temp &&
1097+
smu->ppt_funcs->read_sensor &&
1098+
!smu->ppt_funcs->read_sensor(smu,
1099+
AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
1100+
&hotspot_tmp,
1101+
&size) &&
1102+
hotspot_tmp / 1000 < range->software_shutdown_temp)
1103+
return;
1104+
1105+
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
1106+
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
1107+
orderly_poweroff(true);
1108+
}
1109+
10811110
static int smu_sw_init(void *handle)
10821111
{
10831112
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1120,6 +1149,9 @@ static int smu_sw_init(void *handle)
11201149
smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
11211150
smu->smu_dpm.requested_dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
11221151

1152+
INIT_DELAYED_WORK(&smu->swctf_delayed_work,
1153+
smu_swctf_delayed_work_handler);
1154+
11231155
ret = smu_smc_table_sw_init(smu);
11241156
if (ret) {
11251157
dev_err(adev->dev, "Failed to sw init smc table!\n");
@@ -1600,6 +1632,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
16001632
return ret;
16011633
}
16021634

1635+
cancel_delayed_work_sync(&smu->swctf_delayed_work);
1636+
16031637
ret = smu_disable_dpms(smu);
16041638
if (ret) {
16051639
dev_err(adev->dev, "Fail to disable dpm features!\n");

drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,8 @@ struct smu_context
573573
u32 debug_param_reg;
574574
u32 debug_msg_reg;
575575
u32 debug_resp_reg;
576+
577+
struct delayed_work swctf_delayed_work;
576578
};
577579

578580
struct i2c_adapter;

drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1412,13 +1412,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
14121412
if (client_id == SOC15_IH_CLIENTID_THM) {
14131413
switch (src_id) {
14141414
case THM_11_0__SRCID__THM_DIG_THERM_L2H:
1415-
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
1416-
/*
1417-
* SW CTF just occurred.
1418-
* Try to do a graceful shutdown to prevent further damage.
1419-
*/
1420-
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
1421-
orderly_poweroff(true);
1415+
schedule_delayed_work(&smu->swctf_delayed_work,
1416+
msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
14221417
break;
14231418
case THM_11_0__SRCID__THM_DIG_THERM_H2L:
14241419
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");

drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1353,13 +1353,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev,
13531353
if (client_id == SOC15_IH_CLIENTID_THM) {
13541354
switch (src_id) {
13551355
case THM_11_0__SRCID__THM_DIG_THERM_L2H:
1356-
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
1357-
/*
1358-
* SW CTF just occurred.
1359-
* Try to do a graceful shutdown to prevent further damage.
1360-
*/
1361-
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
1362-
orderly_poweroff(true);
1356+
schedule_delayed_work(&smu->swctf_delayed_work,
1357+
msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
13631358
break;
13641359
case THM_11_0__SRCID__THM_DIG_THERM_H2L:
13651360
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");

0 commit comments

Comments
 (0)