Skip to content

Commit 98cbc97

Browse files
lotus-nexthop, nate-nexthop
authored and committed
Add a patch for printing the AMD Zen CPU reset reason
If I intentionally trigger a CPU soft reset, I see this:

```
admin@gold208-dut:~$ sudo dmesg | grep -i reason
[ 0.635233] x86/amd: Previous system reset reason [0x00080800]: software wrote 0x6 to reset control register 0xCF9
```

If I intentionally trigger the CPU FCH Watchdog, I see this:

```
admin@gold208-dut:~$ sudo dmesg | grep reason
[ 0.632563] x86/amd: Previous system reset reason [0x02000800]: hardware watchdog timer expired
```

Upstream from here: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=ab8131028710d009ab93d6bffd2a2749ade909b0

The patch had to be adapted to the v6.1 kernel we're using. That basically meant adding the entire contents (5 constants) of `fch.h`, as the file didn't exist in v6.1, and updating the `amd.c` hunks to match the v6.1 context.

Signed-off-by: Nate White <[email protected]>
1 parent b451639 commit 98cbc97

File tree

3 files changed

+211
-0
lines changed

3 files changed

+211
-0
lines changed
Lines changed: 143 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,143 @@
1+
From 498de92a48a4aadc233cd560414f585601f54e52 Mon Sep 17 00:00:00 2001
2+
From: Yazen Ghannam <[email protected]>
3+
Date: Tue, 22 Apr 2025 18:48:30 -0500
4+
Subject: [PATCH 1/2] x86/CPU/AMD: Print the reason for the last reset
5+
6+
[ Upstream commit ab8131028710d009ab93d6bffd2a2749ade909b0 ]
7+
8+
The following register contains bits that indicate the cause for the
9+
previous reset.
10+
11+
PMx000000C0 (FCH::PM::S5_RESET_STATUS)
12+
13+
This is useful for debug. The reasons for reset are broken into 6 high level
14+
categories. Decode it by category and print during boot.
15+
16+
Specifics within a category are split off into debugging documentation.
17+
18+
The register is accessed indirectly through a "PM" port in the FCH. Use
19+
MMIO access in order to avoid restrictions with legacy port access.
20+
21+
Use a late_initcall() to ensure that MMIO has been set up before trying to
22+
access the register.
23+
24+
This register was introduced with AMD Family 17h, so avoid access on older
25+
families. There is no CPUID feature bit for this register.
26+
27+
[ bp: Simplify the reason dumping loop.
28+
- merge a fix to not access an array element after the last one:
29+
https://lore.kernel.org/r/[email protected]
30+
Reported-by: James Dutton <[email protected]>
31+
]
32+
33+
[ mingo:
34+
- Use consistent .rst formatting
35+
- Fix 'Sleep' class field to 'ACPI-State'
36+
- Standardize pin messages around the 'tripped' verbiage
37+
- Remove reference to ring-buffer printing & simplify the wording
38+
- Use curly braces for multi-line conditional statements ]
39+
40+
Signed-off-by: Yazen Ghannam <[email protected]>
41+
Co-developed-by: Mario Limonciello <[email protected]>
42+
Signed-off-by: Mario Limonciello <[email protected]>
43+
Signed-off-by: Borislav Petkov (AMD) <[email protected]>
44+
Signed-off-by: Ingo Molnar <[email protected]>
45+
Signed-off-by: Borislav Petkov (AMD) <[email protected]>
46+
Link: https://lore.kernel.org/[email protected]
47+
---
48+
arch/x86/include/asm/amd/fch.h | 13 ++++++++
49+
arch/x86/kernel/cpu/amd.c | 54 ++++++++++++++++++++++++++++++++++
50+
2 files changed, 67 insertions(+)
51+
create mode 100644 arch/x86/include/asm/amd/fch.h
52+
53+
diff --git a/arch/x86/include/asm/amd/fch.h b/arch/x86/include/asm/amd/fch.h
54+
new file mode 100644
55+
index 000000000..2cf5153ed
56+
--- /dev/null
57+
+++ b/arch/x86/include/asm/amd/fch.h
58+
@@ -0,0 +1,13 @@
59+
+/* SPDX-License-Identifier: GPL-2.0 */
60+
+#ifndef _ASM_X86_AMD_FCH_H_
61+
+#define _ASM_X86_AMD_FCH_H_
62+
+
63+
+#define FCH_PM_BASE 0xFED80300
64+
+
65+
+/* Register offsets from PM base: */
66+
+#define FCH_PM_DECODEEN 0x00
67+
+#define FCH_PM_DECODEEN_SMBUS0SEL GENMASK(20, 19)
68+
+#define FCH_PM_SCRATCH 0x80
69+
+#define FCH_PM_S5_RESET_STATUS 0xC0
70+
+
71+
+#endif /* _ASM_X86_AMD_FCH_H_ */
72+
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
73+
index 37796a1d0..6a2177a22 100644
74+
--- a/arch/x86/kernel/cpu/amd.c
75+
+++ b/arch/x86/kernel/cpu/amd.c
76+
@@ -9,6 +9,7 @@
77+
#include <linux/sched/clock.h>
78+
#include <linux/random.h>
79+
#include <linux/topology.h>
80+
+#include <asm/amd/fch.h>
81+
#include <asm/processor.h>
82+
#include <asm/apic.h>
83+
#include <asm/cacheinfo.h>
84+
@@ -1310,3 +1311,56 @@ void noinstr amd_clear_divider(void)
85+
:: "a" (0), "d" (0), "r" (1));
86+
}
87+
EXPORT_SYMBOL_GPL(amd_clear_divider);
88+
+
89+
+static const char * const s5_reset_reason_txt[] = {
90+
+ [0] = "thermal pin BP_THERMTRIP_L was tripped",
91+
+ [1] = "power button was pressed for 4 seconds",
92+
+ [2] = "shutdown pin was tripped",
93+
+ [4] = "remote ASF power off command was received",
94+
+ [9] = "internal CPU thermal limit was tripped",
95+
+ [16] = "system reset pin BP_SYS_RST_L was tripped",
96+
+ [17] = "software issued PCI reset",
97+
+ [18] = "software wrote 0x4 to reset control register 0xCF9",
98+
+ [19] = "software wrote 0x6 to reset control register 0xCF9",
99+
+ [20] = "software wrote 0xE to reset control register 0xCF9",
100+
+ [21] = "ACPI power state transition occurred",
101+
+ [22] = "keyboard reset pin KB_RST_L was tripped",
102+
+ [23] = "internal CPU shutdown event occurred",
103+
+ [24] = "system failed to boot before failed boot timer expired",
104+
+ [25] = "hardware watchdog timer expired",
105+
+ [26] = "remote ASF reset command was received",
106+
+ [27] = "an uncorrected error caused a data fabric sync flood event",
107+
+ [29] = "FCH and MP1 failed warm reset handshake",
108+
+ [30] = "a parity error occurred",
109+
+ [31] = "a software sync flood event occurred",
110+
+};
111+
+
112+
+static __init int print_s5_reset_status_mmio(void)
113+
+{
114+
+ unsigned long value;
115+
+ void __iomem *addr;
116+
+ int i;
117+
+
118+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
119+
+ return 0;
120+
+
121+
+ addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value));
122+
+ if (!addr)
123+
+ return 0;
124+
+
125+
+ value = ioread32(addr);
126+
+ iounmap(addr);
127+
+
128+
+ for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
129+
+ if (!(value & BIT(i)))
130+
+ continue;
131+
+
132+
+ if (s5_reset_reason_txt[i]) {
133+
+ pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n",
134+
+ value, s5_reset_reason_txt[i]);
135+
+ }
136+
+ }
137+
+
138+
+ return 0;
139+
+}
140+
+late_initcall(print_s5_reset_status_mmio);
141+
--
142+
2.39.5
143+
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
From 5dbe8f4b35737d66b465fd3a16dd8db84f987d1a Mon Sep 17 00:00:00 2001
2+
From: Yazen Ghannam <[email protected]>
3+
Date: Mon, 21 Jul 2025 18:11:54 +0000
4+
Subject: [PATCH 2/2] x86/CPU/AMD: Ignore invalid reset reason value
5+
6+
[ Upstream commit e9576e078220c50ace9e9087355423de23e25fa5 ]
7+
8+
The reset reason value may be "all bits set", e.g. 0xFFFFFFFF. This is a
9+
commonly used error response from hardware. This may occur due to a real
10+
hardware issue or when running in a VM.
11+
12+
The user will see all reset reasons reported in this case.
13+
14+
Check for an error response value and return early to avoid decoding
15+
invalid data.
16+
17+
Also, adjust the data variable type to match the hardware register size.
18+
19+
Fixes: ab8131028710 ("x86/CPU/AMD: Print the reason for the last reset")
20+
Reported-by: Libing He <[email protected]>
21+
Signed-off-by: Yazen Ghannam <[email protected]>
22+
Signed-off-by: Borislav Petkov (AMD) <[email protected]>
23+
Reviewed-by: Mario Limonciello <[email protected]>
24+
25+
Link: https://lore.kernel.org/[email protected]
26+
---
27+
arch/x86/kernel/cpu/amd.c | 8 ++++++--
28+
1 file changed, 6 insertions(+), 2 deletions(-)
29+
30+
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
31+
index 6a2177a22..eb5777d6f 100644
32+
--- a/arch/x86/kernel/cpu/amd.c
33+
+++ b/arch/x86/kernel/cpu/amd.c
34+
@@ -1337,8 +1337,8 @@ static const char * const s5_reset_reason_txt[] = {
35+
36+
static __init int print_s5_reset_status_mmio(void)
37+
{
38+
- unsigned long value;
39+
void __iomem *addr;
40+
+ u32 value;
41+
int i;
42+
43+
if (!cpu_feature_enabled(X86_FEATURE_ZEN))
44+
@@ -1351,12 +1351,16 @@ static __init int print_s5_reset_status_mmio(void)
45+
value = ioread32(addr);
46+
iounmap(addr);
47+
48+
+ /* Value with "all bits set" is an error response and should be ignored. */
49+
+ if (value == U32_MAX)
50+
+ return 0;
51+
+
52+
for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
53+
if (!(value & BIT(i)))
54+
continue;
55+
56+
if (s5_reset_reason_txt[i]) {
57+
- pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n",
58+
+ pr_info("x86/amd: Previous system reset reason [0x%08x]: %s\n",
59+
value, s5_reset_reason_txt[i]);
60+
}
61+
}
62+
--
63+
2.39.5
64+

patches-sonic/series

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -201,6 +201,10 @@ cisco-npu-disable-other-bars.patch
201 201
0001-fix-os-crash-caused-by-optoe-when-class-switch.patch
202 202
0001-tty-8250-HSUART-DMA-be-deactivated-for-DNV-CPU.patch
203 203

204+
# Nexthop patches
205+
0001-x86-CPU-AMD-Print-the-reason-for-the-last-reset.patch
206+
0002-x86-CPU-AMD-Ignore-invalid-reset-reason-value.patch
207+
204 208
# Fix to avoid kernel panic on Kernel 6.1.94
205 209
# https://github.com/sonic-net/sonic-buildimage/issues/20901
206 210
#PCI-ASPM-Fix-link-state-exit-during-switch-upstream.patch # Upstreamed

0 commit comments

Comments
 (0)