Skip to content

Commit d4fca13

Browse files
Avadhut Naik authored and bp3tk0v committed
x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers
Starting with Zen4, AMD's Scalable MCA systems incorporate two new registers: MCA_SYND1 and MCA_SYND2.

These registers will include supplemental error information in addition to the existing MCA_SYND register. The data within these registers is considered valid if MCA_STATUS[SyndV] is set.

Userspace error decoding tools like rasdaemon gather related hardware error information through the tracepoints.

Therefore, export these two registers through the mce_record tracepoint so that tools like rasdaemon can parse them and output the supplemental error information like FRU text contained in them.

[ bp: Massage. ]

Signed-off-by: Yazen Ghannam <[email protected]>
Signed-off-by: Avadhut Naik <[email protected]>
Signed-off-by: Borislav Petkov (AMD) <[email protected]>
Reviewed-by: Qiuxu Zhuo <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent e52750f commit d4fca13

File tree

6 files changed

+46
-7
lines changed

6 files changed

+46
-7
lines changed

arch/x86/include/asm/mce.h

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,9 @@
122122
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
123123
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
124124
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
125+
/* Registers MISC2 to MISC4 are at offsets B to D. */
126+
#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e
127+
#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f
125128
#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
126129
#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
127130
#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
@@ -132,6 +135,8 @@
132135
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
133136
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
134137
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
138+
#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
139+
#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))
135140

136141
#define XEC(x, mask) (((x) >> 16) & mask)
137142

@@ -190,9 +195,25 @@ enum mce_notifier_prios {
190195
/**
191196
* struct mce_hw_err - Hardware Error Record.
192197
* @m: Machine Check record.
198+
* @vendor: Vendor-specific error information.
199+
*
200+
* Vendor-specific fields should not be added to struct mce. Instead, vendors
201+
* should export their vendor-specific data through their structure in the
202+
* vendor union below.
203+
*
204+
* AMD's vendor data is parsed by error decoding tools for supplemental error
205+
* information. Thus, current offsets of existing fields must be maintained.
206+
* Only add new fields at the end of AMD's vendor structure.
193207
*/
194208
struct mce_hw_err {
195209
struct mce m;
210+
211+
union vendor_info {
212+
struct {
213+
u64 synd1; /* MCA_SYND1 MSR */
214+
u64 synd2; /* MCA_SYND2 MSR */
215+
} amd;
216+
} vendor;
196217
};
197218

198219
#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m)

arch/x86/include/uapi/asm/mce.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@
88
/*
99
* Fields are zero when not available. Also, this struct is shared with
1010
* userspace mcelog and thus must keep existing fields at current offsets.
11-
* Only add new fields to the end of the structure
11+
* Only add new, shared fields to the end of the structure.
12+
* Do not add vendor-specific fields.
1213
*/
1314
struct mce {
1415
__u64 status; /* Bank's MCi_STATUS MSR */

arch/x86/kernel/cpu/mce/amd.c

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -797,8 +797,11 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
797797
if (mce_flags.smca) {
798798
rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
799799

800-
if (m->status & MCI_STATUS_SYNDV)
800+
if (m->status & MCI_STATUS_SYNDV) {
801801
rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
802+
rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
803+
rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
804+
}
802805
}
803806

804807
mce_log(&err);

arch/x86/kernel/cpu/mce/core.c

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -202,6 +202,10 @@ static void __print_mce(struct mce_hw_err *err)
202202
if (mce_flags.smca) {
203203
if (m->synd)
204204
pr_cont("SYND %llx ", m->synd);
205+
if (err->vendor.amd.synd1)
206+
pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
207+
if (err->vendor.amd.synd2)
208+
pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
205209
if (m->ipid)
206210
pr_cont("IPID %llx ", m->ipid);
207211
}
@@ -678,8 +682,11 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
678682
if (mce_flags.smca) {
679683
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
680684

681-
if (m->status & MCI_STATUS_SYNDV)
685+
if (m->status & MCI_STATUS_SYNDV) {
682686
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
687+
err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
688+
err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
689+
}
683690
}
684691
}
685692

drivers/edac/mce_amd.c

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -793,6 +793,7 @@ static int
793793
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
794794
{
795795
struct mce *m = (struct mce *)data;
796+
struct mce_hw_err *err = to_mce_hw_err(m);
796797
unsigned int fam = x86_family(m->cpuid);
797798
int ecc;
798799

@@ -850,8 +851,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
850851
if (boot_cpu_has(X86_FEATURE_SMCA)) {
851852
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
852853

853-
if (m->status & MCI_STATUS_SYNDV)
854-
pr_cont(", Syndrome: 0x%016llx", m->synd);
854+
if (m->status & MCI_STATUS_SYNDV) {
855+
pr_cont(", Syndrome: 0x%016llx\n", m->synd);
856+
pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
857+
err->vendor.amd.synd1, err->vendor.amd.synd2);
858+
}
855859

856860
pr_cont("\n");
857861

include/trace/events/mce.h

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ TRACE_EVENT(mce_record,
4343
__field( u8, bank )
4444
__field( u8, cpuvendor )
4545
__field( u32, microcode )
46+
__dynamic_array(u8, v_data, sizeof(err->vendor))
4647
),
4748

4849
TP_fast_assign(
@@ -65,9 +66,10 @@ TRACE_EVENT(mce_record,
6566
__entry->bank = err->m.bank;
6667
__entry->cpuvendor = err->m.cpuvendor;
6768
__entry->microcode = err->m.microcode;
69+
memcpy(__get_dynamic_array(v_data), &err->vendor, sizeof(err->vendor));
6870
),
6971

70-
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x",
72+
TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s",
7173
__entry->cpu,
7274
__entry->mcgcap, __entry->mcgstatus,
7375
__entry->bank, __entry->status,
@@ -83,7 +85,8 @@ TRACE_EVENT(mce_record,
8385
__entry->walltime,
8486
__entry->socketid,
8587
__entry->apicid,
86-
__entry->microcode)
88+
__entry->microcode,
89+
__print_dynamic_array(v_data, sizeof(u8)))
8790
);
8891

8992
#endif /* _TRACE_MCE_H */

0 commit comments

Comments
 (0)