Skip to content

Commit 71a8440

Browse files
yghannam authored and suryasaimadhu committed
x86/MCE/AMD: Don't report L1 BTB MCA errors on some family 17h models
AMD family 17h Models 10h-2Fh may report a high number of L1 BTB MCA errors under certain conditions. The errors are benign and can safely be ignored. However, the high error rate may cause the MCA threshold counter to overflow, causing a high rate of thresholding interrupts.

In addition, users may see the errors reported through the AMD MCE decoder module, even with the interrupt disabled, due to MCA polling.

Clear the "Counter Present" bit in the Instruction Fetch bank's MCA_MISC0 register. This will prevent enabling MCA thresholding on this bank which will prevent the high interrupt rate due to this error.

Define an AMD-specific function to filter these errors from the MCE event pool so that they don't get reported during early boot.

Rename filter function in EDAC/mce_amd to avoid a naming conflict, while at it.

[ bp: Move function prototype to the internal header and massage/cleanup, fix typos. ]

Reported-by: Rafał Miłecki <[email protected]>
Signed-off-by: Yazen Ghannam <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: "[email protected]" <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: James Morse <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Mauro Carvalho Chehab <[email protected]>
Cc: Pu Wen <[email protected]>
Cc: Qiuxu Zhuo <[email protected]>
Cc: Shirish S <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: Vishal Verma <[email protected]>
Cc: linux-edac <[email protected]>
Cc: x86-ml <[email protected]>
Cc: <[email protected]> # 5.0.x: c95b323: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
Cc: <[email protected]> # 5.0.x: 30aa3d2: x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
Cc: <[email protected]> # 5.0.x: 9308fd4: x86/MCE: Group AMD function prototypes in <asm/mce.h>
Cc: <[email protected]> # 5.0.x
Link: https://lkml.kernel.org/r/[email protected]
1 parent 45d4b7b commit 71a8440

File tree

4 files changed

+50
-15
lines changed

4 files changed

+50
-15
lines changed

arch/x86/kernel/cpu/mce/amd.c

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -563,33 +563,59 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
563563
return offset;
564564
}
565565

566+
/*
 * AMD-specific MCE filter.
 *
 * Returns true when the record in @m should be filtered out, false
 * otherwise. The only signature matched here is: boot CPU is family 0x17,
 * model 0x10-0x2F, the reporting bank is of type SMCA_IF and the extended
 * error code equals 10. Per the commit description these are the benign
 * L1 BTB errors.
 */
bool amd_filter_mce(struct mce *m)
567+
{
568+
enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
569+
struct cpuinfo_x86 *c = &boot_cpu_data;
570+
/* Extended error code: bits 21:16 of m->status. */
u8 xec = (m->status >> 16) & 0x3F;
571+
572+
/* See Family 17h Models 10h-2Fh Erratum #1114. */
573+
if (c->x86 == 0x17 &&
574+
c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
575+
bank_type == SMCA_IF && xec == 10)
576+
return true;
577+
578+
return false;
579+
}
580+
566581
/*
567-
* Turn off MC4_MISC thresholding banks on all family 0x15 models since
568-
* they're not supported there.
582+
* Turn off thresholding banks for the following conditions:
583+
* - MC4_MISC thresholding is not supported on Family 0x15.
584+
* - Prevent possible spurious interrupts from the IF bank on Family 0x17
585+
* Models 0x10-0x2F due to Erratum #1114.
569586
*/
570-
void disable_err_thresholding(struct cpuinfo_x86 *c)
587+
void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
571588
{
572-
int i;
589+
int i, num_msrs;
573590
u64 hwcr;
574591
bool need_toggle;
575-
u32 msrs[] = {
576-
0x00000413, /* MC4_MISC0 */
577-
0xc0000408, /* MC4_MISC1 */
578-
};
592+
u32 msrs[NR_BLOCKS];
593+
594+
if (c->x86 == 0x15 && bank == 4) {
595+
msrs[0] = 0x00000413; /* MC4_MISC0 */
596+
msrs[1] = 0xc0000408; /* MC4_MISC1 */
597+
num_msrs = 2;
598+
} else if (c->x86 == 0x17 &&
599+
(c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
579600

580-
if (c->x86 != 0x15)
601+
if (smca_get_bank_type(bank) != SMCA_IF)
602+
return;
603+
604+
msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
605+
num_msrs = 1;
606+
} else {
581607
return;
608+
}
582609

583610
rdmsrl(MSR_K7_HWCR, hwcr);
584611

585612
/* McStatusWrEn has to be set */
586613
need_toggle = !(hwcr & BIT(18));
587-
588614
if (need_toggle)
589615
wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
590616

591617
/* Clear CntP bit safely */
592-
for (i = 0; i < ARRAY_SIZE(msrs); i++)
618+
for (i = 0; i < num_msrs; i++)
593619
msr_clear_bit(msrs[i], 62);
594620

595621
/* restore old settings */
@@ -604,12 +630,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
604630
unsigned int bank, block, cpu = smp_processor_id();
605631
int offset = -1;
606632

607-
disable_err_thresholding(c);
608-
609633
for (bank = 0; bank < mca_cfg.banks; ++bank) {
610634
if (mce_flags.smca)
611635
smca_configure(bank, cpu);
612636

637+
disable_err_thresholding(c, bank);
638+
613639
for (block = 0; block < NR_BLOCKS; ++block) {
614640
address = get_block_address(address, low, high, bank, block);
615641
if (!address)

arch/x86/kernel/cpu/mce/core.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1777,6 +1777,9 @@ static void __mcheck_cpu_init_timer(void)
17771777

17781778
/*
 * Vendor dispatch for MCE filtering: returns true when the record in @m
 * should be filtered out. Only AMD boot CPUs have a filter; every other
 * vendor falls through and nothing is filtered.
 */
bool filter_mce(struct mce *m)
17791779
{
1780+
/* Defer to the AMD-specific filter when the boot CPU vendor is AMD. */
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1781+
return amd_filter_mce(m);
1782+
17801783
return false;
17811784
}
17821785

arch/x86/kernel/cpu/mce/internal.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,4 +176,10 @@ extern struct mca_msr_regs msr_ops;
176176
/* Decide whether to add MCE record to MCE event pool or filter it out. */
177177
extern bool filter_mce(struct mce *m);
178178

179+
#ifdef CONFIG_X86_MCE_AMD
180+
extern bool amd_filter_mce(struct mce *m);
181+
#else
182+
/* Stub used when CONFIG_X86_MCE_AMD is not set: never filters a record. */
static inline bool amd_filter_mce(struct mce *m) { return false; }
183+
#endif
184+
179185
#endif /* __X86_MCE_INTERNAL_H__ */

drivers/edac/mce_amd.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,7 +1004,7 @@ static inline void amd_decode_err_code(u16 ec)
10041004
/*
10051005
* Filter out unwanted MCE signatures here.
10061006
*/
1007-
static bool amd_filter_mce(struct mce *m)
1007+
static bool ignore_mce(struct mce *m)
10081008
{
10091009
/*
10101010
* NB GART TLB error reporting is disabled by default.
@@ -1038,7 +1038,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
10381038
unsigned int fam = x86_family(m->cpuid);
10391039
int ecc;
10401040

1041-
if (amd_filter_mce(m))
1041+
if (ignore_mce(m))
10421042
return NOTIFY_STOP;
10431043

10441044
pr_emerg(HW_ERR "%s\n", decode_error_status(m));

0 commit comments

Comments
 (0)