Skip to content

Commit 01ae815

Browse files
committed
Merge tag 'ras_core_for_6.7_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Borislav Petkov:

 - Specify what error addresses reported on AMD are actually usable
   memory error addresses for further decoding

* tag 'ras_core_for_6.7_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Cleanup mce_usable_address()
  x86/mce: Define amd_mce_usable_address()
  x86/MCE/AMD: Split amd_mce_is_memory_error()
2 parents 66cc883 + 1bae0cf commit 01ae815

File tree

5 files changed

+99
-27
lines changed

5 files changed

+99
-27
lines changed

arch/x86/include/asm/mce.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ static inline void cmci_recheck(void) {}
245245
int mce_available(struct cpuinfo_x86 *c);
246246
bool mce_is_memory_error(struct mce *m);
247247
bool mce_is_correctable(struct mce *m);
248-
int mce_usable_address(struct mce *m);
248+
bool mce_usable_address(struct mce *m);
249249

250250
DECLARE_PER_CPU(unsigned, mce_exception_count);
251251
DECLARE_PER_CPU(unsigned, mce_poll_count);

arch/x86/kernel/cpu/mce/amd.c

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -713,17 +713,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
713713
deferred_error_interrupt_enable(c);
714714
}
715715

716-
bool amd_mce_is_memory_error(struct mce *m)
716+
/*
717+
* DRAM ECC errors are reported in the Northbridge (bank 4) with
718+
* Extended Error Code 8.
719+
*/
720+
static bool legacy_mce_is_memory_error(struct mce *m)
721+
{
722+
return m->bank == 4 && XEC(m->status, 0x1f) == 8;
723+
}
724+
725+
/*
726+
* DRAM ECC errors are reported in Unified Memory Controllers with
727+
* Extended Error Code 0.
728+
*/
729+
static bool smca_mce_is_memory_error(struct mce *m)
717730
{
718731
enum smca_bank_types bank_type;
719-
/* ErrCodeExt[20:16] */
720-
u8 xec = (m->status >> 16) & 0x1f;
732+
733+
if (XEC(m->status, 0x3f))
734+
return false;
721735

722736
bank_type = smca_get_bank_type(m->extcpu, m->bank);
737+
738+
return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
739+
}
740+
741+
bool amd_mce_is_memory_error(struct mce *m)
742+
{
723743
if (mce_flags.smca)
724-
return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0;
744+
return smca_mce_is_memory_error(m);
745+
else
746+
return legacy_mce_is_memory_error(m);
747+
}
748+
749+
/*
750+
* AMD systems do not have an explicit indicator that the value in MCA_ADDR is
751+
* a system physical address. Therefore, individual cases need to be detected.
752+
* Future cases and checks will be added as needed.
753+
*
754+
* 1) General case
755+
* a) Assume address is not usable.
756+
* 2) Poison errors
757+
* a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
758+
* northbridge (bank 4).
759+
* b) Refers to poison consumption in the core. Does not include "no action",
760+
* "action optional", or "deferred" error severities.
761+
* c) Will include a usable address so that immediate action can be taken.
762+
* 3) Northbridge DRAM ECC errors
763+
* a) Reported in legacy bank 4 with extended error code (XEC) 8.
764+
* b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
765+
* this bit should not be checked.
766+
*
767+
* NOTE: SMCA UMC memory errors fall into case #1.
768+
*/
769+
bool amd_mce_usable_address(struct mce *m)
770+
{
771+
/* Check special northbridge case 3) first. */
772+
if (!mce_flags.smca) {
773+
if (legacy_mce_is_memory_error(m))
774+
return true;
775+
else if (m->bank == 4)
776+
return false;
777+
}
725778

726-
return m->bank == 4 && xec == 0x8;
779+
/* Check poison bit for all other bank types. */
780+
if (m->status & MCI_STATUS_POISON)
781+
return true;
782+
783+
/* Assume address is not usable for all others. */
784+
return false;
727785
}
728786

729787
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)

arch/x86/kernel/cpu/mce/core.c

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -453,32 +453,22 @@ static void mce_irq_work_cb(struct irq_work *entry)
453453
mce_schedule_work();
454454
}
455455

456-
/*
457-
* Check if the address reported by the CPU is in a format we can parse.
458-
* It would be possible to add code for most other cases, but all would
459-
* be somewhat complicated (e.g. segment offset would require an instruction
460-
* parser). So only support physical addresses up to page granularity for now.
461-
*/
462-
int mce_usable_address(struct mce *m)
456+
bool mce_usable_address(struct mce *m)
463457
{
464458
if (!(m->status & MCI_STATUS_ADDRV))
465-
return 0;
466-
467-
/* Checks after this one are Intel/Zhaoxin-specific: */
468-
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
469-
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
470-
return 1;
471-
472-
if (!(m->status & MCI_STATUS_MISCV))
473-
return 0;
459+
return false;
474460

475-
if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
476-
return 0;
461+
switch (m->cpuvendor) {
462+
case X86_VENDOR_AMD:
463+
return amd_mce_usable_address(m);
477464

478-
if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
479-
return 0;
465+
case X86_VENDOR_INTEL:
466+
case X86_VENDOR_ZHAOXIN:
467+
return intel_mce_usable_address(m);
480468

481-
return 1;
469+
default:
470+
return true;
471+
}
482472
}
483473
EXPORT_SYMBOL_GPL(mce_usable_address);
484474

arch/x86/kernel/cpu/mce/intel.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,3 +536,23 @@ bool intel_filter_mce(struct mce *m)
536536

537537
return false;
538538
}
539+
540+
/*
541+
* Check if the address reported by the CPU is in a format we can parse.
542+
* It would be possible to add code for most other cases, but all would
543+
* be somewhat complicated (e.g. segment offset would require an instruction
544+
* parser). So only support physical addresses up to page granularity for now.
545+
*/
546+
bool intel_mce_usable_address(struct mce *m)
547+
{
548+
if (!(m->status & MCI_STATUS_MISCV))
549+
return false;
550+
551+
if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
552+
return false;
553+
554+
if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
555+
return false;
556+
557+
return true;
558+
}

arch/x86/kernel/cpu/mce/internal.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ void intel_init_cmci(void);
4949
void intel_init_lmce(void);
5050
void intel_clear_lmce(void);
5151
bool intel_filter_mce(struct mce *m);
52+
bool intel_mce_usable_address(struct mce *m);
5253
#else
5354
# define cmci_intel_adjust_timer mce_adjust_timer_default
5455
static inline bool mce_intel_cmci_poll(void) { return false; }
@@ -58,6 +59,7 @@ static inline void intel_init_cmci(void) { }
5859
static inline void intel_init_lmce(void) { }
5960
static inline void intel_clear_lmce(void) { }
6061
static inline bool intel_filter_mce(struct mce *m) { return false; }
62+
static inline bool intel_mce_usable_address(struct mce *m) { return false; }
6163
#endif
6264

6365
void mce_timer_kick(unsigned long interval);
@@ -210,6 +212,7 @@ extern bool filter_mce(struct mce *m);
210212

211213
#ifdef CONFIG_X86_MCE_AMD
212214
extern bool amd_filter_mce(struct mce *m);
215+
bool amd_mce_usable_address(struct mce *m);
213216

214217
/*
215218
* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
@@ -237,6 +240,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m)
237240

238241
#else
239242
static inline bool amd_filter_mce(struct mce *m) { return false; }
243+
static inline bool amd_mce_usable_address(struct mce *m) { return false; }
240244
static inline void smca_extract_err_addr(struct mce *m) { }
241245
#endif
242246

0 commit comments

Comments
 (0)