Skip to content

Commit 2397f79

Browse files
qzhuo2aegl
authored and committed
EDAC/skx_common: Differentiate memory error sources
The current skx_common determines whether the memory error source is the near memory of the 2LM system and then retrieves the decoded error results from the ADXL components (near-memory vs. far-memory) accordingly. However, some memory controllers may have limitations in correctly reporting the memory error source, leading to the retrieval of incorrect decoded parts from the ADXL.

To address these limitations, instead of simply determining whether the memory error is from the near memory of the 2LM system, it is necessary to distinguish the memory error source details as follows:

  - Memory error from the near memory of the 2LM system.
  - Memory error from the far memory of the 2LM system.
  - Memory error from the 1LM system.
  - Not a memory error.

This will enable the i10nm_edac driver to take appropriate actions for those memory controllers that have limitations in reporting the memory error source.

Fixes: ba987ea ("EDAC/i10nm: Add Intel Granite Rapids server support")
Signed-off-by: Qiuxu Zhuo <[email protected]>
Signed-off-by: Tony Luck <[email protected]>
Tested-by: Diego Garcia Rodriguez <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent ddb8a8a commit 2397f79

File tree

2 files changed

+23
-18
lines changed

2 files changed

+23
-18
lines changed

drivers/edac/skx_common.c

Lines changed: 16 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,7 @@ void skx_adxl_put(void)
119119
}
120120
EXPORT_SYMBOL_GPL(skx_adxl_put);
121121

122-
static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem)
122+
static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
123123
{
124124
struct skx_dev *d;
125125
int i, len = 0;
@@ -136,7 +136,7 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me
136136
}
137137

138138
res->socket = (int)adxl_values[component_indices[INDEX_SOCKET]];
139-
if (error_in_1st_level_mem) {
139+
if (err_src == ERR_SRC_2LM_NM) {
140140
res->imc = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ?
141141
(int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1;
142142
res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ?
@@ -620,40 +620,38 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
620620
optype, skx_msg);
621621
}
622622

623-
static bool skx_error_in_1st_level_mem(const struct mce *m)
623+
static enum error_source skx_error_source(const struct mce *m)
624624
{
625-
u32 errcode;
625+
u32 errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
626626

627-
if (!skx_mem_cfg_2lm)
628-
return false;
629-
630-
errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
631-
632-
return errcode == MCACOD_EXT_MEM_ERR;
633-
}
627+
if (errcode != MCACOD_MEM_CTL_ERR && errcode != MCACOD_EXT_MEM_ERR)
628+
return ERR_SRC_NOT_MEMORY;
634629

635-
static bool skx_error_in_mem(const struct mce *m)
636-
{
637-
u32 errcode;
630+
if (!skx_mem_cfg_2lm)
631+
return ERR_SRC_1LM;
638632

639-
errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
633+
if (errcode == MCACOD_EXT_MEM_ERR)
634+
return ERR_SRC_2LM_NM;
640635

641-
return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR);
636+
return ERR_SRC_2LM_FM;
642637
}
643638

644639
int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
645640
void *data)
646641
{
647642
struct mce *mce = (struct mce *)data;
643+
enum error_source err_src;
648644
struct decoded_addr res;
649645
struct mem_ctl_info *mci;
650646
char *type;
651647

652648
if (mce->kflags & MCE_HANDLED_CEC)
653649
return NOTIFY_DONE;
654650

651+
err_src = skx_error_source(mce);
652+
655653
/* Ignore unless this is memory related with an address */
656-
if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
654+
if (err_src == ERR_SRC_NOT_MEMORY || !(mce->status & MCI_STATUS_ADDRV))
657655
return NOTIFY_DONE;
658656

659657
memset(&res, 0, sizeof(res));
@@ -667,7 +665,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
667665
/* Try driver decoder first */
668666
if (!(driver_decode && driver_decode(&res))) {
669667
/* Then try firmware decoder (ACPI DSM methods) */
670-
if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))))
668+
if (!(adxl_component_count && skx_adxl_decode(&res, err_src)))
671669
return NOTIFY_DONE;
672670
}
673671

drivers/edac/skx_common.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,13 @@ enum {
146146
INDEX_MAX
147147
};
148148

149+
enum error_source {
150+
ERR_SRC_1LM,
151+
ERR_SRC_2LM_NM,
152+
ERR_SRC_2LM_FM,
153+
ERR_SRC_NOT_MEMORY,
154+
};
155+
149156
#define BIT_NM_MEMCTRL BIT_ULL(INDEX_NM_MEMCTRL)
150157
#define BIT_NM_CHANNEL BIT_ULL(INDEX_NM_CHANNEL)
151158
#define BIT_NM_DIMM BIT_ULL(INDEX_NM_DIMM)

0 commit comments

Comments
 (0)