Skip to content

Commit d9207cf

Browse files
qzhuo2aegl
authored and committed
EDAC/{skx_common,i10nm}: Fix some missing error reports on Emerald Rapids
When doing error injection to some memory DIMMs on certain Intel Emerald Rapids servers, the i10nm_edac missed error reports for some memory DIMMs. Certain BIOS configurations may hide some memory controllers, and the i10nm_edac doesn't enumerate these hidden memory controllers. However, the ADXL decodes memory errors using memory controller physical indices even if there are hidden memory controllers. Therefore, the memory controller physical indices reported by the ADXL may mismatch the logical indices enumerated by the i10nm_edac, resulting in missed error reports for some memory DIMMs. Fix this issue by creating a mapping table from memory controller physical indices (used by the ADXL) to logical indices (used by the i10nm_edac) and using it to convert the physical indices to the logical indices during the error handling process. Fixes: c545f5e ("EDAC/i10nm: Skip the absent memory controllers") Reported-by: Kevin Chang <[email protected]> Tested-by: Kevin Chang <[email protected]> Reported-by: Thomas Chen <[email protected]> Tested-by: Thomas Chen <[email protected]> Signed-off-by: Qiuxu Zhuo <[email protected]> Signed-off-by: Tony Luck <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 267e5b1 commit d9207cf

File tree

3 files changed

+46
-0
lines changed

3 files changed

+46
-0
lines changed

drivers/edac/i10nm_base.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,8 @@ static int i10nm_get_ddr_munits(void)
751751
continue;
752752
} else {
753753
d->imc[lmc].mdev = mdev;
754+
if (res_cfg->type == SPR)
755+
skx_set_mc_mapping(d, i, lmc);
754756
lmc++;
755757
}
756758
}

drivers/edac/skx_common.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,35 @@ void skx_adxl_put(void)
121121
}
122122
EXPORT_SYMBOL_GPL(skx_adxl_put);
123123

124+
static void skx_init_mc_mapping(struct skx_dev *d)
125+
{
126+
/*
127+
* By default, the BIOS presents all memory controllers within each
128+
* socket to the EDAC driver. The physical indices are the same as
129+
* the logical indices of the memory controllers enumerated by the
130+
* EDAC driver.
131+
*/
132+
for (int i = 0; i < NUM_IMC; i++)
133+
d->mc_mapping[i] = i;
134+
}
135+
136+
/*
 * skx_set_mc_mapping() - record one physical-to-logical controller mapping.
 * @d:   the per-socket device whose mc_mapping[] table is updated.
 * @pmc: physical memory controller index; used directly as the index into
 *       d->mc_mapping[] (no range check is performed here).
 * @lmc: logical memory controller index stored for @pmc.
 *
 * Also emits a debug message describing the mapping.
 */
void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
{
	u8 *slot = &d->mc_mapping[pmc];

	*slot = lmc;

	edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n",
		 pmc, lmc);
}
143+
EXPORT_SYMBOL_GPL(skx_set_mc_mapping);
144+
145+
/*
 * skx_get_mc_mapping() - look up the logical index for a physical controller.
 * @d:   the per-socket device whose mc_mapping[] table is consulted.
 * @pmc: physical memory controller index; used directly as the index into
 *       d->mc_mapping[] (no range check is performed here).
 *
 * Emits a debug message describing the lookup.
 *
 * Return: the value stored in d->mc_mapping[@pmc].
 */
static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
{
	u8 lmc = d->mc_mapping[pmc];

	edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
		 pmc, lmc);

	return lmc;
}
152+
124153
static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
125154
{
126155
struct skx_dev *d;
@@ -188,6 +217,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
188217
return false;
189218
}
190219

220+
res->imc = skx_get_mc_mapping(d, res->imc);
221+
191222
for (i = 0; i < adxl_component_count; i++) {
192223
if (adxl_values[i] == ~0x0ull)
193224
continue;
@@ -326,6 +357,8 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
326357
d->bus[0], d->bus[1], d->bus[2], d->bus[3]);
327358
list_add_tail(&d->list, &dev_edac_list);
328359
prev = pdev;
360+
361+
skx_init_mc_mapping(d);
329362
}
330363

331364
if (list)

drivers/edac/skx_common.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,16 @@ struct skx_dev {
9393
struct pci_dev *uracu; /* for i10nm CPU */
9494
struct pci_dev *pcu_cr3; /* for HBM memory detection */
9595
u32 mcroute;
96+
/*
97+
* Some server BIOS may hide certain memory controllers, and the
98+
* EDAC driver skips those hidden memory controllers. However, the
99+
* ADXL still decodes memory error address using physical memory
100+
* controller indices. The mapping table is used to convert the
101+
* physical indices (reported by ADXL) to the logical indices
102+
* (used by the EDAC driver) of present memory controllers during the
103+
* error handling process.
104+
*/
105+
u8 mc_mapping[NUM_IMC];
96106
struct skx_imc {
97107
struct mem_ctl_info *mci;
98108
struct pci_dev *mdev; /* for i10nm CPU */
@@ -242,6 +252,7 @@ void skx_adxl_put(void);
242252
void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log);
243253
void skx_set_mem_cfg(bool mem_cfg_2lm);
244254
void skx_set_res_cfg(struct res_config *cfg);
255+
void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc);
245256

246257
int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
247258

0 commit comments

Comments
 (0)