Skip to content

Commit 3c70ec7

Browse files
djbw authored and davejiang committed
cxl/ras: Fix CPER handler device confusion
By inspection, cxl_cper_handle_prot_err() is making a series of fragile assumptions that can lead to crashes:

1/ It assumes that endpoints identified in the record are a CXL-type-3 device, nothing guarantees that.

2/ It assumes that the device is bound to the cxl_pci driver, nothing guarantees that.

3/ Minor, it holds the device lock over the switch-port tracing for no reason as the trace is 100% generated from data in the record.

Correct those by checking that the PCIe endpoint parents a cxl_memdev before assuming the format of the driver data, and move the lock to where it is required.

Consequently this also makes the implementation ready for CXL accelerators that are not bound to cxl_pci.

Fixes: 36f257e ("acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors")
Cc: Terry Bowman <[email protected]>
Cc: Li Ming <[email protected]>
Cc: Alison Schofield <[email protected]>
Cc: Ira Weiny <[email protected]>
Cc: Tony Luck <[email protected]>
Reviewed-by: Smita Koralahalli <[email protected]>
Reviewed-by: Dave Jiang <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
Reviewed-by: Jonathan Cameron <[email protected]>
Reviewed-by: Li Ming <[email protected]>
Link: https://patch.msgid.link/[email protected]
Signed-off-by: Dave Jiang <[email protected]>
1 parent a403fe6 commit 3c70ec7

File tree

1 file changed

+27
-20
lines changed

1 file changed

+27
-20
lines changed

drivers/cxl/core/ras.c

Lines changed: 27 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -31,40 +31,38 @@ static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
3131
ras_cap.header_log);
3232
}
3333

34-
static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
35-
struct cxl_ras_capability_regs ras_cap)
34+
static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
35+
struct cxl_ras_capability_regs ras_cap)
3636
{
3737
u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
38-
struct cxl_dev_state *cxlds;
3938

40-
cxlds = pci_get_drvdata(pdev);
41-
if (!cxlds)
42-
return;
43-
44-
trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
39+
trace_cxl_aer_correctable_error(cxlmd, status);
4540
}
4641

47-
static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
48-
struct cxl_ras_capability_regs ras_cap)
42+
static void
43+
cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
44+
struct cxl_ras_capability_regs ras_cap)
4945
{
5046
u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
51-
struct cxl_dev_state *cxlds;
5247
u32 fe;
5348

54-
cxlds = pci_get_drvdata(pdev);
55-
if (!cxlds)
56-
return;
57-
5849
if (hweight32(status) > 1)
5950
fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
6051
ras_cap.cap_control));
6152
else
6253
fe = status;
6354

64-
trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
55+
trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
6556
ras_cap.header_log);
6657
}
6758

59+
static int match_memdev_by_parent(struct device *dev, const void *uport)
60+
{
61+
if (is_cxl_memdev(dev) && dev->parent == uport)
62+
return 1;
63+
return 0;
64+
}
65+
6866
static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
6967
{
7068
unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
@@ -73,13 +71,12 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
7371
pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
7472
data->prot_err.agent_addr.bus,
7573
devfn);
74+
struct cxl_memdev *cxlmd;
7675
int port_type;
7776

7877
if (!pdev)
7978
return;
8079

81-
guard(device)(&pdev->dev);
82-
8380
port_type = pci_pcie_type(pdev);
8481
if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
8582
port_type == PCI_EXP_TYPE_DOWNSTREAM ||
@@ -92,10 +89,20 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
9289
return;
9390
}
9491

92+
guard(device)(&pdev->dev);
93+
if (!pdev->dev.driver)
94+
return;
95+
96+
struct device *mem_dev __free(put_device) = bus_find_device(
97+
&cxl_bus_type, NULL, pdev, match_memdev_by_parent);
98+
if (!mem_dev)
99+
return;
100+
101+
cxlmd = to_cxl_memdev(mem_dev);
95102
if (data->severity == AER_CORRECTABLE)
96-
cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
103+
cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
97104
else
98-
cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
105+
cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
99106
}
100107

101108
static void cxl_cper_prot_err_work_fn(struct work_struct *work)

0 commit comments

Comments
 (0)