Skip to content

Commit b8825e5

Browse files
[202405] PCIe AER printk ratelimiting backport
Backport patches that implement ratelimiting for AER report printks. This reduces log spam from devices that regularly report correctable errors on their links. This is the kernel 6.1 version of sonic-net#520, applied to 202405; it is similar to sonic-net#521, which was applied to 202505. Upstream discussion: https://lore.kernel.org/linux-pci/20250522232339.1525671-1-helgaas@kernel.org/T/
1 parent d10d26c commit b8825e5

13 files changed

+1276
-0
lines changed
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
From 8aec28caf011788522c1fd2dea92959477016053 Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <bhelgaas@google.com>
3+
Date: Thu, 22 May 2025 18:21:09 -0500
4+
Subject: [PATCH] PCI/AER: Factor COR/UNCOR error handling out from
5+
aer_isr_one_error()
6+
MIME-Version: 1.0
7+
Content-Type: text/plain; charset=UTF-8
8+
Content-Transfer-Encoding: 8bit
9+
10+
aer_isr_one_error() duplicates the Error Source ID logging and AER error
11+
processing for Correctable Errors and Uncorrectable Errors. Factor out the
12+
duplicated code to aer_isr_one_error_type().
13+
14+
aer_isr_one_error() doesn't need the struct aer_rpc pointer, so pass it the
15+
Root Port or RCEC pci_dev pointer instead.
16+
17+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
18+
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
19+
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
20+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
21+
Link: https://patch.msgid.link/20250522232339.1525671-4-helgaas@kernel.org
22+
(cherry picked from commit 6fc4dae74afcf29ef82afbaaa9b082893871eda4)
23+
---
24+
drivers/pci/pcie/aer.c | 36 +++++++++++++++++++++++-------------
25+
1 file changed, 23 insertions(+), 13 deletions(-)
26+
27+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
28+
index 01234567..89abcdef 100644
29+
--- a/drivers/pci/pcie/aer.c
30+
+++ b/drivers/pci/pcie/aer.c
31+
@@ -1287,17 +1287,32 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info)
32+
}
33+
34+
/**
35+
- * aer_isr_one_error - consume an error detected by root port
36+
- * @rpc: pointer to the root port which holds an error
37+
+ * aer_isr_one_error_type - consume a Correctable or Uncorrectable Error
38+
+ * detected by Root Port or RCEC
39+
+ * @root: pointer to Root Port or RCEC that signaled AER interrupt
40+
+ * @info: pointer to AER error info
41+
+ */
42+
+static void aer_isr_one_error_type(struct pci_dev *root,
43+
+ struct aer_err_info *info)
44+
+{
45+
+ aer_print_port_info(root, info);
46+
+
47+
+ if (find_source_device(root, info))
48+
+ aer_process_err_devices(info);
49+
+}
50+
+
51+
+/**
52+
+ * aer_isr_one_error - consume error(s) signaled by an AER interrupt from
53+
+ * Root Port or RCEC
54+
+ * @root: pointer to Root Port or RCEC that signaled AER interrupt
55+
* @e_src: pointer to an error source
56+
*/
57+
-static void aer_isr_one_error(struct aer_rpc *rpc,
58+
+static void aer_isr_one_error(struct pci_dev *root,
59+
struct aer_err_source *e_src)
60+
{
61+
- struct pci_dev *pdev = rpc->rpd;
62+
struct aer_err_info e_info;
63+
64+
- pci_rootport_aer_stats_incr(pdev, e_src);
65+
+ pci_rootport_aer_stats_incr(root, e_src);
66+
67+
/*
68+
* There is a possibility that both correctable error and
69+
@@ -1312,10 +1327,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
70+
e_info.multi_error_valid = 1;
71+
else
72+
e_info.multi_error_valid = 0;
73+
- aer_print_port_info(pdev, &e_info);
74+
75+
- if (find_source_device(pdev, &e_info))
76+
- aer_process_err_devices(&e_info);
77+
+ aer_isr_one_error_type(root, &e_info);
78+
}
79+
80+
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
81+
@@ -1332,10 +1345,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
82+
else
83+
e_info.multi_error_valid = 0;
84+
85+
- aer_print_port_info(pdev, &e_info);
86+
-
87+
- if (find_source_device(pdev, &e_info))
88+
- aer_process_err_devices(&e_info);
89+
+ aer_isr_one_error_type(root, &e_info);
90+
}
91+
}
92+
93+
@@ -1356,7 +1366,7 @@ static irqreturn_t aer_isr(int irq, void *context)
94+
return IRQ_NONE;
95+
96+
while (kfifo_get(&rpc->aer_fifo, &e_src))
97+
- aer_isr_one_error(rpc, &e_src);
98+
+ aer_isr_one_error(rpc->rpd, &e_src);
99+
return IRQ_HANDLED;
100+
}
101+
102+
--
103+
2.47.0
104+
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
From cf5770619326108794a72ca7b3500ad9e3aefa90 Mon Sep 17 00:00:00 2001
2+
From: Vernon Yang <yanglincheng@kylinos.cn>
3+
Date: Fri, 5 Sep 2025 02:25:27 +0800
4+
Subject: [PATCH 1/2] PCI/AER: Fix NULL pointer access by aer_info
5+
6+
The kzalloc(GFP_KERNEL) may return NULL, so all accesses to aer_info->xxx
7+
will result in kernel panic. Fix it.
8+
9+
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
10+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
11+
Link: https://patch.msgid.link/20250904182527.67371-1-vernon2gm@gmail.com
12+
(cherry picked from commit 0a27bdb14b028fed30a10cec2f945c38cb5ca4fa)
13+
---
14+
drivers/pci/pcie/aer.c | 6 +++++-
15+
1 file changed, 5 insertions(+), 1 deletion(-)
16+
17+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
18+
index 01234567..89abcdef 100644
19+
--- a/drivers/pci/pcie/aer.c
20+
+++ b/drivers/pci/pcie/aer.c
21+
@@ -385,7 +385,11 @@ void pci_aer_init(struct pci_dev *dev)
22+
if (!dev->aer_cap)
23+
return;
24+
25+
- dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
26+
+ dev->aer_stats = kzalloc(sizeof(*dev->aer_stats), GFP_KERNEL);
27+
+ if (!dev->aer_stats) {
28+
+ dev->aer_cap = 0;
29+
+ return;
30+
+ }
31+
32+
ratelimit_state_init(&dev->aer_stats->correctable_ratelimit,
33+
DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
34+
--
35+
2.39.5 (Apple Git-154)
36+
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
From fed119ab131c202e7677ce0228d17f5cb74baa29 Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <bhelgaas@google.com>
3+
Date: Thu, 22 May 2025 18:21:15 -0500
4+
Subject: [PATCH 1/8] PCI/AER: Simplify pci_print_aer()
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
Simplify pci_print_aer() by initializing the struct aer_err_info "info"
10+
with a designated initializer list (it was previously initialized with
11+
memset()) and using pci_name().
12+
13+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
14+
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
15+
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
16+
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
17+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
18+
Link: https://patch.msgid.link/20250522232339.1525671-10-helgaas@kernel.org
19+
(cherry picked from commit ad9839137cf9fb0f0c2d531bd04bc4382e6f2de9)
20+
---
21+
drivers/pci/pcie/aer.c | 16 ++++++++--------
22+
1 file changed, 8 insertions(+), 8 deletions(-)
23+
24+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
25+
index 01234567..89abcdef 100644
26+
--- a/drivers/pci/pcie/aer.c
27+
+++ b/drivers/pci/pcie/aer.c
28+
@@ -774,7 +774,10 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
29+
{
30+
int layer, agent, tlp_header_valid = 0;
31+
u32 status, mask;
32+
- struct aer_err_info info;
33+
+ struct aer_err_info info = {
34+
+ .severity = aer_severity,
35+
+ .first_error = PCI_ERR_CAP_FEP(aer->cap_control),
36+
+ };
37+
38+
if (aer_severity == AER_CORRECTABLE) {
39+
status = aer->cor_status;
40+
@@ -785,14 +788,11 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
41+
tlp_header_valid = status & AER_LOG_TLP_MASKS;
42+
}
43+
44+
- layer = AER_GET_LAYER_ERROR(aer_severity, status);
45+
- agent = AER_GET_AGENT(aer_severity, status);
46+
-
47+
- memset(&info, 0, sizeof(info));
48+
- info.severity = aer_severity;
49+
info.status = status;
50+
info.mask = mask;
51+
- info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
52+
+
53+
+ layer = AER_GET_LAYER_ERROR(aer_severity, status);
54+
+ agent = AER_GET_AGENT(aer_severity, status);
55+
56+
pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
57+
__aer_print_error(dev, &info);
58+
@@ -806,7 +806,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
59+
if (tlp_header_valid)
60+
__print_tlp_header(dev, &aer->header_log);
61+
62+
- trace_aer_event(dev_name(&dev->dev), (status & ~mask),
63+
+ trace_aer_event(pci_name(dev), (status & ~mask),
64+
aer_severity, tlp_header_valid, &aer->header_log);
65+
}
66+
#endif
67+
--
68+
2.47.0
69+
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
From 23d8218139853dda49859e3041f17111cdc47400 Mon Sep 17 00:00:00 2001
2+
From: Breno Leitao <leitao@debian.org>
3+
Date: Mon, 29 Sep 2025 02:15:47 -0700
4+
Subject: [PATCH 2/2] PCI/AER: Avoid NULL pointer dereference in
5+
aer_ratelimit()
6+
7+
When platform firmware supplies error information to the OS, e.g., via the
8+
ACPI APEI GHES mechanism, it may identify an error source device that
9+
doesn't advertise an AER Capability and therefore dev->aer_info, which
10+
contains AER stats and ratelimiting data, is NULL.
11+
12+
pci_dev_aer_stats_incr() already checks dev->aer_info for NULL, but
13+
aer_ratelimit() did not, leading to NULL pointer dereferences like this one
14+
from the URL below:
15+
16+
{1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0
17+
{1}[Hardware Error]: event severity: corrected
18+
{1}[Hardware Error]: device_id: 0000:00:00.0
19+
{1}[Hardware Error]: vendor_id: 0x8086, device_id: 0x2020
20+
{1}[Hardware Error]: aer_cor_status: 0x00001000, aer_cor_mask: 0x00002000
21+
BUG: kernel NULL pointer dereference, address: 0000000000000264
22+
RIP: 0010:___ratelimit+0xc/0x1b0
23+
pci_print_aer+0x141/0x360
24+
aer_recover_work_func+0xb5/0x130
25+
26+
[8086:2020] is an Intel "Sky Lake-E DMI3 Registers" device that claims to
27+
be a Root Port but does not advertise an AER Capability.
28+
29+
Add a NULL check in aer_ratelimit() to avoid the NULL pointer dereference.
30+
Note that this also prevents ratelimiting these events from GHES.
31+
32+
Fixes: a57f2bfb4a5863 ("PCI/AER: Ratelimit correctable and non-fatal error logging")
33+
Link: https://lore.kernel.org/r/buduna6darbvwfg3aogl5kimyxkggu3n4romnmq6sozut6axeu@clnx7sfsy457/
34+
Signed-off-by: Breno Leitao <leitao@debian.org>
35+
[bhelgaas: add crash details to commit log]
36+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
37+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
38+
Cc: stable@vger.kernel.org
39+
Link: https://patch.msgid.link/20250929-aer_crash_2-v1-1-68ec4f81c356@debian.org
40+
(cherry picked from commit deb2f228388ff3a9d0623e3b59a053e9235c341d)
41+
---
42+
drivers/pci/pcie/aer.c | 3 +++
43+
1 file changed, 3 insertions(+)
44+
45+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
46+
index 01234567..89abcdef 100644
47+
--- a/drivers/pci/pcie/aer.c
48+
+++ b/drivers/pci/pcie/aer.c
49+
@@ -800,6 +800,9 @@ static void __print_tlp_header(struct pci_dev *dev,
50+
51+
static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
52+
{
53+
+ if (!dev->aer_stats)
54+
+ return 1;
55+
+
56+
switch (severity) {
57+
case AER_NONFATAL:
58+
return __ratelimit(&dev->aer_stats->nonfatal_ratelimit);
59+
--
60+
2.39.5 (Apple Git-154)
61+
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
From 8bbcbe849d91b64da2850797482bf0915772898b Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <bhelgaas@google.com>
3+
Date: Thu, 22 May 2025 18:21:16 -0500
4+
Subject: [PATCH 2/8] PCI/AER: Update statistics before ratelimiting
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
There are two AER logging entry points:
10+
11+
- aer_print_error() is used by DPC (dpc_process_error()) and native AER
12+
handling (aer_process_err_devices()).
13+
14+
- pci_print_aer() is used by GHES (aer_recover_work_func()) and CXL
15+
(cxl_handle_rdport_errors())
16+
17+
Both use __aer_print_error() to print the AER error bits. Previously
18+
__aer_print_error() also incremented the AER statistics via
19+
pci_dev_aer_stats_incr().
20+
21+
Call pci_dev_aer_stats_incr() early in the entry points instead of in
22+
__aer_print_error() so we update the statistics even if the actual printing
23+
of error bits is rate limited by a future change.
24+
25+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
26+
Tested-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
27+
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
28+
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
29+
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
30+
Link: https://patch.msgid.link/20250522232339.1525671-11-helgaas@kernel.org
31+
(cherry picked from commit 88a7765e62b9e4c79c7ca2c7b749ae04f54a5668)
32+
---
33+
drivers/pci/pcie/aer.c | 5 ++++-
34+
1 file changed, 4 insertions(+), 1 deletion(-)
35+
36+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
37+
index 01234567..89abcdef 100644
38+
--- a/drivers/pci/pcie/aer.c
39+
+++ b/drivers/pci/pcie/aer.c
40+
@@ -703,7 +703,6 @@ static void __aer_print_error(struct pci_dev *dev,
41+
pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
42+
info->first_error == i ? " (First)" : "");
43+
}
44+
- pci_dev_aer_stats_incr(dev, info);
45+
}
46+
47+
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
48+
@@ -712,6 +711,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
49+
int id = ((dev->bus->number << 8) | dev->devfn);
50+
const char *level;
51+
52+
+ pci_dev_aer_stats_incr(dev, info);
53+
+
54+
if (!info->status) {
55+
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
56+
aer_error_severity_string[info->severity]);
57+
@@ -791,6 +792,8 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
58+
info.status = status;
59+
info.mask = mask;
60+
61+
+ pci_dev_aer_stats_incr(dev, &info);
62+
+
63+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
64+
agent = AER_GET_AGENT(aer_severity, status);
65+
66+
--
67+
2.47.0
68+

0 commit comments

Comments
 (0)