|
28 | 28 | #include <linux/interrupt.h>
|
29 | 29 | #include <linux/delay.h>
|
30 | 30 | #include <linux/kfifo.h>
|
| 31 | +#include <linux/ratelimit.h> |
31 | 32 | #include <linux/slab.h>
|
32 | 33 | #include <acpi/apei.h>
|
33 | 34 | #include <acpi/ghes.h>
|
@@ -88,6 +89,10 @@ struct aer_info {
|
88 | 89 | u64 rootport_total_cor_errs;
|
89 | 90 | u64 rootport_total_fatal_errs;
|
90 | 91 | u64 rootport_total_nonfatal_errs;
|
| 92 | + |
| 93 | + /* Ratelimits for errors */ |
| 94 | + struct ratelimit_state correctable_ratelimit; |
| 95 | + struct ratelimit_state nonfatal_ratelimit; |
91 | 96 | };
|
92 | 97 |
|
93 | 98 | #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
|
@@ -379,6 +384,11 @@ void pci_aer_init(struct pci_dev *dev)
|
379 | 384 |
|
380 | 385 | dev->aer_info = kzalloc(sizeof(*dev->aer_info), GFP_KERNEL);
|
381 | 386 |
|
| 387 | + ratelimit_state_init(&dev->aer_info->correctable_ratelimit, |
| 388 | + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); |
| 389 | + ratelimit_state_init(&dev->aer_info->nonfatal_ratelimit, |
| 390 | + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); |
| 391 | + |
382 | 392 | /*
|
383 | 393 | * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
|
384 | 394 | * PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event
|
@@ -669,6 +679,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
|
669 | 679 | }
|
670 | 680 | }
|
671 | 681 |
|
| 682 | +static int aer_ratelimit(struct pci_dev *dev, unsigned int severity) |
| 683 | +{ |
| 684 | + switch (severity) { |
| 685 | + case AER_NONFATAL: |
| 686 | + return __ratelimit(&dev->aer_info->nonfatal_ratelimit); |
| 687 | + case AER_CORRECTABLE: |
| 688 | + return __ratelimit(&dev->aer_info->correctable_ratelimit); |
| 689 | + default: |
| 690 | + return 1; /* Don't ratelimit fatal errors */ |
| 691 | + } |
| 692 | +} |
| 693 | + |
672 | 694 | static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
|
673 | 695 | {
|
674 | 696 | const char **strings;
|
@@ -721,6 +743,9 @@ void aer_print_error(struct aer_err_info *info, int i)
|
721 | 743 | trace_aer_event(pci_name(dev), (info->status & ~info->mask),
|
722 | 744 | info->severity, info->tlp_header_valid, &info->tlp);
|
723 | 745 |
|
| 746 | + if (!info->ratelimit_print[i]) |
| 747 | + return; |
| 748 | + |
724 | 749 | if (!info->status) {
|
725 | 750 | pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
|
726 | 751 | aer_error_severity_string[info->severity]);
|
@@ -790,6 +815,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
|
790 | 815 | trace_aer_event(pci_name(dev), (status & ~mask),
|
791 | 816 | aer_severity, tlp_header_valid, &aer->header_log);
|
792 | 817 |
|
| 818 | + if (!aer_ratelimit(dev, info.severity)) |
| 819 | + return; |
| 820 | + |
793 | 821 | layer = AER_GET_LAYER_ERROR(aer_severity, status);
|
794 | 822 | agent = AER_GET_AGENT(aer_severity, status);
|
795 | 823 |
|
@@ -824,6 +852,18 @@ static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
|
824 | 852 | e_info->dev[i] = pci_dev_get(dev);
|
825 | 853 | e_info->error_dev_num++;
|
826 | 854 |
|
| 855 | + /* |
| 856 | + * Ratelimit AER log messages. "dev" is either the source |
| 857 | + * identified by the root's Error Source ID or it has an unmasked |
| 858 | + * error logged in its own AER Capability. Messages are emitted |
| 859 | + * when "ratelimit_print[i]" is non-zero. If we will print detail |
| 860 | + * for a downstream device, make sure we print the Error Source ID |
| 861 | + * from the root as well. |
| 862 | + */ |
| 863 | + if (aer_ratelimit(dev, e_info->severity)) { |
| 864 | + e_info->ratelimit_print[i] = 1; |
| 865 | + e_info->root_ratelimit_print = 1; |
| 866 | + } |
827 | 867 | return 0;
|
828 | 868 | }
|
829 | 869 |
|
@@ -918,7 +958,7 @@ static int find_device_iter(struct pci_dev *dev, void *data)
|
918 | 958 | * e_info->error_dev_num and e_info->dev[], based on the given information.
|
919 | 959 | */
|
920 | 960 | static bool find_source_device(struct pci_dev *parent,
|
921 |
| - struct aer_err_info *e_info) |
| 961 | + struct aer_err_info *e_info) |
922 | 962 | {
|
923 | 963 | struct pci_dev *dev = parent;
|
924 | 964 | int result;
|
@@ -1144,9 +1184,10 @@ static void aer_recover_work_func(struct work_struct *work)
|
1144 | 1184 | pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
|
1145 | 1185 | entry.devfn);
|
1146 | 1186 | if (!pdev) {
|
1147 |
| - pr_err("no pci_dev for %04x:%02x:%02x.%x\n", |
1148 |
| - entry.domain, entry.bus, |
1149 |
| - PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn)); |
| 1187 | + pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n", |
| 1188 | + entry.domain, entry.bus, |
| 1189 | + PCI_SLOT(entry.devfn), |
| 1190 | + PCI_FUNC(entry.devfn)); |
1150 | 1191 | continue;
|
1151 | 1192 | }
|
1152 | 1193 | pci_print_aer(pdev, entry.severity, entry.regs);
|
@@ -1294,7 +1335,22 @@ static void aer_isr_one_error_type(struct pci_dev *root,
|
1294 | 1335 | bool found;
|
1295 | 1336 |
|
1296 | 1337 | found = find_source_device(root, info);
|
1297 |
| - aer_print_source(root, info, found); |
| 1338 | + |
| 1339 | + /* |
| 1340 | + * If we're going to log error messages, we've already set |
| 1341 | + * "info->root_ratelimit_print" and "info->ratelimit_print[i]" to |
| 1342 | + * non-zero (which enables printing) because this is either an |
| 1343 | + * ERR_FATAL or we found a device with an error logged in its AER |
| 1344 | + * Capability. |
| 1345 | + * |
| 1346 | + * If we didn't find the Error Source device, at least log the |
| 1347 | + * Requester ID from the ERR_* Message received by the Root Port or |
| 1348 | + * RCEC, ratelimited by the RP or RCEC. |
| 1349 | + */ |
| 1350 | + if (info->root_ratelimit_print || |
| 1351 | + (!found && aer_ratelimit(root, info->severity))) |
| 1352 | + aer_print_source(root, info, found); |
| 1353 | + |
1298 | 1354 | if (found)
|
1299 | 1355 | aer_process_err_devices(info);
|
1300 | 1356 | }
|
|
0 commit comments