Skip to content

Commit 1010b4c

Browse files
madscientist159maddy-kerneldev
authored andcommitted
powerpc/eeh: Make EEH driver device hotplug safe
Multiple race conditions existed between the PCIe hotplug driver and the EEH driver, leading to a variety of kernel oopses of the same general nature: <pcie device unplug> <eeh driver trigger> <hotplug removal trigger> <pcie tree reconfiguration> <eeh recovery next step> <oops in EEH driver bus iteration loop> A second class of oops is also seen when the underlying bus disappears during device recovery. Refactor the EEH module to be PCI rescan and remove safe. Also clean up a few minor formatting / readability issues. Signed-off-by: Timothy Pearson <[email protected]> Signed-off-by: Bjorn Helgaas <[email protected]> Signed-off-by: Madhavan Srinivasan <[email protected]> Link: https://patch.msgid.link/1334208367.1359861.1752615503144.JavaMail.zimbra@raptorengineeringinc.com
1 parent e82b34e commit 1010b4c

File tree

2 files changed

+38
-20
lines changed

2 files changed

+38
-20
lines changed

arch/powerpc/kernel/eeh_driver.c

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -257,13 +257,12 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
257257
struct pci_driver *driver;
258258
enum pci_ers_result new_result;
259259

260-
pci_lock_rescan_remove();
261260
pdev = edev->pdev;
262261
if (pdev)
263262
get_device(&pdev->dev);
264-
pci_unlock_rescan_remove();
265263
if (!pdev) {
266264
eeh_edev_info(edev, "no device");
265+
*result = PCI_ERS_RESULT_DISCONNECT;
267266
return;
268267
}
269268
device_lock(&pdev->dev);
@@ -304,8 +303,9 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root,
304303
struct eeh_dev *edev, *tmp;
305304

306305
pr_info("EEH: Beginning: '%s'\n", name);
307-
eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
308-
eeh_pe_report_edev(edev, fn, result);
306+
eeh_for_each_pe(root, pe)
307+
eeh_pe_for_each_dev(pe, edev, tmp)
308+
eeh_pe_report_edev(edev, fn, result);
309309
if (result)
310310
pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n",
311311
name, pci_ers_result_name(*result));
@@ -383,6 +383,8 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
383383
if (!edev)
384384
return;
385385

386+
pci_lock_rescan_remove();
387+
386388
/*
387389
* The content in the config space isn't saved because
388390
* the blocked config space on some adapters. We have
@@ -393,14 +395,19 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
393395
if (list_is_last(&edev->entry, &edev->pe->edevs))
394396
eeh_pe_restore_bars(edev->pe);
395397

398+
pci_unlock_rescan_remove();
396399
return;
397400
}
398401

399402
pdev = eeh_dev_to_pci_dev(edev);
400-
if (!pdev)
403+
if (!pdev) {
404+
pci_unlock_rescan_remove();
401405
return;
406+
}
402407

403408
pci_restore_state(pdev);
409+
410+
pci_unlock_rescan_remove();
404411
}
405412

406413
/**
@@ -647,9 +654,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
647654
if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
648655
eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
649656
} else {
650-
pci_lock_rescan_remove();
651657
pci_hp_remove_devices(bus);
652-
pci_unlock_rescan_remove();
653658
}
654659

655660
/*
@@ -665,16 +670,13 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
665670
if (rc)
666671
return rc;
667672

668-
pci_lock_rescan_remove();
669-
670673
/* Restore PE */
671674
eeh_ops->configure_bridge(pe);
672675
eeh_pe_restore_bars(pe);
673676

674677
/* Clear frozen state */
675678
rc = eeh_clear_pe_frozen_state(pe, false);
676679
if (rc) {
677-
pci_unlock_rescan_remove();
678680
return rc;
679681
}
680682

@@ -709,7 +711,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
709711
pe->tstamp = tstamp;
710712
pe->freeze_count = cnt;
711713

712-
pci_unlock_rescan_remove();
713714
return 0;
714715
}
715716

@@ -843,10 +844,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
843844
{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
844845
int devices = 0;
845846

847+
pci_lock_rescan_remove();
848+
846849
bus = eeh_pe_bus_get(pe);
847850
if (!bus) {
848851
pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
849852
__func__, pe->phb->global_number, pe->addr);
853+
pci_unlock_rescan_remove();
850854
return;
851855
}
852856

@@ -1094,10 +1098,15 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
10941098
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
10951099
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
10961100

1097-
pci_lock_rescan_remove();
1098-
pci_hp_remove_devices(bus);
1099-
pci_unlock_rescan_remove();
1101+
bus = eeh_pe_bus_get(pe);
1102+
if (bus)
1103+
pci_hp_remove_devices(bus);
1104+
else
1105+
pr_err("%s: PCI bus for PHB#%x-PE#%x disappeared\n",
1106+
__func__, pe->phb->global_number, pe->addr);
1107+
11001108
/* The passed PE should no longer be used */
1109+
pci_unlock_rescan_remove();
11011110
return;
11021111
}
11031112

@@ -1114,6 +1123,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
11141123
eeh_clear_slot_attention(edev->pdev);
11151124

11161125
eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
1126+
1127+
pci_unlock_rescan_remove();
11171128
}
11181129

11191130
/**
@@ -1132,6 +1143,7 @@ void eeh_handle_special_event(void)
11321143
unsigned long flags;
11331144
int rc;
11341145

1146+
pci_lock_rescan_remove();
11351147

11361148
do {
11371149
rc = eeh_ops->next_error(&pe);
@@ -1171,10 +1183,12 @@ void eeh_handle_special_event(void)
11711183

11721184
break;
11731185
case EEH_NEXT_ERR_NONE:
1186+
pci_unlock_rescan_remove();
11741187
return;
11751188
default:
11761189
pr_warn("%s: Invalid value %d from next_error()\n",
11771190
__func__, rc);
1191+
pci_unlock_rescan_remove();
11781192
return;
11791193
}
11801194

@@ -1186,7 +1200,9 @@ void eeh_handle_special_event(void)
11861200
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
11871201
rc == EEH_NEXT_ERR_FENCED_PHB) {
11881202
eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
1203+
pci_unlock_rescan_remove();
11891204
eeh_handle_normal_event(pe);
1205+
pci_lock_rescan_remove();
11901206
} else {
11911207
eeh_for_each_pe(pe, tmp_pe)
11921208
eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
@@ -1199,7 +1215,6 @@ void eeh_handle_special_event(void)
11991215
eeh_report_failure, NULL);
12001216
eeh_set_channel_state(pe, pci_channel_io_perm_failure);
12011217

1202-
pci_lock_rescan_remove();
12031218
list_for_each_entry(hose, &hose_list, list_node) {
12041219
phb_pe = eeh_phb_pe_get(hose);
12051220
if (!phb_pe ||
@@ -1218,7 +1233,6 @@ void eeh_handle_special_event(void)
12181233
}
12191234
pci_hp_remove_devices(bus);
12201235
}
1221-
pci_unlock_rescan_remove();
12221236
}
12231237

12241238
/*
@@ -1228,4 +1242,6 @@ void eeh_handle_special_event(void)
12281242
if (rc == EEH_NEXT_ERR_DEAD_IOC)
12291243
break;
12301244
} while (rc != EEH_NEXT_ERR_NONE);
1245+
1246+
pci_unlock_rescan_remove();
12311247
}

arch/powerpc/kernel/eeh_pe.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -671,10 +671,12 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
671671
eeh_ops->write_config(edev, cap + PCI_EXP_LNKCTL, 2, val);
672672

673673
/* Check link */
674-
if (!edev->pdev->link_active_reporting) {
675-
eeh_edev_dbg(edev, "No link reporting capability\n");
676-
msleep(1000);
677-
return;
674+
if (edev->pdev) {
675+
if (!edev->pdev->link_active_reporting) {
676+
eeh_edev_dbg(edev, "No link reporting capability\n");
677+
msleep(1000);
678+
return;
679+
}
678680
}
679681

680682
/* Wait the link is up until timeout (5s) */

0 commit comments

Comments
 (0)