Skip to content

Commit a2a2a6f

Browse files
madscientist159maddy-kerneldev
authored and committed
PCI: pnv_php: Fix surprise plug detection and recovery
The existing PowerNV hotplug code did not handle surprise plug events correctly, leading to a complete failure of the hotplug system after device removal and a required reboot to detect new devices. This comes down to two issues: 1) When a device is surprise removed, often the bridge upstream port will cause a PE freeze on the PHB. If this freeze is not cleared, the MSI interrupts from the bridge hotplug notification logic will not be received by the kernel, stalling all plug events on all slots associated with the PE. 2) When a device is removed from a slot, regardless of surprise or programmatic removal, the associated PHB/PE is left frozen. If this freeze is not cleared via a fundamental reset, skiboot is unable to clear the freeze and cannot retrain / rescan the slot. This also requires a reboot to clear the freeze and redetect the device in the slot. Issue the appropriate unfreeze and rescan commands on hotplug events, and don't oops on hotplug if pci_bus_to_OF_node() returns NULL. Signed-off-by: Timothy Pearson <[email protected]> [bhelgaas: tidy comments] Signed-off-by: Bjorn Helgaas <[email protected]> Signed-off-by: Madhavan Srinivasan <[email protected]> Link: https://patch.msgid.link/171044224.1359864.1752615546988.JavaMail.zimbra@raptorengineeringinc.com
1 parent 1010b4c commit a2a2a6f

File tree

2 files changed

+110
-3
lines changed

2 files changed

+110
-3
lines changed

arch/powerpc/kernel/pci-hotplug.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ void pci_hp_add_devices(struct pci_bus *bus)
141141
struct pci_controller *phb;
142142
struct device_node *dn = pci_bus_to_OF_node(bus);
143143

144+
if (!dn)
145+
return;
146+
144147
phb = pci_bus_to_host(bus);
145148

146149
mode = PCI_PROBE_NORMAL;

drivers/pci/hotplug/pnv_php.c

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
*
55
* Copyright Gavin Shan, IBM Corporation 2016.
66
* Copyright (C) 2025 Raptor Engineering, LLC
7+
* Copyright (C) 2025 Raptor Computing Systems, LLC
78
*/
89

910
#include <linux/bitfield.h>
1011
#include <linux/libfdt.h>
1112
#include <linux/module.h>
1213
#include <linux/pci.h>
14+
#include <linux/delay.h>
1315
#include <linux/pci_hotplug.h>
1416
#include <linux/of_fdt.h>
1517

@@ -469,6 +471,61 @@ static int pnv_php_set_attention_state(struct hotplug_slot *slot, u8 state)
469471
return 0;
470472
}
471473

474+
static int pnv_php_activate_slot(struct pnv_php_slot *php_slot,
475+
struct hotplug_slot *slot)
476+
{
477+
int ret, i;
478+
479+
/*
480+
* Issue initial slot activation command to firmware
481+
*
482+
* Firmware will power slot on, attempt to train the link, and
483+
* discover any downstream devices. If this process fails, firmware
484+
* will return an error code and an invalid device tree. Failure
485+
* can be caused for multiple reasons, including a faulty
486+
* downstream device, poor connection to the downstream device, or
487+
* a previously latched PHB fence. On failure, issue fundamental
488+
* reset up to three times before aborting.
489+
*/
490+
ret = pnv_php_set_slot_power_state(slot, OPAL_PCI_SLOT_POWER_ON);
491+
if (ret) {
492+
SLOT_WARN(
493+
php_slot,
494+
"PCI slot activation failed with error code %d, possible frozen PHB",
495+
ret);
496+
SLOT_WARN(
497+
php_slot,
498+
"Attempting complete PHB reset before retrying slot activation\n");
499+
for (i = 0; i < 3; i++) {
500+
/*
501+
* Slot activation failed, PHB may be fenced from a
502+
* prior device failure.
503+
*
504+
* Use the OPAL fundamental reset call to both try a
505+
* device reset and clear any potentially active PHB
506+
* fence / freeze.
507+
*/
508+
SLOT_WARN(php_slot, "Try %d...\n", i + 1);
509+
pci_set_pcie_reset_state(php_slot->pdev,
510+
pcie_warm_reset);
511+
msleep(250);
512+
pci_set_pcie_reset_state(php_slot->pdev,
513+
pcie_deassert_reset);
514+
515+
ret = pnv_php_set_slot_power_state(
516+
slot, OPAL_PCI_SLOT_POWER_ON);
517+
if (!ret)
518+
break;
519+
}
520+
521+
if (i >= 3)
522+
SLOT_WARN(php_slot,
523+
"Failed to bring slot online, aborting!\n");
524+
}
525+
526+
return ret;
527+
}
528+
472529
static int pnv_php_enable(struct pnv_php_slot *php_slot, bool rescan)
473530
{
474531
struct hotplug_slot *slot = &php_slot->slot;
@@ -531,7 +588,7 @@ static int pnv_php_enable(struct pnv_php_slot *php_slot, bool rescan)
531588
goto scan;
532589

533590
/* Power is off, turn it on and then scan the slot */
534-
ret = pnv_php_set_slot_power_state(slot, OPAL_PCI_SLOT_POWER_ON);
591+
ret = pnv_php_activate_slot(php_slot, slot);
535592
if (ret)
536593
return ret;
537594

@@ -838,16 +895,63 @@ static int pnv_php_enable_msix(struct pnv_php_slot *php_slot)
838895
return entry.vector;
839896
}
840897

898+
static void
899+
pnv_php_detect_clear_suprise_removal_freeze(struct pnv_php_slot *php_slot)
900+
{
901+
struct pci_dev *pdev = php_slot->pdev;
902+
struct eeh_dev *edev;
903+
struct eeh_pe *pe;
904+
int i, rc;
905+
906+
/*
907+
* When a device is surprise removed from a downstream bridge slot,
908+
* the upstream bridge port can still end up frozen due to related EEH
909+
* events, which will in turn block the MSI interrupts for slot hotplug
910+
* detection.
911+
*
912+
* Detect and thaw any frozen upstream PE after slot deactivation.
913+
*/
914+
edev = pci_dev_to_eeh_dev(pdev);
915+
pe = edev ? edev->pe : NULL;
916+
rc = eeh_pe_get_state(pe);
917+
if ((rc == -ENODEV) || (rc == -ENOENT)) {
918+
SLOT_WARN(
919+
php_slot,
920+
"Upstream bridge PE state unknown, hotplug detect may fail\n");
921+
} else {
922+
if (pe->state & EEH_PE_ISOLATED) {
923+
SLOT_WARN(
924+
php_slot,
925+
"Upstream bridge PE %02x frozen, thawing...\n",
926+
pe->addr);
927+
for (i = 0; i < 3; i++)
928+
if (!eeh_unfreeze_pe(pe))
929+
break;
930+
if (i >= 3)
931+
SLOT_WARN(
932+
php_slot,
933+
"Unable to thaw PE %02x, hotplug detect will fail!\n",
934+
pe->addr);
935+
else
936+
SLOT_WARN(php_slot,
937+
"PE %02x thawed successfully\n",
938+
pe->addr);
939+
}
940+
}
941+
}
942+
841943
static void pnv_php_event_handler(struct work_struct *work)
842944
{
843945
struct pnv_php_event *event =
844946
container_of(work, struct pnv_php_event, work);
845947
struct pnv_php_slot *php_slot = event->php_slot;
846948

847-
if (event->added)
949+
if (event->added) {
848950
pnv_php_enable_slot(&php_slot->slot);
849-
else
951+
} else {
850952
pnv_php_disable_slot(&php_slot->slot);
953+
pnv_php_detect_clear_suprise_removal_freeze(php_slot);
954+
}
851955

852956
kfree(event);
853957
}

0 commit comments

Comments
 (0)