Skip to content

Commit 28b70cd

Browse files
kwan-intc authored and jgunthorpe committed
IB/hfi1: Do not destroy hfi1_wq when the device is shut down
The workqueue hfi1_wq is destroyed in function shutdown_device(), which is called by either shutdown_one() or remove_one(). The function shutdown_one() is called when the kernel is rebooted while remove_one() is called when the hfi1 driver is unloaded. When the kernel is rebooted, hfi1_wq is destroyed while all qps are still active, leading to a kernel crash:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000102
  IP: [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0
  PGD 0
  Oops: 0000 [#1] SMP
  Modules linked in: dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE)
   ib_isert iscsi_target_mod target_core_mod ib_ucm mlx4_ib iTCO_wdt
   iTCO_vendor_support mxm_wmi sb_edac intel_powerclamp coretemp intel_rapl
   iosf_mbi kvm rpcrdma sunrpc irqbypass crc32_pclmul ghash_clmulni_intel
   rdma_ucm aesni_intel ib_uverbs lrw gf128mul opa_vnic glue_helper ablk_helper
   ib_iser cryptd ib_umad rdma_cm iw_cm ses enclosure libiscsi
   scsi_transport_sas pcspkr joydev ib_ipoib(OE) scsi_transport_iscsi ib_cm sg
   ipmi_ssif mei_me lpc_ich i2c_i801 mei ioatdma ipmi_si dm_multipath
   ipmi_devintf ipmi_msghandler wmi acpi_pad acpi_power_meter hangcheck_timer
   ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic
   mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm
   hfi1(OE) crct10dif_pclmul crct10dif_common crc32c_intel drm ahci mlx4_core
   libahci rdmavt(OE) igb megaraid_sas ib_core libata
   drm_panel_orientation_quirks ptp pps_core devlink dca i2c_algo_bit dm_mirror
   dm_region_hash dm_log dm_mod
  CPU: 19 PID: 0 Comm: swapper/19 Kdump: loaded Tainted: G OE ------------ 3.10.0-957.el7.x86_64 #1
  Hardware name: Phegda X2226A/S2600CW, BIOS SE5C610.86B.01.01.0024.021320181901 02/13/2018
  task: ffff8a799ba0d140 ti: ffff8a799bad8000 task.ti: ffff8a799bad8000
  RIP: 0010:[<ffffffff94cb7b02>] [<ffffffff94cb7b02>] __queue_work+0x32/0x3e0
  RSP: 0018:ffff8a90dde43d80 EFLAGS: 00010046
  RAX: 0000000000000082 RBX: 0000000000000086 RCX: 0000000000000000
  RDX: ffff8a90b924fcb8 RSI: 0000000000000000 RDI: 000000000000001b
  RBP: ffff8a90dde43db8 R08: ffff8a799ba0d6d8 R09: ffff8a90dde53900
  R10: 0000000000000002 R11: ffff8a90dde43de8 R12: ffff8a90b924fcb8
  R13: 000000000000001b R14: 0000000000000000 R15: ffff8a90d2890000
  FS: 0000000000000000(0000) GS:ffff8a90dde40000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000102 CR3: 0000001a70410000 CR4: 00000000001607e0
  Call Trace:
   [<ffffffff94cb8105>] queue_work_on+0x45/0x50
   [<ffffffffc03f781e>] _hfi1_schedule_send+0x6e/0xc0 [hfi1]
   [<ffffffffc03f78a2>] hfi1_schedule_send+0x32/0x70 [hfi1]
   [<ffffffffc02cf2d9>] rvt_rc_timeout+0xe9/0x130 [rdmavt]
   [<ffffffff94ce563a>] ? trigger_load_balance+0x6a/0x280
   [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt]
   [<ffffffff94ca7f58>] call_timer_fn+0x38/0x110
   [<ffffffffc02cf1f0>] ? rvt_free_qpn+0x40/0x40 [rdmavt]
   [<ffffffff94caa3bd>] run_timer_softirq+0x24d/0x300
   [<ffffffff94ca0f05>] __do_softirq+0xf5/0x280
   [<ffffffff9537832c>] call_softirq+0x1c/0x30
   [<ffffffff94c2e675>] do_softirq+0x65/0xa0
   [<ffffffff94ca1285>] irq_exit+0x105/0x110
   [<ffffffff953796c8>] smp_apic_timer_interrupt+0x48/0x60
   [<ffffffff95375df2>] apic_timer_interrupt+0x162/0x170
   <EOI>
   [<ffffffff951adfb7>] ? cpuidle_enter_state+0x57/0xd0
   [<ffffffff951ae10e>] cpuidle_idle_call+0xde/0x230
   [<ffffffff94c366de>] arch_cpu_idle+0xe/0xc0
   [<ffffffff94cfc3ba>] cpu_startup_entry+0x14a/0x1e0
   [<ffffffff94c57db7>] start_secondary+0x1f7/0x270
   [<ffffffff94c000d5>] start_cpu+0x5/0x14

The solution is to destroy the workqueue only when the hfi1 driver is unloaded, not when the device is shut down. In addition, when the device is shut down, no more work should be scheduled on the workqueues and the workqueues are flushed.

Fixes: 8d3e711 ("IB/{hfi1, qib}: Add handling of kernel restart")
Link: https://lore.kernel.org/r/[email protected]
Cc: <[email protected]>
Reviewed-by: Mike Marciniszyn <[email protected]>
Signed-off-by: Kaike Wan <[email protected]>
Signed-off-by: Dennis Dalessandro <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent f81b456 commit 28b70cd

File tree

3 files changed

+31
-6
lines changed

3 files changed

+31
-6
lines changed

drivers/infiniband/hw/hfi1/init.c

Lines changed: 23 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -830,6 +830,25 @@ static int create_workqueues(struct hfi1_devdata *dd)
830830
return -ENOMEM;
831831
}
832832

833+
/**
834+
* destroy_workqueues - destroy per port workqueues
835+
* @dd: the hfi1_ib device
836+
*/
837+
static void destroy_workqueues(struct hfi1_devdata *dd)
838+
{
839+
int pidx;
840+
struct hfi1_pportdata *ppd;
841+
842+
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
843+
ppd = dd->pport + pidx;
844+
845+
if (ppd->hfi1_wq) {
846+
destroy_workqueue(ppd->hfi1_wq);
847+
ppd->hfi1_wq = NULL;
848+
}
849+
}
850+
}
851+
833852
/**
834853
* enable_general_intr() - Enable the IRQs that will be handled by the
835854
* general interrupt handler.
@@ -1104,11 +1123,10 @@ static void shutdown_device(struct hfi1_devdata *dd)
11041123
*/
11051124
hfi1_quiet_serdes(ppd);
11061125

1107-
if (ppd->hfi1_wq) {
1108-
destroy_workqueue(ppd->hfi1_wq);
1109-
ppd->hfi1_wq = NULL;
1110-
}
1126+
if (ppd->hfi1_wq)
1127+
flush_workqueue(ppd->hfi1_wq);
11111128
if (ppd->link_wq) {
1129+
flush_workqueue(ppd->link_wq);
11121130
destroy_workqueue(ppd->link_wq);
11131131
ppd->link_wq = NULL;
11141132
}
@@ -1756,6 +1774,7 @@ static void remove_one(struct pci_dev *pdev)
17561774
* clear dma engines, etc.
17571775
*/
17581776
shutdown_device(dd);
1777+
destroy_workqueues(dd);
17591778

17601779
stop_timers(dd);
17611780

drivers/infiniband/hw/hfi1/qp.c

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -367,7 +367,10 @@ bool _hfi1_schedule_send(struct rvt_qp *qp)
367367
struct hfi1_ibport *ibp =
368368
to_iport(qp->ibqp.device, qp->port_num);
369369
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
370-
struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
370+
struct hfi1_devdata *dd = ppd->dd;
371+
372+
if (dd->flags & HFI1_SHUTDOWN)
373+
return true;
371374

372375
return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
373376
priv->s_sde ?

drivers/infiniband/hw/hfi1/tid_rdma.c

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5406,7 +5406,10 @@ static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
54065406
struct hfi1_ibport *ibp =
54075407
to_iport(qp->ibqp.device, qp->port_num);
54085408
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
5409-
struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
5409+
struct hfi1_devdata *dd = ppd->dd;
5410+
5411+
if ((dd->flags & HFI1_SHUTDOWN))
5412+
return true;
54105413

54115414
return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
54125415
priv->s_sde ?

0 commit comments

Comments
 (0)