Skip to content

Commit 53989fa

Browse files
committed
cxl/pmem: Fix module reload vs workqueue state
A test of the form: while true; do modprobe -r cxl_pmem; modprobe cxl_pmem; done May lead to a crash signature of the form: BUG: unable to handle page fault for address: ffffffffc0660030 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page [..] Workqueue: cxl_pmem 0xffffffffc0660030 RIP: 0010:0xffffffffc0660030 Code: Unable to access opcode bytes at RIP 0xffffffffc0660006. [..] Call Trace: ? process_one_work+0x4ec/0x9c0 ? pwq_dec_nr_in_flight+0x100/0x100 ? rwlock_bug.part.0+0x60/0x60 ? worker_thread+0x2eb/0x700 In that report the 0xffffffffc0660030 address corresponds to the former function address of cxl_nvb_update_state() from a previous load of the module, not the current address. Fix that by arranging for ->state_work in the 'struct cxl_nvdimm_bridge' object to be reinitialized on cxl_pmem module reload. Details: Recall that CXL subsystem wants to link a CXL memory expander device to an NVDIMM sub-hierarchy when both a persistent memory range has been registered by the CXL platform driver (cxl_acpi) *and* when that CXL memory expander has published persistent memory capacity (Get Partition Info). To this end the cxl_nvdimm_bridge driver arranges to rescan the CXL bus when either of those conditions change. The helper bus_rescan_devices() can not be called underneath the device_lock() for any device on that bus, so the cxl_nvdimm_bridge driver uses a workqueue for the rescan. Typically a driver allocates driver data to hold a 'struct work_struct' for a driven device, but for a workqueue that may run after ->remove() returns, driver data will have been freed. The 'struct cxl_nvdimm_bridge' object holds the state and work_struct directly. Unfortunately it was only arranging for that infrastructure to be initialized once per device creation rather than the necessary once per workqueue (cxl_pmem_wq) creation. Introduce is_cxl_nvdimm_bridge() and cxl_nvdimm_bridge_reset() in support of invalidating stale references to a recently destroyed cxl_pmem_wq. Cc: <[email protected]> Fixes: 8fdcb17 ("cxl/pmem: Add initial infrastructure for pmem support") Reported-by: Vishal Verma <[email protected]> Tested-by: Vishal Verma <[email protected]> Link: https://lore.kernel.org/r/163665474585.3505991.8397182770066720755.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams <[email protected]>
1 parent fd49f99 commit 53989fa

File tree

3 files changed

+42
-3
lines changed

3 files changed

+42
-3
lines changed

drivers/cxl/core/pmem.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,16 @@ struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev)
5151
}
5252
EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm_bridge, CXL);
5353

54-
__mock int match_nvdimm_bridge(struct device *dev, const void *data)
54+
bool is_cxl_nvdimm_bridge(struct device *dev)
5555
{
5656
return dev->type == &cxl_nvdimm_bridge_type;
5757
}
58+
EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm_bridge, CXL);
59+
60+
__mock int match_nvdimm_bridge(struct device *dev, const void *data)
61+
{
62+
return is_cxl_nvdimm_bridge(dev);
63+
}
5864

5965
struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd)
6066
{

drivers/cxl/cxl.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,13 @@ struct cxl_decoder {
196196
};
197197

198198

199+
/**
200+
* enum cxl_nvdimm_brige_state - state machine for managing bus rescans
201+
* @CXL_NVB_NEW: Set at bridge create and after cxl_pmem_wq is destroyed
202+
* @CXL_NVB_DEAD: Set at brige unregistration to preclude async probing
203+
* @CXL_NVB_ONLINE: Target state after successful ->probe()
204+
* @CXL_NVB_OFFLINE: Target state after ->remove() or failed ->probe()
205+
*/
199206
enum cxl_nvdimm_brige_state {
200207
CXL_NVB_NEW,
201208
CXL_NVB_DEAD,
@@ -308,6 +315,7 @@ struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host,
308315
struct cxl_port *port);
309316
struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
310317
bool is_cxl_nvdimm(struct device *dev);
318+
bool is_cxl_nvdimm_bridge(struct device *dev);
311319
int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd);
312320
struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd);
313321

drivers/cxl/pmem.c

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,31 @@ static struct cxl_driver cxl_nvdimm_bridge_driver = {
315315
.id = CXL_DEVICE_NVDIMM_BRIDGE,
316316
};
317317

318+
/*
319+
* Return all bridges to the CXL_NVB_NEW state to invalidate any
320+
* ->state_work referring to the now destroyed cxl_pmem_wq.
321+
*/
322+
static int cxl_nvdimm_bridge_reset(struct device *dev, void *data)
323+
{
324+
struct cxl_nvdimm_bridge *cxl_nvb;
325+
326+
if (!is_cxl_nvdimm_bridge(dev))
327+
return 0;
328+
329+
cxl_nvb = to_cxl_nvdimm_bridge(dev);
330+
device_lock(dev);
331+
cxl_nvb->state = CXL_NVB_NEW;
332+
device_unlock(dev);
333+
334+
return 0;
335+
}
336+
337+
static void destroy_cxl_pmem_wq(void)
338+
{
339+
destroy_workqueue(cxl_pmem_wq);
340+
bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_nvdimm_bridge_reset);
341+
}
342+
318343
static __init int cxl_pmem_init(void)
319344
{
320345
int rc;
@@ -340,15 +365,15 @@ static __init int cxl_pmem_init(void)
340365
err_nvdimm:
341366
cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
342367
err_bridge:
343-
destroy_workqueue(cxl_pmem_wq);
368+
destroy_cxl_pmem_wq();
344369
return rc;
345370
}
346371

347372
static __exit void cxl_pmem_exit(void)
348373
{
349374
cxl_driver_unregister(&cxl_nvdimm_driver);
350375
cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
351-
destroy_workqueue(cxl_pmem_wq);
376+
destroy_cxl_pmem_wq();
352377
}
353378

354379
MODULE_LICENSE("GPL v2");

0 commit comments

Comments
 (0)