Skip to content

Commit e9ee9fe

Browse files
committed
dax: Assign RAM regions to memory-hotplug by default
The default mode for device-dax instances is backwards for RAM-regions as evidenced by the fact that it tends to catch end users by surprise. "Where is my memory?". Recall that platforms are increasingly shipping with performance-differentiated memory pools beyond typical DRAM and NUMA effects. This includes HBM (high-bandwidth-memory) and CXL (dynamic interleave, varied media types, and future fabric attached possibilities). For this reason the EFI_MEMORY_SP (EFI Special Purpose Memory => Linux 'Soft Reserved') attribute is expected to be applied to all memory-pools that are not the general purpose pool. This designation gives an Operating System a chance to defer usage of a memory pool until later in the boot process where its performance properties can be interrogated and administrator policy can be applied. 'Soft Reserved' memory can be anything from too limited and precious to be part of the general purpose pool (HBM), too slow to host hot kernel data structures (some PMEM media), or anything in between. However, in the absence of an explicit policy, the memory should at least be made usable by default. The current device-dax default hides all non-general-purpose memory behind a device interface. The expectation is that the distribution of users that want the memory online by default vs device-dedicated-access by default follows the Pareto principle. A small number of enlightened users may want to do userspace memory management through a device, but general users just want the kernel to make the memory available with an option to get more advanced later. Arrange for all device-dax instances not backed by PMEM to default to attaching to the dax_kmem driver. From there the baseline memory hotplug policy (CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE / memhp_default_state=) gates whether the memory comes online or stays offline. If it stays offline, it can be reliably converted back to device-mode, where it can be partitioned or fronted by a userspace allocator. 
So, if someone wants device-dax instances for their 'Soft Reserved' memory: 1/ Build a kernel with CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=n or boot with memhp_default_state=offline, or roll the dice and hope that the kernel has not pinned a page in that memory before step 2. 2/ Write a udev rule to convert the target dax device(s) from 'system-ram' mode to 'devdax' mode: daxctl reconfigure-device $dax -m devdax -f Cc: Michal Hocko <[email protected]> Cc: David Hildenbrand <[email protected]> Cc: Dave Hansen <[email protected]> Reviewed-by: Gregory Price <[email protected]> Tested-by: Fan Ni <[email protected]> Reviewed-by: Dave Jiang <[email protected]> Link: https://lore.kernel.org/r/167602003336.1924368.6809503401422267885.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams <[email protected]>
1 parent 7dab174 commit e9ee9fe

File tree

6 files changed

+46
-37
lines changed

6 files changed

+46
-37
lines changed

drivers/dax/Kconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ config DEV_DAX_HMEM_DEVICES
5050
def_bool y
5151

5252
config DEV_DAX_KMEM
53-
tristate "KMEM DAX: volatile-use of persistent memory"
53+
tristate "KMEM DAX: map dax-devices as System-RAM"
5454
default DEV_DAX
5555
depends on DEV_DAX
5656
depends on MEMORY_HOTPLUG # for add_memory() and friends

drivers/dax/bus.c

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,25 @@ static int dax_match_id(struct dax_device_driver *dax_drv, struct device *dev)
5656
return match;
5757
}
5858

59+
/*
 * dax_match_type - match a dax driver to a device by driver type
 * @dax_drv: candidate driver
 * @dev: device being matched; converted with to_dev_dax()
 *
 * The wanted type is DAXDRV_KMEM_TYPE when the device's region has
 * IORESOURCE_DAX_KMEM set in its resource flags, and DAXDRV_DEVICE_TYPE
 * otherwise.
 *
 * Returns 1 when @dax_drv is of the wanted type, or when @dax_drv is the
 * device-type driver and CONFIG_DEV_DAX_KMEM is not enabled. Returns 0 in
 * every other case.
 */
static int dax_match_type(struct dax_device_driver *dax_drv, struct device *dev)
60+
{
61+
enum dax_driver_type type = DAXDRV_DEVICE_TYPE;
62+
struct dev_dax *dev_dax = to_dev_dax(dev);
63+
64+
/* regions flagged for kmem want the kmem-type driver instead */
if (dev_dax->region->res.flags & IORESOURCE_DAX_KMEM)
65+
type = DAXDRV_KMEM_TYPE;
66+
67+
if (dax_drv->type == type)
68+
return 1;
69+
70+
/* default to device mode if dax_kmem is disabled */
71+
if (dax_drv->type == DAXDRV_DEVICE_TYPE &&
72+
!IS_ENABLED(CONFIG_DEV_DAX_KMEM))
73+
return 1;
74+
75+
return 0;
76+
}
77+
5978
enum id_action {
6079
ID_REMOVE,
6180
ID_ADD,
@@ -216,14 +235,9 @@ static int dax_bus_match(struct device *dev, struct device_driver *drv)
216235
{
217236
struct dax_device_driver *dax_drv = to_dax_drv(drv);
218237

219-
/*
220-
* All but the 'device-dax' driver, which has 'match_always'
221-
* set, requires an exact id match.
222-
*/
223-
if (dax_drv->match_always)
238+
if (dax_match_id(dax_drv, dev))
224239
return 1;
225-
226-
return dax_match_id(dax_drv, dev);
240+
return dax_match_type(dax_drv, dev);
227241
}
228242

229243
/*
@@ -1413,13 +1427,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
14131427
}
14141428
EXPORT_SYMBOL_GPL(devm_create_dev_dax);
14151429

1416-
static int match_always_count;
1417-
14181430
int __dax_driver_register(struct dax_device_driver *dax_drv,
14191431
struct module *module, const char *mod_name)
14201432
{
14211433
struct device_driver *drv = &dax_drv->drv;
1422-
int rc = 0;
14231434

14241435
/*
14251436
* dax_bus_probe() calls dax_drv->probe() unconditionally.
@@ -1434,26 +1445,7 @@ int __dax_driver_register(struct dax_device_driver *dax_drv,
14341445
drv->mod_name = mod_name;
14351446
drv->bus = &dax_bus_type;
14361447

1437-
/* there can only be one default driver */
1438-
mutex_lock(&dax_bus_lock);
1439-
match_always_count += dax_drv->match_always;
1440-
if (match_always_count > 1) {
1441-
match_always_count--;
1442-
WARN_ON(1);
1443-
rc = -EINVAL;
1444-
}
1445-
mutex_unlock(&dax_bus_lock);
1446-
if (rc)
1447-
return rc;
1448-
1449-
rc = driver_register(drv);
1450-
if (rc && dax_drv->match_always) {
1451-
mutex_lock(&dax_bus_lock);
1452-
match_always_count -= dax_drv->match_always;
1453-
mutex_unlock(&dax_bus_lock);
1454-
}
1455-
1456-
return rc;
1448+
return driver_register(drv);
14571449
}
14581450
EXPORT_SYMBOL_GPL(__dax_driver_register);
14591451

@@ -1463,7 +1455,6 @@ void dax_driver_unregister(struct dax_device_driver *dax_drv)
14631455
struct dax_id *dax_id, *_id;
14641456

14651457
mutex_lock(&dax_bus_lock);
1466-
match_always_count -= dax_drv->match_always;
14671458
list_for_each_entry_safe(dax_id, _id, &dax_drv->ids, list) {
14681459
list_del(&dax_id->list);
14691460
kfree(dax_id);

drivers/dax/bus.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@ struct dax_device;
1111
struct dax_region;
1212
void dax_region_put(struct dax_region *dax_region);
1313

14-
#define IORESOURCE_DAX_STATIC (1UL << 0)
14+
/* dax bus specific ioresource flags */
15+
#define IORESOURCE_DAX_STATIC BIT(0)
16+
#define IORESOURCE_DAX_KMEM BIT(1)
17+
1518
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
1619
struct range *range, int target_node, unsigned int align,
1720
unsigned long flags);
@@ -25,10 +28,15 @@ struct dev_dax_data {
2528

2629
struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
2730

31+
/*
 * Driver classification stored in dax_device_driver.type.
 * dax_match_type() compares it against the type it derives from the
 * device's region flags.
 */
enum dax_driver_type {
32+
/* set by device_dax_kmem_driver (kmem.c) */
DAXDRV_KMEM_TYPE,
33+
/* set by device_dax_driver (device.c) */
DAXDRV_DEVICE_TYPE,
34+
};
35+
2836
struct dax_device_driver {
2937
struct device_driver drv;
3038
struct list_head ids;
31-
int match_always;
39+
enum dax_driver_type type;
3240
int (*probe)(struct dev_dax *dev);
3341
void (*remove)(struct dev_dax *dev);
3442
};

drivers/dax/device.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -475,8 +475,7 @@ EXPORT_SYMBOL_GPL(dev_dax_probe);
475475

476476
static struct dax_device_driver device_dax_driver = {
477477
.probe = dev_dax_probe,
478-
/* all probe actions are unwound by devm, so .remove isn't necessary */
479-
.match_always = 1,
478+
.type = DAXDRV_DEVICE_TYPE,
480479
};
481480

482481
static int __init dax_init(void)

drivers/dax/hmem/hmem.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,25 @@ module_param_named(region_idle, region_idle, bool, 0644);
1111

1212
static int dax_hmem_probe(struct platform_device *pdev)
1313
{
14+
unsigned long flags = IORESOURCE_DAX_KMEM;
1415
struct device *dev = &pdev->dev;
1516
struct dax_region *dax_region;
1617
struct memregion_info *mri;
1718
struct dev_dax_data data;
1819
struct dev_dax *dev_dax;
1920

21+
/*
22+
* @region_idle == true indicates that an administrative agent
23+
* wants to manipulate the range partitioning before the devices
24+
* are created, so do not send them to the dax_kmem driver by
25+
* default.
26+
*/
27+
if (region_idle)
28+
flags = 0;
29+
2030
mri = dev->platform_data;
2131
dax_region = alloc_dax_region(dev, pdev->id, &mri->range,
22-
mri->target_node, PMD_SIZE, 0);
32+
mri->target_node, PMD_SIZE, flags);
2333
if (!dax_region)
2434
return -ENOMEM;
2535

drivers/dax/kmem.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
239239
/*
 * dax bus driver description for dax_kmem: supplies probe and remove
 * callbacks and declares itself as the kmem-type driver.
 */
static struct dax_device_driver device_dax_kmem_driver = {
240240
.probe = dev_dax_kmem_probe,
241241
.remove = dev_dax_kmem_remove,
242+
/* lets dax_match_type() pick this driver for IORESOURCE_DAX_KMEM regions */
.type = DAXDRV_KMEM_TYPE,
242243
};
243244

244245
static int __init dax_kmem_init(void)

0 commit comments

Comments
 (0)