
Commit cf8741a

djbw authored and rafaeljw committed
ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device
Memory that has been tagged EFI_MEMORY_SP, and has performance properties
described by the ACPI HMAT, is expected to have an application-specific
consumer. Those consumers may want 100% of the memory capacity to be
reserved from any usage by the kernel. By default, with this enabling, a
platform device is created to represent this differentiated resource.

The device-dax "hmem" driver claims these devices by default and provides
an mmap interface for the target application. If the administrator
prefers, the hmem resource range can be made available to the core-mm via
the device-dax hotplug facility, kmem, to online the memory with its own
numa node.

This was tested with an emulated HMAT produced by qemu (with the pending
HMAT enabling patches), and "efi_fake_mem=8G@9G:0x40000" on the kernel
command line to mark the memory ranges associated with node2 and node3 as
EFI_MEMORY_SP.

qemu numa configuration options:

-numa node,mem=4G,cpus=0-19,nodeid=0
-numa node,mem=4G,cpus=20-39,nodeid=1
-numa node,mem=4G,nodeid=2
-numa node,mem=4G,nodeid=3
-numa dist,src=0,dst=0,val=10
-numa dist,src=0,dst=1,val=21
-numa dist,src=0,dst=2,val=21
-numa dist,src=0,dst=3,val=21
-numa dist,src=1,dst=0,val=21
-numa dist,src=1,dst=1,val=10
-numa dist,src=1,dst=2,val=21
-numa dist,src=1,dst=3,val=21
-numa dist,src=2,dst=0,val=21
-numa dist,src=2,dst=1,val=21
-numa dist,src=2,dst=2,val=10
-numa dist,src=2,dst=3,val=21
-numa dist,src=3,dst=0,val=21
-numa dist,src=3,dst=1,val=21
-numa dist,src=3,dst=2,val=21
-numa dist,src=3,dst=3,val=10
-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5
-numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5
-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10
-numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10
-numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15
-numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15
-numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20
-numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20
-numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10
-numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10
-numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5
-numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5
-numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15
-numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15
-numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20
-numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20

Result:

[
  {
    "path":"\/platform\/hmem.1",
    "id":1,
    "size":"4.00 GiB (4.29 GB)",
    "align":2097152,
    "devices":[
      {
        "chardev":"dax1.0",
        "size":"4.00 GiB (4.29 GB)"
      }
    ]
  },
  {
    "path":"\/platform\/hmem.0",
    "id":0,
    "size":"4.00 GiB (4.29 GB)",
    "align":2097152,
    "devices":[
      {
        "chardev":"dax0.0",
        "size":"4.00 GiB (4.29 GB)"
      }
    ]
  }
]

[..]
240000000-43fffffff : Soft Reserved
  240000000-33fffffff : hmem.0
    240000000-33fffffff : dax0.0
  340000000-43fffffff : hmem.1
    340000000-43fffffff : dax1.0

Reviewed-by: Dave Hansen <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
Acked-by: Thomas Gleixner <[email protected]>
Signed-off-by: Rafael J. Wysocki <[email protected]>
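As a rough sketch of the mmap path described above (illustrative only, not part of this change): the device name /dev/dax0.0 and the 2 MiB mapping length below are taken from the listing above; a real consumer should query the device's actual size and alignment before mapping.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL << 20;	/* one chunk at the device's reported 2097152-byte alignment */
	void *p;
	int fd;

	fd = open("/dev/dax0.0", O_RDWR);
	if (fd < 0) {
		perror("open /dev/dax0.0");
		return 1;
	}

	/* device-dax requires the length and offset to honor the device alignment */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	memset(p, 0, len);	/* application-specific use of the soft-reserved range */
	munmap(p, len);
	close(fd);
	return 0;
}

For the alternative kmem path mentioned above, the range is typically handed to the page allocator from userspace via the daxctl tooling, for example "daxctl reconfigure-device --mode=system-ram dax0.0" with recent ndctl releases; the exact syntax depends on the tool version.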
1 parent 0f847f8 commit cf8741a

2 files changed: +125, -12 lines


drivers/acpi/numa/Kconfig

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ config ACPI_HMAT
 	bool "ACPI Heterogeneous Memory Attribute Table Support"
 	depends on ACPI_NUMA
 	select HMEM_REPORTING
+	select MEMREGION
 	help
 	  If set, this option has the kernel parse and report the
 	  platform's ACPI HMAT (Heterogeneous Memory Attributes Table),

drivers/acpi/numa/hmat.c

Lines changed: 124 additions & 12 deletions
@@ -8,12 +8,18 @@
  * the applicable attributes with the node's interfaces.
  */
 
+#define pr_fmt(fmt) "acpi/hmat: " fmt
+#define dev_fmt(fmt) "acpi/hmat: " fmt
+
 #include <linux/acpi.h>
 #include <linux/bitops.h>
 #include <linux/device.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/platform_device.h>
 #include <linux/list_sort.h>
+#include <linux/memregion.h>
 #include <linux/memory.h>
 #include <linux/mutex.h>
 #include <linux/node.h>
@@ -49,6 +55,7 @@ struct memory_target {
 	struct list_head node;
 	unsigned int memory_pxm;
 	unsigned int processor_pxm;
+	struct resource memregions;
 	struct node_hmem_attrs hmem_attrs;
 	struct list_head caches;
 	struct node_cache_attrs cache_attrs;
@@ -104,22 +111,36 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm)
 	list_add_tail(&initiator->node, &initiators);
 }
 
-static __init void alloc_memory_target(unsigned int mem_pxm)
+static __init void alloc_memory_target(unsigned int mem_pxm,
+		resource_size_t start, resource_size_t len)
 {
 	struct memory_target *target;
 
 	target = find_mem_target(mem_pxm);
-	if (target)
-		return;
-
-	target = kzalloc(sizeof(*target), GFP_KERNEL);
-	if (!target)
-		return;
+	if (!target) {
+		target = kzalloc(sizeof(*target), GFP_KERNEL);
+		if (!target)
+			return;
+		target->memory_pxm = mem_pxm;
+		target->processor_pxm = PXM_INVAL;
+		target->memregions = (struct resource) {
+			.name = "ACPI mem",
+			.start = 0,
+			.end = -1,
+			.flags = IORESOURCE_MEM,
+		};
+		list_add_tail(&target->node, &targets);
+		INIT_LIST_HEAD(&target->caches);
+	}
 
-	target->memory_pxm = mem_pxm;
-	target->processor_pxm = PXM_INVAL;
-	list_add_tail(&target->node, &targets);
-	INIT_LIST_HEAD(&target->caches);
+	/*
+	 * There are potentially multiple ranges per PXM, so record each
+	 * in the per-target memregions resource tree.
+	 */
+	if (!__request_region(&target->memregions, start, len, "memory target",
+				IORESOURCE_MEM))
+		pr_warn("failed to reserve %#llx - %#llx in pxm: %d\n",
+				start, start + len, mem_pxm);
 }
 
 static __init const char *hmat_data_type(u8 type)
@@ -452,7 +473,7 @@ static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header,
 		return -EINVAL;
 	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
 		return 0;
-	alloc_memory_target(ma->proximity_domain);
+	alloc_memory_target(ma->proximity_domain, ma->base_address, ma->length);
 	return 0;
 }
 
@@ -613,10 +634,91 @@ static void hmat_register_target_perf(struct memory_target *target)
 	node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0);
 }
 
+static void hmat_register_target_device(struct memory_target *target,
+		struct resource *r)
+{
+	/* define a clean / non-busy resource for the platform device */
+	struct resource res = {
+		.start = r->start,
+		.end = r->end,
+		.flags = IORESOURCE_MEM,
+	};
+	struct platform_device *pdev;
+	struct memregion_info info;
+	int rc, id;
+
+	rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM,
+			IORES_DESC_SOFT_RESERVED);
+	if (rc != REGION_INTERSECTS)
+		return;
+
+	id = memregion_alloc(GFP_KERNEL);
+	if (id < 0) {
+		pr_err("memregion allocation failure for %pr\n", &res);
+		return;
+	}
+
+	pdev = platform_device_alloc("hmem", id);
+	if (!pdev) {
+		pr_err("hmem device allocation failure for %pr\n", &res);
+		goto out_pdev;
+	}
+
+	pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm);
+	info = (struct memregion_info) {
+		.target_node = acpi_map_pxm_to_node(target->memory_pxm),
+	};
+	rc = platform_device_add_data(pdev, &info, sizeof(info));
+	if (rc < 0) {
+		pr_err("hmem memregion_info allocation failure for %pr\n", &res);
+		goto out_pdev;
+	}
+
+	rc = platform_device_add_resources(pdev, &res, 1);
+	if (rc < 0) {
+		pr_err("hmem resource allocation failure for %pr\n", &res);
+		goto out_resource;
+	}
+
+	rc = platform_device_add(pdev);
+	if (rc < 0) {
+		dev_err(&pdev->dev, "device add failed for %pr\n", &res);
+		goto out_resource;
+	}
+
+	return;
+
+out_resource:
+	put_device(&pdev->dev);
+out_pdev:
+	memregion_free(id);
+}
+
+static __init void hmat_register_target_devices(struct memory_target *target)
+{
+	struct resource *res;
+
+	/*
+	 * Do not bother creating devices if no driver is available to
+	 * consume them.
+	 */
+	if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM))
+		return;
+
+	for (res = target->memregions.child; res; res = res->sibling)
+		hmat_register_target_device(target, res);
+}
+
 static void hmat_register_target(struct memory_target *target)
 {
 	int nid = pxm_to_node(target->memory_pxm);
 
+	/*
+	 * Devices may belong to either an offline or online
+	 * node, so unconditionally add them.
+	 */
+	hmat_register_target_devices(target);
+
 	/*
 	 * Skip offline nodes. This can happen when memory
 	 * marked EFI_MEMORY_SP, "specific purpose", is applied
@@ -677,11 +779,21 @@ static __init void hmat_free_structures(void)
 	struct target_cache *tcache, *cnext;
 
 	list_for_each_entry_safe(target, tnext, &targets, node) {
+		struct resource *res, *res_next;
+
 		list_for_each_entry_safe(tcache, cnext, &target->caches, node) {
 			list_del(&tcache->node);
			kfree(tcache);
 		}
+
 		list_del(&target->node);
+		res = target->memregions.child;
+		while (res) {
+			res_next = res->sibling;
+			__release_region(&target->memregions, res->start,
+					resource_size(res));
+			res = res_next;
+		}
 		kfree(target);
 	}
 
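For reference, the platform devices created by hmat_register_target_device() above are named "hmem" and carry a struct memregion_info as platform data plus a single IORESOURCE_MEM resource. A minimal, hypothetical consumer (a sketch only, not the actual drivers/dax/hmem.c driver that claims these devices by default) would bind to them roughly as follows:

#include <linux/device.h>
#include <linux/memregion.h>
#include <linux/module.h>
#include <linux/platform_device.h>

static int hmem_example_probe(struct platform_device *pdev)
{
	/* platform data set via platform_device_add_data() in the patch above */
	struct memregion_info *info = dev_get_platdata(&pdev->dev);
	/* the single memory resource added via platform_device_add_resources() */
	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);

	if (!info || !res)
		return -ENXIO;

	dev_info(&pdev->dev, "soft-reserved range %pR, target node %d\n",
			res, info->target_node);
	return 0;
}

static struct platform_driver hmem_example_driver = {
	.probe = hmem_example_probe,
	.driver = {
		/* matches platform_device_alloc("hmem", id) above */
		.name = "hmem",
	},
};
module_platform_driver(hmem_example_driver);
MODULE_LICENSE("GPL v2");

The real device-dax driver additionally publishes a dax character device over the resource; the sketch only shows the device/driver matching and the data handed over by the HMAT code.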
