Skip to content

Commit 5d30f92

Browse files
committed
x86/NUMA: Provide a range-to-target_node lookup facility
The DEV_DAX_KMEM facility is a generic mechanism to allow device-dax instances, fronting performance-differentiated-memory like pmem, to be added to the System RAM pool. The NUMA node for that hot-added memory is derived from the device-dax instance's 'target_node' attribute.

Recall that the 'target_node' is the ACPI-PXM-to-node translation for memory when it comes online, whereas the 'numa_node' attribute of the device represents the closest online CPU node.

Presently, useful target_node information from the ACPI SRAT is discarded with the expectation that "Reserved" memory will never be onlined. Now that DEV_DAX_KMEM violates that assumption, there is a need to retain the translation. Move, rather than discard, numa_memblk data to a secondary array that phys_to_target_node() may consider at a later point in time.

Cc: Dave Hansen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Michal Hocko <[email protected]>
Reported-by: kbuild test robot <[email protected]>
Reviewed-by: Ingo Molnar <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
Reviewed-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/r/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com
1 parent 1e5d8e1 commit 5d30f92

File tree

2 files changed

+64
-11
lines changed

2 files changed

+64
-11
lines changed

arch/x86/mm/numa.c

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
2626
EXPORT_SYMBOL(node_data);
2727

2828
static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
29+
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
2930

3031
static int numa_distance_cnt;
3132
static u8 *numa_distance;
@@ -164,6 +165,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
164165
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
165166
}
166167

168+
/**
169+
* numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
170+
* @dst: numa_meminfo to append block to
171+
* @idx: Index of memblk to remove
172+
* @src: numa_meminfo to remove memblk from
173+
*/
174+
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
175+
struct numa_meminfo *src)
176+
{
177+
dst->blk[dst->nr_blks++] = src->blk[idx];
178+
numa_remove_memblk_from(idx, src);
179+
}
180+
167181
/**
168182
* numa_add_memblk - Add one numa_memblk to numa_meminfo
169183
* @nid: NUMA node ID of the new memblk
@@ -233,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
233247
for (i = 0; i < mi->nr_blks; i++) {
234248
struct numa_memblk *bi = &mi->blk[i];
235249

236-
/* make sure all blocks are inside the limits */
250+
/* move / save reserved memory ranges */
251+
if (!memblock_overlaps_region(&memblock.memory,
252+
bi->start, bi->end - bi->start)) {
253+
numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
254+
continue;
255+
}
256+
257+
/* make sure all non-reserved blocks are inside the limits */
237258
bi->start = max(bi->start, low);
238259
bi->end = min(bi->end, high);
239260

240-
/* and there's no empty or non-exist block */
241-
if (bi->start >= bi->end ||
242-
!memblock_overlaps_region(&memblock.memory,
243-
bi->start, bi->end - bi->start))
261+
/* and there's no empty block */
262+
if (bi->start >= bi->end)
244263
numa_remove_memblk_from(i--, mi);
245264
}
246265

@@ -877,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node);
877896

878897
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
879898

880-
#ifdef CONFIG_MEMORY_HOTPLUG
881-
int memory_add_physaddr_to_nid(u64 start)
899+
#ifdef CONFIG_NUMA_KEEP_MEMINFO
900+
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
882901
{
883-
struct numa_meminfo *mi = &numa_meminfo;
884-
int nid = mi->blk[0].nid;
885902
int i;
886903

887904
for (i = 0; i < mi->nr_blks; i++)
888905
if (mi->blk[i].start <= start && mi->blk[i].end > start)
889-
nid = mi->blk[i].nid;
906+
return mi->blk[i].nid;
907+
return NUMA_NO_NODE;
908+
}
909+
910+
int phys_to_target_node(phys_addr_t start)
911+
{
912+
int nid = meminfo_to_nid(&numa_meminfo, start);
913+
914+
/*
915+
* Prefer online nodes, but if reserved memory might be
916+
* hot-added continue the search with reserved ranges.
917+
*/
918+
if (nid != NUMA_NO_NODE)
919+
return nid;
920+
921+
return meminfo_to_nid(&numa_reserved_meminfo, start);
922+
}
923+
EXPORT_SYMBOL_GPL(phys_to_target_node);
924+
925+
int memory_add_physaddr_to_nid(u64 start)
926+
{
927+
int nid = meminfo_to_nid(&numa_meminfo, start);
928+
929+
if (nid == NUMA_NO_NODE)
930+
nid = numa_meminfo.blk[0].nid;
890931
return nid;
891932
}
892933
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

include/linux/numa.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* SPDX-License-Identifier: GPL-2.0 */
22
#ifndef _LINUX_NUMA_H
33
#define _LINUX_NUMA_H
4-
4+
#include <linux/types.h>
55

66
#ifdef CONFIG_NODES_SHIFT
77
#define NODES_SHIFT CONFIG_NODES_SHIFT
@@ -21,12 +21,24 @@
2121
#endif
2222

2323
#ifdef CONFIG_NUMA
24+
/* Generic implementation available */
2425
int numa_map_to_online_node(int node);
26+
27+
/*
28+
* Optional architecture specific implementation, users need a "depends
29+
* on $ARCH"
30+
*/
31+
int phys_to_target_node(phys_addr_t addr);
2532
#else
2633
static inline int numa_map_to_online_node(int node)
2734
{
2835
return NUMA_NO_NODE;
2936
}
37+
38+
static inline int phys_to_target_node(phys_addr_t addr)
39+
{
40+
return NUMA_NO_NODE;
41+
}
3042
#endif
3143

3244
#endif /* _LINUX_NUMA_H */

0 commit comments

Comments
 (0)