Skip to content

Commit 7aa61ef

Browse files
authored
Merge pull request #568 from bgoglin/fakenuma
Workaround fake numa in the Linux kernel. Booting Linux with things like numa=fake=8U can split physical NUMA nodes into multiple smaller ones. That was mostly for debugging but it looks like some modes will be used in production in the future. hwloc previously assumed that multiple nodes with identical locality in Linux were a BIOS bug, hence only the first one was kept. Detect fake numa in the kernel cmdline and allow such nodes with identical cpusets. Disable memory attributes and memory-side caches since HMAT information isn't updated in Linux when fake NUMA is enabled (while SLIT distances are). By the way, update lstopo to show these many NUMA nodes in a rectangular layout since 8 nodes on a single row makes the row way too long.
2 parents cd3db48 + 1e89d4e commit 7aa61ef

File tree

6 files changed

+97
-20
lines changed

6 files changed

+97
-20
lines changed

doc/hwloc.doxy

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1292,7 +1292,9 @@ following environment variables.
12921292
if 1, ignore KNL info from hwloc-dump-hwdata and fallback to heuristic
12931293
if 0, never fallback to the hardwired heuristic, useful if the heuristic is wrong
12941294
HWLOC_DEBUG_ALLOW_OVERLAPPING_NODE_CPUSETS
1295-
don't ignore linux numa nodes with overlapping cpusets
1295+
if 0 (default), non-first nodes with overlapping cpusets are ignored
1296+
if 1, don't ignore linux numa nodes with overlapping cpusets
1297+
if 2, don't ignore either and don't even warn about it
12961298
HWLOC_DEBUG_SORT_CHILDREN
12971299
sort osdev I/O children by name to make sure the topology doesn't depend
12981300
on the ordering of dentries in the local filesystem (for Linux fsroot tests)

hwloc/topology-linux.c

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ struct hwloc_linux_backend_data_s {
5656
} arch;
5757
int is_knl;
5858
int is_amd_with_CU;
59+
int is_fake_numa_uniform; /* 0 if not fake, -1 if fake non-uniform, N if fake=<N>U */
5960
int use_numa_distances;
6061
int use_numa_distances_for_cpuless;
6162
int use_numa_initiators;
@@ -3971,11 +3972,26 @@ look_sysfsnode(struct hwloc_topology *topology,
39713972
unsigned failednodes = 0;
39723973
unsigned i;
39733974
DIR *dir;
3974-
int allow_overlapping_node_cpusets = (getenv("HWLOC_DEBUG_ALLOW_OVERLAPPING_NODE_CPUSETS") != NULL);
3975+
char *env;
3976+
int allow_overlapping_node_cpusets = 0;
39753977
int need_memcaches = hwloc_filter_check_keep_object_type(topology, HWLOC_OBJ_MEMCACHE);
3978+
int need_memattrs = !(topology->flags & HWLOC_TOPOLOGY_FLAG_NO_MEMATTRS);
39763979

39773980
hwloc_debug("\n\n * Topology extraction from /sys/devices/system/node *\n\n");
39783981

3982+
if (data->is_fake_numa_uniform) {
3983+
hwloc_debug("Disabling memory-side caches, memory attributes and HMAT initiators because of fake numa\n");
3984+
need_memcaches = 0;
3985+
need_memattrs = 0;
3986+
data->use_numa_initiators = 0;
3987+
allow_overlapping_node_cpusets = 2; /* accept without warning */
3988+
}
3989+
3990+
env = getenv("HWLOC_DEBUG_ALLOW_OVERLAPPING_NODE_CPUSETS");
3991+
if (env) {
3992+
allow_overlapping_node_cpusets = atoi(env); /* 0 drop non-first overlapping nodes, 1 allows with warning, 2 allows without warning */
3993+
}
3994+
39793995
/* NUMA nodes cannot be filtered out */
39803996
indexes = list_sysfsnode(topology, data, &nbnodes);
39813997
if (!indexes)
@@ -4023,7 +4039,7 @@ look_sysfsnode(struct hwloc_topology *topology,
40234039
failednodes++;
40244040
continue;
40254041
}
4026-
if (HWLOC_SHOW_CRITICAL_ERRORS())
4042+
if (allow_overlapping_node_cpusets < 2 && HWLOC_SHOW_CRITICAL_ERRORS())
40274043
fprintf(stderr, "hwloc/linux: node P#%u cpuset intersects with previous nodes, forcing its acceptance\n", osnode);
40284044
}
40294045
hwloc_bitmap_or(nodes_cpuset, nodes_cpuset, cpuset);
@@ -4043,8 +4059,9 @@ look_sysfsnode(struct hwloc_topology *topology,
40434059
dir = hwloc_opendir("/proc/driver/nvidia/gpus", data->root_fd);
40444060
if (dir) {
40454061
struct dirent *dirent;
4046-
char *env = getenv("HWLOC_KEEP_NVIDIA_GPU_NUMA_NODES");
4047-
int keep = env && atoi(env);
4062+
int keep;
4063+
env = getenv("HWLOC_KEEP_NVIDIA_GPU_NUMA_NODES");
4064+
keep = env && atoi(env);
40484065
while ((dirent = readdir(dir)) != NULL) {
40494066
char nvgpunumapath[300], line[256];
40504067
int err;
@@ -4115,8 +4132,9 @@ look_sysfsnode(struct hwloc_topology *topology,
41154132

41164133
if (data->is_knl) {
41174134
/* apply KNL quirks */
4118-
char *env = getenv("HWLOC_KNL_NUMA_QUIRK");
4119-
int noquirk = (env && !atoi(env));
4135+
int noquirk;
4136+
env = getenv("HWLOC_KNL_NUMA_QUIRK");
4137+
noquirk = (env && !atoi(env));
41204138
if (!noquirk) {
41214139
hwloc_linux_knl_numa_quirk(topology, data, nodes, nbnodes, distances, &failednodes);
41224140
free(distances);
@@ -4174,7 +4192,7 @@ look_sysfsnode(struct hwloc_topology *topology,
41744192
trees[nr_trees++] = tree;
41754193
}
41764194
/* By the way, get their memattrs now that cpuset is fixed */
4177-
if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_NO_MEMATTRS))
4195+
if (need_memattrs)
41784196
read_node_local_memattrs(topology, data, node);
41794197
}
41804198

@@ -5531,6 +5549,40 @@ static int check_sysfs_cpu_path(int root_fd, int *old_filenames)
55315549
return -1;
55325550
}
55335551

5552+
static void
5553+
hwloc_linuxfs_check_kernel_cmdline(struct hwloc_linux_backend_data_s *data)
5554+
{
5555+
FILE *file;
5556+
char cmdline[4096];
5557+
char *fakenuma;
5558+
5559+
file = hwloc_fopen("/proc/cmdline", "r", data->root_fd);
5560+
if (!file)
5561+
return;
5562+
5563+
cmdline[0] = 0;
5564+
fgets(cmdline, sizeof(cmdline), file);
5565+
5566+
fakenuma = strstr(cmdline, "numa=fake=");
5567+
if (fakenuma) {
5568+
/* in fake numa emulation, SLIT is updated but HMAT isn't, hence we need to disable/fix things later */
5569+
unsigned width = 0;
5570+
char type = 0;
5571+
if (sscanf(fakenuma+10, "%u%c", &width, &type) == 2 && type == 'U') {
5572+
/* if <N>U, each physical node is split in N nodes, we can still do things in this case */
5573+
data->is_fake_numa_uniform = width;
5574+
} else {
5575+
/* otherwise fake nodes are created by just dividing the entire RAM,
5576+
* without respecting locality at all
5577+
*/
5578+
data->is_fake_numa_uniform = -1;
5579+
}
5580+
hwloc_debug("Found fake numa %d\n", data->is_fake_numa_uniform);
5581+
}
5582+
5583+
fclose(file);
5584+
}
5585+
55345586
static int
55355587
hwloc_linuxfs_look_cpu(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
55365588
{
@@ -5583,6 +5635,11 @@ hwloc_linuxfs_look_cpu(struct hwloc_backend *backend, struct hwloc_disc_status *
55835635
*/
55845636
hwloc_gather_system_info(topology, data);
55855637

5638+
/**********************************
5639+
* Detect things in /proc/cmdline
5640+
*/
5641+
hwloc_linuxfs_check_kernel_cmdline(data);
5642+
55865643
/**********************
55875644
* /proc/cpuinfo
55885645
*/
@@ -7215,6 +7272,7 @@ hwloc_linux_component_instantiate(struct hwloc_topology *topology,
72157272
data->arch = HWLOC_LINUX_ARCH_UNKNOWN;
72167273
data->is_knl = 0;
72177274
data->is_amd_with_CU = 0;
7275+
data->is_fake_numa_uniform = 0;
72187276
data->is_real_fsroot = 1;
72197277
data->root_path = NULL;
72207278
fsroot_path = getenv("HWLOC_FSROOT");

utils/lstopo/lstopo-draw.c

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -371,9 +371,6 @@ static float pci_link_speed(hwloc_obj_t obj)
371371
* Placing children in rectangle
372372
*/
373373

374-
/* preferred width/height compromise */
375-
#define RATIO (4.f/3.f)
376-
377374
/* returns a score <= 1. close to 1 is better */
378375
static __hwloc_inline
379376
float rectangle_score(unsigned width, unsigned height, float ratio)
@@ -553,10 +550,13 @@ place_children_rect(struct lstopo_output *loutput, hwloc_obj_t parent,
553550
float ratio;
554551
int i;
555552

556-
if (parent->type == HWLOC_OBJ_CORE)
557-
ratio = 1/RATIO;
553+
/* preferred width/height compromise */
554+
if (kind == LSTOPO_CHILD_KIND_MEMORY)
555+
ratio = 8.f; /* very large for memory above objects since the parent is usually very large */
556+
else if (parent->type == HWLOC_OBJ_CORE)
557+
ratio = 3.f/4.f; /* rather high Core objects since they often contain 2 PUs that we don't want horizontal */
558558
else
559-
ratio = RATIO;
559+
ratio = 4.f/3.f; /* rather large other objects */
560560
find_children_rectangle(loutput, parent, kind, separator, &rows, &columns, ratio);
561561

562562
rowwidth = 0;
@@ -626,7 +626,7 @@ place_children(struct lstopo_output *loutput, hwloc_obj_t parent,
626626
unsigned xrel, unsigned yrel /* position of children within parent */)
627627
{
628628
struct lstopo_obj_userdata *plud = parent->userdata;
629-
enum lstopo_orient_e main_orient, right_orient, below_orient;
629+
enum lstopo_orient_e main_orient, right_orient, below_orient, above_orient;
630630
unsigned border = loutput->gridsize;
631631
unsigned separator = loutput->gridsize;
632632
unsigned separator_below_cache = loutput->gridsize;
@@ -674,6 +674,10 @@ place_children(struct lstopo_output *loutput, hwloc_obj_t parent,
674674
below_orient = loutput->below_force_orient;
675675
if (below_orient == LSTOPO_ORIENT_NONE)
676676
below_orient = loutput->force_orient[parent->type];
677+
/* place above children in rectangle by default */
678+
above_orient = loutput->above_force_orient;
679+
if (above_orient == LSTOPO_ORIENT_NONE)
680+
above_orient = LSTOPO_ORIENT_RECT;
677681

678682
/* defaults */
679683
plud->children.box = 0;
@@ -790,7 +794,6 @@ place_children(struct lstopo_output *loutput, hwloc_obj_t parent,
790794

791795
/* compute the size of the above children section (Memory), if any */
792796
if (plud->above_children.kinds) {
793-
enum lstopo_orient_e morient = LSTOPO_ORIENT_HORIZ;
794797
int need_box;
795798

796799
assert(plud->above_children.kinds == LSTOPO_CHILD_KIND_MEMORY);
@@ -801,7 +804,7 @@ place_children(struct lstopo_output *loutput, hwloc_obj_t parent,
801804
need_box = !hwloc_obj_type_is_memory(parent->type)
802805
&& (parent->memory_arity + parent->memory_first_child->memory_arity > 1);
803806

804-
place__children(loutput, parent, plud->above_children.kinds, &morient, need_box ? border : 0, separator, &above_children_width, &above_children_height);
807+
place__children(loutput, parent, plud->above_children.kinds, &above_orient, need_box ? border : 0, separator, &above_children_width, &above_children_height);
805808
if (parent->type == HWLOC_OBJ_MEMCACHE)
806809
above_children_height -= separator;
807810

utils/lstopo/lstopo-no-graphics.1in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ on the right.
349349

350350
Up to hwloc 2.5, the default was rather to \fImemory:above,plain\fR.
351351

352-
Additionally, \fIio:right\fR, \fIio:below\fR, \fImisc:right\fR
352+
Additionally, \fImemory:above\fR, \fIio:right\fR, \fIio:below\fR, \fImisc:right\fR
353353
and \fImisc:below\fR may be suffixed with
354354
\fI:horiz\fR, \fI:vert\fR or \fI:rect\fR to force the horizontal,
355355
vertical or rectangular layout of children inside these sections.

utils/lstopo/lstopo.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,11 +303,13 @@ lstopo_check_pci_domains(hwloc_topology_t topology)
303303

304304
static void
305305
lstopo_parse_children_order(char *s, unsigned *children_order_p,
306+
enum lstopo_orient_e *above_force_orient_p,
306307
enum lstopo_orient_e *right_force_orient_p,
307308
enum lstopo_orient_e *below_force_orient_p)
308309
{
309310
char *tmp, *next;
310311
unsigned children_order;
312+
enum lstopo_orient_e above_force_orient = LSTOPO_ORIENT_NONE;
311313
enum lstopo_orient_e right_force_orient = LSTOPO_ORIENT_NONE;
312314
enum lstopo_orient_e below_force_orient= LSTOPO_ORIENT_NONE;
313315

@@ -327,6 +329,15 @@ lstopo_parse_children_order(char *s, unsigned *children_order_p,
327329

328330
if (!strcmp(tmp, "memory:above") || !strcmp(tmp, "memoryabove") /* backward compat with 2.5 */) {
329331
children_order |= LSTOPO_ORDER_MEMORY_ABOVE;
332+
} else if (!strcmp(tmp, "memory:above:horiz")) {
333+
children_order |= LSTOPO_ORDER_MEMORY_ABOVE;
334+
above_force_orient = LSTOPO_ORIENT_HORIZ;
335+
} else if (!strcmp(tmp, "memory:above:vert")) {
336+
children_order |= LSTOPO_ORDER_MEMORY_ABOVE;
337+
above_force_orient = LSTOPO_ORIENT_VERT;
338+
} else if (!strcmp(tmp, "memory:above:rect")) {
339+
children_order |= LSTOPO_ORDER_MEMORY_ABOVE;
340+
above_force_orient = LSTOPO_ORIENT_RECT;
330341

331342
} else if (!strcmp(tmp, "io:right")) {
332343
children_order |= LSTOPO_ORDER_IO_RIGHT;
@@ -384,6 +395,7 @@ lstopo_parse_children_order(char *s, unsigned *children_order_p,
384395
}
385396

386397
*children_order_p = children_order;
398+
*above_force_orient_p = above_force_orient;
387399
*right_force_orient_p = right_force_orient;
388400
*below_force_orient_p = below_force_orient;
389401
}
@@ -896,6 +908,7 @@ main (int argc, char *argv[])
896908
loutput.force_orient[i] = LSTOPO_ORIENT_HORIZ;
897909
loutput.force_orient[HWLOC_OBJ_NUMANODE] = LSTOPO_ORIENT_HORIZ;
898910
loutput.force_orient[HWLOC_OBJ_MEMCACHE] = LSTOPO_ORIENT_HORIZ;
911+
loutput.above_force_orient = LSTOPO_ORIENT_NONE;
899912
loutput.right_force_orient = LSTOPO_ORIENT_NONE;
900913
loutput.below_force_orient = LSTOPO_ORIENT_NONE;
901914
for(i=HWLOC_OBJ_TYPE_MIN; i<HWLOC_OBJ_TYPE_MAX; i++) {
@@ -1383,7 +1396,7 @@ main (int argc, char *argv[])
13831396
if (argc < 2)
13841397
goto out_usagefailure;
13851398
lstopo_parse_children_order(argv[1], &loutput.children_order,
1386-
&loutput.right_force_orient, &loutput.below_force_orient);
1399+
&loutput.above_force_orient, &loutput.right_force_orient, &loutput.below_force_orient);
13871400
opt = 1;
13881401
}
13891402
else if (!strcmp (argv[0], "--no-cpukinds")) {

utils/lstopo/lstopo.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright © 2009 CNRS
3-
* Copyright © 2009-2022 Inria. All rights reserved.
3+
* Copyright © 2009-2023 Inria. All rights reserved.
44
* Copyright © 2009-2010, 2012, 2015 Université Bordeaux
55
* Copyright © 2011 Cisco Systems, Inc. All rights reserved.
66
* Copyright © 2020 Hewlett Packard Enterprise. All rights reserved.
@@ -112,6 +112,7 @@ struct lstopo_output {
112112
unsigned int gridsize, fontsize, linespacing, thickness;
113113
float text_xscale;
114114
enum lstopo_orient_e force_orient[HWLOC_OBJ_TYPE_MAX]; /* orientation of children within an object of the given type */
115+
enum lstopo_orient_e above_force_orient;
115116
enum lstopo_orient_e right_force_orient;
116117
enum lstopo_orient_e below_force_orient;
117118
int show_indexes[HWLOC_OBJ_TYPE_MAX]; /* enabled by global toggle index_type */

0 commit comments

Comments
 (0)