Skip to content

Commit 3a2d6a5

Browse files
author
Ralph Castain
committed
Begin to reduce the reliance of application procs on the topology tree itself by having the daemon provide more detailed info. In this case, provide the topology description string, so that procs can readily determine the number of objects of each type on the node, and a "locality" string that describes which objects this process is executing on. The latter allows a process to compute which objects it shares with another proc without consulting the topology tree.
Signed-off-by: Ralph Castain <[email protected]>
1 parent 75be023 commit 3a2d6a5

File tree

7 files changed

+401
-158
lines changed

7 files changed

+401
-158
lines changed

opal/mca/btl/sm/btl_sm.c

Lines changed: 74 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* All rights reserved.
1717
* Copyright (c) 2010-2012 IBM Corporation. All rights reserved.
1818
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
19-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
19+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
2020
* Copyright (c) 2014-2015 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
2222
* Copyright (c) 2016 ARM, Inc. All rights reserved.
@@ -52,6 +52,7 @@
5252
#include "opal/util/show_help.h"
5353
#include "opal/util/printf.h"
5454
#include "opal/mca/hwloc/base/base.h"
55+
#include "opal/mca/pmix/pmix.h"
5556
#include "opal/mca/shmem/base/base.h"
5657
#include "opal/mca/shmem/shmem.h"
5758

@@ -223,23 +224,28 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
223224
int my_mem_node, num_mem_nodes, i, rc;
224225
mca_common_sm_mpool_resources_t *res = NULL;
225226
mca_btl_sm_component_t* m = &mca_btl_sm_component;
227+
char *loc, *mynuma;
228+
opal_process_name_t wildcard_rank;
226229

227230
/* Assume we don't have hwloc support and fill in dummy info */
228231
mca_btl_sm_component.mem_node = my_mem_node = 0;
229232
mca_btl_sm_component.num_mem_nodes = num_mem_nodes = 1;
230233

231-
/* If we have hwloc support, then get accurate information */
232-
if (NULL != opal_hwloc_topology) {
233-
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
234-
HWLOC_OBJ_NODE, 0,
235-
OPAL_HWLOC_AVAILABLE);
236-
237-
/* If we find >0 NUMA nodes, then investigate further */
238-
if (i > 0) {
239-
int numa=0, w;
240-
unsigned n_bound=0;
241-
hwloc_cpuset_t avail;
242-
hwloc_obj_t obj;
234+
/* see if we were given a topology signature */
235+
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
236+
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
237+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_TOPOLOGY_SIGNATURE,
238+
&wildcard_rank, &loc, OPAL_STRING);
239+
if (OPAL_SUCCESS == rc) {
240+
/* the number of NUMA nodes is right at the front */
241+
mca_btl_sm_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10);
242+
free(loc);
243+
} else {
244+
/* If we have hwloc support, then get accurate information */
245+
if (NULL != opal_hwloc_topology) {
246+
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
247+
HWLOC_OBJ_NODE, 0,
248+
OPAL_HWLOC_AVAILABLE);
243249

244250
/* JMS This tells me how many numa nodes are *available*,
245251
but it's not how many are being used *by this job*.
@@ -248,33 +254,65 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
248254
should be improved to be how many NUMA nodes are being
249255
used *in this job*. */
250256
mca_btl_sm_component.num_mem_nodes = num_mem_nodes = i;
257+
}
258+
}
259+
/* see if we were given our location */
260+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
261+
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
262+
if (OPAL_SUCCESS == rc) {
263+
if (NULL == loc) {
264+
mca_btl_sm_component.mem_node = my_mem_node = -1;
265+
} else {
266+
/* get our NUMA location */
267+
mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0);
268+
if (NULL == mynuma ||
269+
NULL != strchr(mynuma, ',') ||
270+
NULL != strchr(mynuma, '-')) {
271+
/* we either have no idea what NUMA we are on, or we
272+
* are on multiple NUMA nodes */
273+
mca_btl_sm_component.mem_node = my_mem_node = -1;
274+
} else {
275+
/* we are bound to a single NUMA node */
276+
my_mem_node = strtoul(mynuma, NULL, 10);
277+
mca_btl_sm_component.mem_node = my_mem_node;
278+
}
279+
if (NULL != mynuma) {
280+
free(mynuma);
281+
}
282+
free(loc);
283+
}
284+
} else {
285+
/* If we have hwloc support, then get accurate information */
286+
if (NULL != opal_hwloc_topology && num_mem_nodes > 0 &&
287+
NULL != opal_process_info.cpuset) {
288+
int numa=0, w;
289+
unsigned n_bound=0;
290+
hwloc_cpuset_t avail;
291+
hwloc_obj_t obj;
251292

252-
/* if we are not bound, then there is nothing further to do */
253-
if (NULL != opal_process_info.cpuset) {
254-
/* count the number of NUMA nodes to which we are bound */
255-
for (w=0; w < i; w++) {
256-
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
257-
HWLOC_OBJ_NODE, 0, w,
258-
OPAL_HWLOC_AVAILABLE))) {
259-
continue;
260-
}
261-
/* get that NUMA node's available cpus */
262-
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
263-
/* see if we intersect */
264-
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
265-
n_bound++;
266-
numa = w;
267-
}
293+
/* count the number of NUMA nodes to which we are bound */
294+
for (w=0; w < i; w++) {
295+
if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
296+
HWLOC_OBJ_NODE, 0, w,
297+
OPAL_HWLOC_AVAILABLE))) {
298+
continue;
268299
}
269-
/* if we are located on more than one NUMA, or we didn't find
270-
* a NUMA we are on, then not much we can do
271-
*/
272-
if (1 == n_bound) {
273-
mca_btl_sm_component.mem_node = my_mem_node = numa;
274-
} else {
275-
mca_btl_sm_component.mem_node = my_mem_node = -1;
300+
/* get that NUMA node's available cpus */
301+
avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
302+
/* see if we intersect */
303+
if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
304+
n_bound++;
305+
numa = w;
276306
}
277307
}
308+
/* if we are located on more than one NUMA, or we didn't find
309+
* a NUMA we are on, then not much we can do
310+
*/
311+
if (1 == n_bound) {
312+
mca_btl_sm_component.mem_node = my_mem_node = numa;
313+
} else {
314+
mca_btl_sm_component.mem_node = my_mem_node = -1;
315+
}
278316
}
279317
}
280318

opal/mca/hwloc/base/base.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
3-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
44
* $COPYRIGHT$
55
*
66
* Additional copyrights may follow
@@ -276,6 +276,16 @@ OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo,
276276
OPAL_DECLSPEC char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo);
277277

278278

279+
/* get a string describing the locality of a given process */
280+
OPAL_DECLSPEC char* opal_hwloc_base_get_locality_string(hwloc_topology_t topo, char *bitmap);
281+
282+
/* extract a location from the locality string */
283+
OPAL_DECLSPEC char* opal_hwloc_base_get_location(char *locality,
284+
hwloc_obj_type_t type,
285+
unsigned index);
286+
287+
OPAL_DECLSPEC opal_hwloc_locality_t opal_hwloc_compute_relative_locality(char *loc1, char *loc2);
288+
279289
END_C_DECLS
280290

281291
#endif /* OPAL_HWLOC_BASE_H */

0 commit comments

Comments
 (0)