Skip to content

Commit fe68f23

Browse files
author
Ralph Castain
committed
Only instantiate the HWLOC topology in an MPI process if it actually will be used.
There are only five places in the non-daemon code paths where opal_hwloc_topology is currently referenced:

* shared memory BTLs (sm, smcuda). I have added a code path to those components that uses the location string instead of the topology itself, if available, thus avoiding instantiating the topology.
* openib BTL. This uses the distance matrix. At present, I haven't developed a method for replacing that reference. Thus, this component will instantiate the topology.
* usnic BTL. Uses the distance matrix.
* treematch TOPO component. Does some complex tree-based algorithm, so it will instantiate the topology.
* ess base functions. If a process is direct launched and not bound at launch, this code attempts to bind it. Thus, procs in this scenario will instantiate the topology.

Note that instantiating the topology on complex chips such as KNL can consume megabytes of memory.

Fix pernode binding policy
Properly handle the unbound case
Correct pointer usage
Do not free static error messages!

Signed-off-by: Ralph Castain <[email protected]>
1 parent 52533f7 commit fe68f23

File tree

14 files changed

+313
-201
lines changed

14 files changed

+313
-201
lines changed

ompi/errhandler/errhandler_predefined.c

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2012 Los Alamos National Security, LLC.
1717
* All rights reserved.
18+
* Copyright (c) 2016 Intel, Inc. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -51,7 +52,7 @@ static void out(char *str, char *arg);
5152

5253

5354
void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
54-
int *error_code, ...)
55+
int *error_code, ...)
5556
{
5657
char *name;
5758
struct ompi_communicator_t *abort_comm;
@@ -72,7 +73,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
7273

7374

7475
void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
75-
int *error_code, ...)
76+
int *error_code, ...)
7677
{
7778
char *name;
7879
struct ompi_communicator_t *abort_comm;
@@ -93,7 +94,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
9394

9495

9596
void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
96-
int *error_code, ...)
97+
int *error_code, ...)
9798
{
9899
char *name;
99100
struct ompi_communicator_t *abort_comm = NULL;
@@ -111,7 +112,7 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
111112
}
112113

113114
void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
114-
int *error_code, ...)
115+
int *error_code, ...)
115116
{
116117
/* Don't need anything more -- just need this function to exist */
117118
/* Silence some compiler warnings */
@@ -123,7 +124,7 @@ void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
123124

124125

125126
void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
126-
int *error_code, ...)
127+
int *error_code, ...)
127128
{
128129
/* Don't need anything more -- just need this function to exist */
129130
/* Silence some compiler warnings */
@@ -135,7 +136,7 @@ void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
135136

136137

137138
void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
138-
int *error_code, ...)
139+
int *error_code, ...)
139140
{
140141
/* Don't need anything more -- just need this function to exist */
141142
/* Silence some compiler warnings */
@@ -181,6 +182,7 @@ static void backend_fatal_aggregate(char *type,
181182
const char* const unknown_error_code = "Error code: %d (no associated error message)";
182183
const char* const unknown_error = "Unknown error";
183184
const char* const unknown_prefix = "[?:?]";
185+
bool generated = false;
184186

185187
// these do not own what they point to; they're
186188
// here to avoid repeating expressions such as
@@ -209,6 +211,8 @@ static void backend_fatal_aggregate(char *type,
209211
err_msg = NULL;
210212
opal_output(0, "%s", "Could not write to err_msg");
211213
opal_output(0, unknown_error_code, *error_code);
214+
} else {
215+
generated = true;
212216
}
213217
}
214218
}
@@ -254,7 +258,9 @@ static void backend_fatal_aggregate(char *type,
254258
}
255259

256260
free(prefix);
257-
free(err_msg);
261+
if (generated) {
262+
free(err_msg);
263+
}
258264
}
259265

260266
/*

ompi/mca/topo/treematch/topo_treematch_component.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* reserved.
55
* Copyright (c) 2011-2015 INRIA. All rights reserved.
66
* Copyright (c) 2011-2015 Université Bordeaux 1
7+
* Copyright (c) 2016 Intel, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -61,9 +62,6 @@ mca_topo_treematch_component_2_2_0_t mca_topo_treematch_component =
6162

6263
static int init_query(bool enable_progress_threads, bool enable_mpi_threads)
6364
{
64-
if(NULL == opal_hwloc_topology) {
65-
return OPAL_ERR_NOT_SUPPORTED;
66-
}
6765
return OMPI_SUCCESS;
6866
}
6967

@@ -97,4 +95,3 @@ static int mca_topo_treematch_component_register(void)
9795
MCA_BASE_VAR_SCOPE_READONLY, &mca_topo_treematch_component.reorder_mode);
9896
return OMPI_SUCCESS;
9997
}
100-

ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* reserved.
66
* Copyright (c) 2011-2015 INRIA. All rights reserved.
77
* Copyright (c) 2012-2015 Bordeaux Poytechnic Institute
8-
* Copyright (c) 2015 Intel, Inc. All rights reserved
8+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
99
* Copyright (c) 2015-2016 Research Organization for Information Science
1010
* and Technology (RIST). All rights reserved.
1111
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
@@ -256,7 +256,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
256256
/* Then, we need to know if the processes are bound */
257257
/* We make the hypothesis that all processes are in */
258258
/* the same state : all bound or none bound */
259-
assert(NULL != opal_hwloc_topology);
259+
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
260+
goto fallback;
261+
}
260262
root_obj = hwloc_get_root_obj(opal_hwloc_topology);
261263
if (NULL == root_obj) goto fallback;
262264

@@ -873,7 +875,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
873875
if( -1 == hwloc_err) goto fallback;
874876

875877
/* Report new binding to ORTE/OPAL */
876-
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
878+
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
877879
err = hwloc_bitmap_snprintf (set_as_string,64,set);
878880

879881
#ifdef __DEBUG__

ompi/runtime/ompi_mpi_init.c

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -508,16 +508,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
508508
/* check for timing request - get stop time and report elapsed time if so */
509509
OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex"));
510510

511-
/* if hwloc is available but didn't get setup for some
512-
* reason, do so now
513-
*/
514-
if (NULL == opal_hwloc_topology) {
515-
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
516-
error = "Topology init";
517-
goto error;
518-
}
519-
}
520-
521511
/* Register the default errhandler callback */
522512
errtrk.status = OPAL_ERROR;
523513
errtrk.active = true;

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
1919
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
2020
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
21-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
21+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
2222
* Copyright (c) 2014-2016 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2014 Bull SAS. All rights reserved.
@@ -1502,13 +1502,33 @@ static uint64_t read_module_param(char *file, uint64_t value, uint64_t max)
15021502
static uint64_t calculate_total_mem (void)
15031503
{
15041504
hwloc_obj_t machine;
1505+
int rc;
1506+
uint64_t mem, *mptr;
1507+
opal_process_name_t wildcard_rank;
15051508

1506-
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
1507-
if (NULL == machine) {
1508-
return 0;
1509+
/* first try to retrieve it from PMIx as it may have
1510+
* been provided */
1511+
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
1512+
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
1513+
mptr = &mem;
1514+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY,
1515+
&wildcard_rank, &mptr, OPAL_UINT64);
1516+
if (OPAL_SUCCESS == rc) {
1517+
return mem;
1518+
}
1519+
1520+
/* if not available, then ensure that the topology has been
1521+
* loaded and try to get it from there */
1522+
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
1523+
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
1524+
if (NULL == machine) {
1525+
return 0;
1526+
}
1527+
return machine->memory.total_memory;
15091528
}
15101529

1511-
return machine->memory.total_memory;
1530+
/* couldn't find it */
1531+
return 0;
15121532
}
15131533

15141534

@@ -2312,7 +2332,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
23122332
float distance = 0;
23132333

23142334
/* Override any distance logic so all devices are used */
2315-
if (0 != mca_btl_openib_component.ignore_locality) {
2335+
if (0 != mca_btl_openib_component.ignore_locality ||
2336+
OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
23162337
return distance;
23172338
}
23182339

opal/mca/btl/sm/btl_sm.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
#include "opal/util/show_help.h"
5353
#include "opal/util/printf.h"
5454
#include "opal/mca/hwloc/base/base.h"
55-
#include "opal/mca/pmix/pmix.h"
55+
#include "opal/mca/pmix/base/base.h"
5656
#include "opal/mca/shmem/base/base.h"
5757
#include "opal/mca/shmem/shmem.h"
5858

@@ -242,7 +242,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
242242
free(loc);
243243
} else {
244244
/* If we have hwloc support, then get accurate information */
245-
if (NULL != opal_hwloc_topology) {
245+
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
246246
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
247247
HWLOC_OBJ_NODE, 0,
248248
OPAL_HWLOC_AVAILABLE);
@@ -257,6 +257,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
257257
}
258258
}
259259
/* see if we were given our location */
260+
loc = NULL;
260261
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
261262
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
262263
if (OPAL_SUCCESS == rc) {
@@ -283,8 +284,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
283284
}
284285
} else {
285286
/* If we have hwloc support, then get accurate information */
286-
if (NULL != opal_hwloc_topology && num_mem_nodes > 0 &&
287-
NULL != opal_process_info.cpuset) {
287+
if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && num_mem_nodes > 0) {
288288
int numa=0, w;
289289
unsigned n_bound=0;
290290
hwloc_cpuset_t avail;

0 commit comments

Comments
 (0)