Skip to content

Commit a161628

Browse files
author
rhc54
authored
Merge pull request #2648 from rhc54/topic/topo
Only instantiate the HWLOC topology in an MPI process if it actually will be used.
2 parents 52533f7 + fe68f23 commit a161628

File tree

14 files changed

+313
-201
lines changed

ompi/errhandler/errhandler_predefined.c

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2012 Los Alamos National Security, LLC.
1717
* All rights reserved.
18+
* Copyright (c) 2016 Intel, Inc. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -51,7 +52,7 @@ static void out(char *str, char *arg);
5152

5253

5354
void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
54-
int *error_code, ...)
55+
int *error_code, ...)
5556
{
5657
char *name;
5758
struct ompi_communicator_t *abort_comm;
@@ -72,7 +73,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
7273

7374

7475
void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
75-
int *error_code, ...)
76+
int *error_code, ...)
7677
{
7778
char *name;
7879
struct ompi_communicator_t *abort_comm;
@@ -93,7 +94,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
9394

9495

9596
void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
96-
int *error_code, ...)
97+
int *error_code, ...)
9798
{
9899
char *name;
99100
struct ompi_communicator_t *abort_comm = NULL;
@@ -111,7 +112,7 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
111112
}
112113

113114
void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
114-
int *error_code, ...)
115+
int *error_code, ...)
115116
{
116117
/* Don't need anything more -- just need this function to exist */
117118
/* Silence some compiler warnings */
@@ -123,7 +124,7 @@ void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
123124

124125

125126
void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
126-
int *error_code, ...)
127+
int *error_code, ...)
127128
{
128129
/* Don't need anything more -- just need this function to exist */
129130
/* Silence some compiler warnings */
@@ -135,7 +136,7 @@ void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
135136

136137

137138
void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
138-
int *error_code, ...)
139+
int *error_code, ...)
139140
{
140141
/* Don't need anything more -- just need this function to exist */
141142
/* Silence some compiler warnings */
@@ -181,6 +182,7 @@ static void backend_fatal_aggregate(char *type,
181182
const char* const unknown_error_code = "Error code: %d (no associated error message)";
182183
const char* const unknown_error = "Unknown error";
183184
const char* const unknown_prefix = "[?:?]";
185+
bool generated = false;
184186

185187
// these do not own what they point to; they're
186188
// here to avoid repeating expressions such as
@@ -209,6 +211,8 @@ static void backend_fatal_aggregate(char *type,
209211
err_msg = NULL;
210212
opal_output(0, "%s", "Could not write to err_msg");
211213
opal_output(0, unknown_error_code, *error_code);
214+
} else {
215+
generated = true;
212216
}
213217
}
214218
}
@@ -254,7 +258,9 @@ static void backend_fatal_aggregate(char *type,
254258
}
255259

256260
free(prefix);
257-
free(err_msg);
261+
if (generated) {
262+
free(err_msg);
263+
}
258264
}
259265

260266
/*

ompi/mca/topo/treematch/topo_treematch_component.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* reserved.
55
* Copyright (c) 2011-2015 INRIA. All rights reserved.
66
* Copyright (c) 2011-2015 Université Bordeaux 1
7+
* Copyright (c) 2016 Intel, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -61,9 +62,6 @@ mca_topo_treematch_component_2_2_0_t mca_topo_treematch_component =
6162

6263
static int init_query(bool enable_progress_threads, bool enable_mpi_threads)
6364
{
64-
if(NULL == opal_hwloc_topology) {
65-
return OPAL_ERR_NOT_SUPPORTED;
66-
}
6765
return OMPI_SUCCESS;
6866
}
6967

@@ -97,4 +95,3 @@ static int mca_topo_treematch_component_register(void)
9795
MCA_BASE_VAR_SCOPE_READONLY, &mca_topo_treematch_component.reorder_mode);
9896
return OMPI_SUCCESS;
9997
}
100-

ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* reserved.
66
* Copyright (c) 2011-2015 INRIA. All rights reserved.
77
* Copyright (c) 2012-2015 Bordeaux Poytechnic Institute
8-
* Copyright (c) 2015 Intel, Inc. All rights reserved
8+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
99
* Copyright (c) 2015-2016 Research Organization for Information Science
1010
* and Technology (RIST). All rights reserved.
1111
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
@@ -256,7 +256,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
256256
/* Then, we need to know if the processes are bound */
257257
/* We make the hypothesis that all processes are in */
258258
/* the same state : all bound or none bound */
259-
assert(NULL != opal_hwloc_topology);
259+
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
260+
goto fallback;
261+
}
260262
root_obj = hwloc_get_root_obj(opal_hwloc_topology);
261263
if (NULL == root_obj) goto fallback;
262264

@@ -873,7 +875,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
873875
if( -1 == hwloc_err) goto fallback;
874876

875877
/* Report new binding to ORTE/OPAL */
876-
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
878+
/* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
877879
err = hwloc_bitmap_snprintf (set_as_string,64,set);
878880

879881
#ifdef __DEBUG__

ompi/runtime/ompi_mpi_init.c

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -508,16 +508,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
508508
/* check for timing request - get stop time and report elapsed time if so */
509509
OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex"));
510510

511-
/* if hwloc is available but didn't get setup for some
512-
* reason, do so now
513-
*/
514-
if (NULL == opal_hwloc_topology) {
515-
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
516-
error = "Topology init";
517-
goto error;
518-
}
519-
}
520-
521511
/* Register the default errhandler callback */
522512
errtrk.status = OPAL_ERROR;
523513
errtrk.active = true;

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* Copyright (c) 2009-2012 Oracle and/or its affiliates. All rights reserved.
1919
* Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
2020
* Copyright (c) 2012 Oak Ridge National Laboratory. All rights reserved
21-
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
21+
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
2222
* Copyright (c) 2014-2016 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
2424
* Copyright (c) 2014 Bull SAS. All rights reserved.
@@ -1502,13 +1502,33 @@ static uint64_t read_module_param(char *file, uint64_t value, uint64_t max)
15021502
static uint64_t calculate_total_mem (void)
15031503
{
15041504
hwloc_obj_t machine;
1505+
int rc;
1506+
uint64_t mem, *mptr;
1507+
opal_process_name_t wildcard_rank;
15051508

1506-
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
1507-
if (NULL == machine) {
1508-
return 0;
1509+
/* first try to retrieve it from PMIx as it may have
1510+
* been provided */
1511+
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
1512+
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
1513+
mptr = &mem;
1514+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY,
1515+
&wildcard_rank, &mptr, OPAL_UINT64);
1516+
if (OPAL_SUCCESS == rc) {
1517+
return mem;
1518+
}
1519+
1520+
/* if not available, then ensure that the topology has been
1521+
* loaded and try to get it from there */
1522+
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
1523+
machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
1524+
if (NULL == machine) {
1525+
return 0;
1526+
}
1527+
return machine->memory.total_memory;
15091528
}
15101529

1511-
return machine->memory.total_memory;
1530+
/* couldn't find it */
1531+
return 0;
15121532
}
15131533

15141534

@@ -2312,7 +2332,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
23122332
float distance = 0;
23132333

23142334
/* Override any distance logic so all devices are used */
2315-
if (0 != mca_btl_openib_component.ignore_locality) {
2335+
if (0 != mca_btl_openib_component.ignore_locality ||
2336+
OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
23162337
return distance;
23172338
}
23182339

opal/mca/btl/sm/btl_sm.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
#include "opal/util/show_help.h"
5353
#include "opal/util/printf.h"
5454
#include "opal/mca/hwloc/base/base.h"
55-
#include "opal/mca/pmix/pmix.h"
55+
#include "opal/mca/pmix/base/base.h"
5656
#include "opal/mca/shmem/base/base.h"
5757
#include "opal/mca/shmem/shmem.h"
5858

@@ -242,7 +242,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
242242
free(loc);
243243
} else {
244244
/* If we have hwloc support, then get accurate information */
245-
if (NULL != opal_hwloc_topology) {
245+
if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
246246
i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
247247
HWLOC_OBJ_NODE, 0,
248248
OPAL_HWLOC_AVAILABLE);
@@ -257,6 +257,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
257257
}
258258
}
259259
/* see if we were given our location */
260+
loc = NULL;
260261
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
261262
&OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
262263
if (OPAL_SUCCESS == rc) {
@@ -283,8 +284,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
283284
}
284285
} else {
285286
/* If we have hwloc support, then get accurate information */
286-
if (NULL != opal_hwloc_topology && num_mem_nodes > 0 &&
287-
NULL != opal_process_info.cpuset) {
287+
if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && num_mem_nodes > 0) {
288288
int numa=0, w;
289289
unsigned n_bound=0;
290290
hwloc_cpuset_t avail;

0 commit comments

Comments
 (0)