Skip to content

Commit 8d0baf1

Browse files
committed
If the RTE fails to deliver the daemon information,
gracefully fall back to a non-reordered communicator. Optimize the loops building the process hierarchy.
1 parent 2388675 commit 8d0baf1

File tree

1 file changed

+26
-21
lines changed

1 file changed

+26
-21
lines changed

ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@
4444
while(0);
4545

4646
#define FALLBACK() \
47-
do { free(nodes_roots); \
47+
do { free(nodes_roots); \
4848
free(local_procs); \
49-
hwloc_bitmap_free(set); \
49+
if( NULL != set) hwloc_bitmap_free(set); \
5050
goto fallback; } \
5151
while(0);
5252

@@ -181,19 +181,16 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
181181
num_procs_in_node++;
182182
}
183183

184-
/* Get the ranks of the local procs in comm_old */
184+
vpids = (int *)malloc(size * sizeof(int));
185+
colors = (int *)malloc(size * sizeof(int));
185186
local_procs = (int *)malloc(num_procs_in_node * sizeof(int));
186187
for(i = idx = 0 ; i < size ; i++){
187188
proc = ompi_group_peer_lookup(comm_old->c_local_group, i);
188189
if (( i == rank ) ||
189-
(OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)))
190+
(OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags))) {
190191
local_procs[idx++] = i;
191-
}
192+
}
192193

193-
vpids = (int *)malloc(size * sizeof(int));
194-
colors = (int *)malloc(size * sizeof(int));
195-
for(i = 0; i < size ; i++) {
196-
proc = ompi_group_peer_lookup(comm_old->c_local_group, i);
197194
pval = &val;
198195
OPAL_MODEX_RECV_VALUE(err, OPAL_PMIX_NODEID, &(proc->super.proc_name), &pval, OPAL_UINT32);
199196
if( OPAL_SUCCESS != err ) {
@@ -220,22 +217,30 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
220217
}
221218
#endif
222219
/* clean-up dupes in the array */
223-
for(i = 0; i < size ; i++)
224-
if ( -1 == vpids[i] )
220+
for(i = 0; i < size; i++) {
221+
if( -1 == vpids[i] )
225222
continue;
226-
else
227-
for(j = i+1 ; j < size ; j++)
228-
if( vpids[j] != -1 )
229-
if( vpids[i] == vpids[j] )
230-
vpids[j] = -1;
231-
/* compute number of nodes */
232-
for(i = 0; i < size ; i++)
233-
if( vpids[i] != -1 )
234-
num_nodes++;
223+
224+
num_nodes++; /* update the number of nodes */
225+
226+
for(j = i+1; j < size; j++)
227+
if( vpids[j] != -1 )
228+
if( vpids[i] == vpids[j] )
229+
vpids[j] = -1;
230+
}
231+
if( 0 == num_nodes ) {
232+
/* No useful info has been retrieved from the runtime. Fallback
233+
* and create a duplicate of the original communicator */
234+
free(vpids);
235+
free(colors);
236+
free(local_procs);
237+
err = OMPI_SUCCESS; /* return with success */
238+
goto fallback;
239+
}
235240
/* compute local roots ranks in comm_old */
236241
/* Only the global root needs to do this */
237242
if(0 == rank) {
238-
nodes_roots = (int *)calloc(num_nodes,sizeof(int));
243+
nodes_roots = (int *)calloc(num_nodes, sizeof(int));
239244
for(i = idx = 0; i < size ; i++)
240245
if( vpids[i] != -1 )
241246
nodes_roots[idx++] = i;

0 commit comments

Comments
 (0)