Skip to content

Commit 48fc339

Browse files
author
Ralph Castain
committed
Create an alternative mapping method that pushes responsibility
onto the backend daemons. By default, let mpirun pack only the app_context info and send that to the backend daemons, where the mapping will be done. This significantly reduces the computational time on mpirun, as it isn't running up/down the topology tree computing thousands of binding locations, and it reduces the launch message to a very small number of bytes. When running -novm, fall back to the old way of doing things, where mpirun computes the entire map and binding and then sends the full info to the backend daemon. Add a new cmd line option/mca param, --fwd-mpirun-port, that allows mpirun to dynamically select a port, but then passes that back to all the other daemons so they will use that port as a static port for their own wireup. In this mode, we no longer "phone home" directly to mpirun, but instead use the static port to wire up at daemon start. We then use the routing tree to roll up the initial launch report, and limit the number of open sockets on mpirun's node. Update the ras simulator to track the new nidmap code. Clean up some bugs in the nidmap regex code, and enhance the error message for not enough slots to include the host on which the problem is found. Update the gadget platform file. Initialize the range count when starting a new range. Fix the no-np case in managed allocation. Ensure DVM node usage gets cleaned up after each job. Update the scaling.pl script to use --fwd-mpirun-port. Pre-connect the daemon to its parent during launch while we are otherwise waiting for the daemon's children to send their "phone home" rollup messages. Signed-off-by: Ralph Castain <[email protected]>
1 parent 7240bee commit 48fc339

39 files changed

+1039
-746
lines changed

contrib/platform/intel/bend/gadget

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ enable_heterogeneous=no
88
enable_picky=yes
99
enable_debug=yes
1010
enable_shared=yes
11-
enable_static=no
11+
enable_static=yes
1212
enable_memchecker=no
1313
enable_ipv6=no
1414
enable_mpi_fortran=no
@@ -18,7 +18,10 @@ enable_cxx_exceptions=no
1818
enable_oshmem=no
1919
enable_mpi_java=no
2020
enable_io_romio=no
21+
enable_builtin_atomics=no
2122
enable_contrib_no_build=libnbc
23+
enable_mca_no_build=btl-tcp,btl-sm,rcache-udreg
24+
enable_mca_direct=pml-ob1
2225
with_memory_manager=no
2326
with_tm=no
2427
with_verbs=no

contrib/scaling/scaling.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
my @csvrow;
2525

2626
my @tests = qw(/bin/true ./orte_no_op ./mpi_no_op ./mpi_no_op ./mpi_no_op);
27-
my @options = ("", "", "", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1", "-mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca async_mpi_init 1 -mca async_mpi_finalize 1");
27+
my @options = ("", "", "", "--fwd-mpirun-port -mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1", "--fwd-mpirun-port -mca mpi_add_procs_cutoff 0 -mca pmix_base_async_modex 1 -mca async_mpi_init 1 -mca async_mpi_finalize 1");
2828
my @starterlist = qw(mpirun orterun srun aprun);
2929
my @starteroptionlist = ("--novm",
3030
"--hnp file:dvm_uri",

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include "orte/mca/plm/base/base.h"
5959
#include "orte/mca/odls/base/base.h"
6060
#include "orte/mca/errmgr/errmgr.h"
61+
#include "orte/mca/rmaps/base/base.h"
6162
#if OPAL_ENABLE_FT_CR == 1
6263
#include "orte/mca/snapc/base/base.h"
6364
#include "orte/mca/sstore/base/base.h"
@@ -116,6 +117,7 @@ int orte_ess_base_orted_setup(char **hosts)
116117
char *param;
117118
hwloc_obj_t obj;
118119
unsigned i, j;
120+
orte_topology_t *t;
119121
opal_list_t transports;
120122

121123
/* my name is set, xfer it to the OPAL layer */
@@ -333,13 +335,8 @@ int orte_ess_base_orted_setup(char **hosts)
333335
/* create and store a node object where we are */
334336
node = OBJ_NEW(orte_node_t);
335337
node->name = strdup(orte_process_info.nodename);
336-
node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
337-
/* point our topology to the one detected locally */
338-
node->topology = OBJ_NEW(orte_topology_t);
339-
node->topology->sig = strdup(orte_topo_signature);
340-
node->topology->topo = opal_hwloc_topology;
341-
/* add it to the array of known ones */
342-
opal_pointer_array_add(orte_node_topologies, node->topology);
338+
node->index = ORTE_PROC_MY_NAME->vpid;
339+
opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
343340

344341
/* create and store a proc object for us */
345342
proc = OBJ_NEW(orte_proc_t);
@@ -496,14 +493,40 @@ int orte_ess_base_orted_setup(char **hosts)
496493
error = "orte_rtc_base_select";
497494
goto error;
498495
}
496+
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) {
497+
ORTE_ERROR_LOG(ret);
498+
error = "orte_rmaps_base_open";
499+
goto error;
500+
}
501+
if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) {
502+
ORTE_ERROR_LOG(ret);
503+
error = "orte_rmaps_base_find_available";
504+
goto error;
505+
}
506+
507+
/* if a topology file was given, then the rmaps framework open
508+
* will have reset our topology. Ensure we always get the right
509+
* one by setting our node topology afterwards
510+
*/
511+
t = OBJ_NEW(orte_topology_t);
512+
t->topo = opal_hwloc_topology;
513+
/* generate the signature */
514+
orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology);
515+
t->sig = strdup(orte_topo_signature);
516+
opal_pointer_array_add(orte_node_topologies, t);
517+
node->topology = t;
518+
if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
519+
opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
520+
opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
521+
}
499522

500-
/* if we are using static ports, then we need to setup
523+
/* if we were given the host list, then we need to setup
501524
* the daemon info so the RML can function properly
502525
* without requiring a wireup stage. This must be done
503526
* after we enable_comm as that function determines our
504527
* own port, which we need in order to construct the nidmap
505528
*/
506-
if (orte_static_ports) {
529+
if (NULL != hosts) {
507530
/* extract the node info from the environment and
508531
* build a nidmap from it - this will update the
509532
* routing plan as well

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,8 @@ static int rte_init(void)
427427
/* create and store a node object where we are */
428428
node = OBJ_NEW(orte_node_t);
429429
node->name = strdup(orte_process_info.nodename);
430-
node->index = opal_pointer_array_set_item(orte_node_pool, 0, node);
430+
node->index = ORTE_PROC_MY_NAME->vpid;
431+
opal_pointer_array_set_item(orte_node_pool, 0, node);
431432

432433
/* create and store a proc object for us */
433434
proc = OBJ_NEW(orte_proc_t);

orte/mca/grpcomm/base/grpcomm_base_stubs.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,8 @@ static int pack_xcast(orte_grpcomm_signature_t *sig,
545545
OBJ_DESTRUCT(&data);
546546
}
547547

548+
OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
549+
"MSG SIZE: %lu", buffer->bytes_used));
548550
return ORTE_SUCCESS;
549551
}
550552

0 commit comments

Comments
 (0)