Skip to content

Commit 4c1160e

Browse files
committed
Fix tree spawn routed component issue
* Fix open-mpi#6618 - See comments on Issue open-mpi#6618 for finer details. * The `plm/rsh` component uses the highest-priority `routed` component to construct the launch tree. The remote orteds will activate all available `routed` components when updating routes. This allows the opportunity for the parent vpid on the remote `orted` to not match that which was expected in the tree launch. The result is that the remote orted tries to contact its parent with the wrong contact information and orted wireup will fail. * This fix forces the orteds to use the same `routed` component as the HNP used when constructing the tree, if tree launch is enabled. Signed-off-by: Joshua Hursey <[email protected]>
1 parent 390e0bc commit 4c1160e

File tree

1 file changed

+28
-1
lines changed

1 file changed

+28
-1
lines changed

orte/mca/plm/rsh/plm_rsh_module.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,11 +343,12 @@ static int setup_launch(int *argcptr, char ***argvptr,
343343
char *orted_cmd, *orted_prefix, *final_cmd;
344344
int orted_index;
345345
int rc;
346-
int i, j;
346+
int i, j, cnt;
347347
bool found;
348348
char *lib_base=NULL, *bin_base=NULL;
349349
char *opal_prefix = getenv("OPAL_PREFIX");
350350
char* full_orted_cmd = NULL;
351+
char * rtmod;
351352

352353
/* Figure out the basenames for the libdir and bindir. This
353354
requires some explanation:
@@ -609,6 +610,18 @@ static int setup_launch(int *argcptr, char ***argvptr,
609610
(mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) {
610611
}
611612

613+
if (!mca_plm_rsh_component.no_tree_spawn) {
614+
// Remove problematic and/or conflicting command line arguments that
615+
// should not be passed on to our children.
616+
cnt = opal_argv_count(orted_cmd_line);
617+
for (i=0; i < cnt; i+=3) {
618+
if (0 == strcmp(orted_cmd_line[i+1], "routed")) {
619+
opal_argv_delete(&cnt, &orted_cmd_line, i, 3);
620+
break;
621+
}
622+
}
623+
}
624+
612625
/*
613626
* Add the basic arguments to the orted command line, including
614627
* all debug options
@@ -627,6 +640,16 @@ static int setup_launch(int *argcptr, char ***argvptr,
627640
if (!mca_plm_rsh_component.no_tree_spawn) {
628641
opal_argv_append(&argc, &argv, "--tree-spawn");
629642
orte_oob_base_get_addr(&param);
643+
644+
// When tree-spawn'ing we need to force the remote daemons to use
645+
// the routing component that was used to setup the launch tree.
646+
// Otherwise the orte_parent_uri will not match the orted they
647+
// expect to find in the routing tree.
648+
rtmod = orte_rml.get_routed(orte_coll_conduit);
649+
opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
650+
opal_argv_append(&argc, &argv, "routed");
651+
opal_argv_append(&argc, &argv, rtmod);
652+
630653
opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
631654
opal_argv_append(&argc, &argv, "orte_parent_uri");
632655
opal_argv_append(&argc, &argv, param);
@@ -1187,6 +1210,10 @@ static void launch_daemons(int fd, short args, void *cbdata)
11871210
OBJ_CONSTRUCT(&coll, opal_list_t);
11881211
rtmod = orte_rml.get_routed(orte_coll_conduit);
11891212
orte_routed.get_routing_list(rtmod, &coll);
1213+
1214+
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1215+
"%s plm:rsh:launch Tree Launch using routed/%s",
1216+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rtmod));
11901217
}
11911218

11921219
/* setup the launch */

0 commit comments

Comments
 (0)