Skip to content

Commit 7c795f4

Browse files
author
Ralph Castain
committed
If the HNP is going to request topology info, it cannot do so via a routed OOB message because the intervening daemons may not be ready. Therefore, disable routing until the VM is ready, and have the daemons start routing when they receive the xcast launch message (which includes the data they need to talk to their peers).
Do a little optimization and minimize recomputation of the routing plan. Signed-off-by: Ralph Castain <[email protected]>
1 parent d672fad commit 7c795f4

File tree

13 files changed

+44
-35
lines changed

13 files changed

+44
-35
lines changed

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -497,12 +497,6 @@ int orte_ess_base_orted_setup(char **hosts)
497497
goto error;
498498
}
499499

500-
/* be sure to update the routing tree so the initial "phone home"
501-
* to mpirun goes through the tree if static ports were enabled - still
502-
* need to do it anyway just to initialize things
503-
*/
504-
orte_routed.update_routing_plan(NULL);
505-
506500
/* if we are using static ports, then we need to setup
507501
* the daemon info so the RML can function properly
508502
* without requiring a wireup stage. This must be done
@@ -519,6 +513,12 @@ int orte_ess_base_orted_setup(char **hosts)
519513
error = "construct daemon map from static ports";
520514
goto error;
521515
}
516+
/* be sure to update the routing tree so the initial "phone home"
517+
* to mpirun goes through the tree if static ports were enabled
518+
*/
519+
orte_routed.update_routing_plan(NULL);
520+
/* routing can be enabled */
521+
orte_routed_base.routing_enabled = true;
522522
}
523523

524524
/* Now provide a chance for the PLM

orte/mca/grpcomm/direct/grpcomm_direct.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
#include "orte/mca/errmgr/errmgr.h"
2828
#include "orte/mca/rml/base/base.h"
2929
#include "orte/mca/rml/base/rml_contact.h"
30-
#include "orte/mca/routed/routed.h"
30+
#include "orte/mca/routed/base/base.h"
3131
#include "orte/mca/state/state.h"
3232
#include "orte/util/compress.h"
3333
#include "orte/util/name_fns.h"
@@ -386,8 +386,14 @@ static void xcast_recv(int status, orte_process_name_t* sender,
386386
goto relay;
387387
}
388388

389-
/* update the routing plan */
390-
orte_routed.update_routing_plan(rtmod);
389+
if (!ORTE_PROC_IS_HNP) {
390+
/* update the routing plan - the HNP already did
391+
* it when it computed the VM, so don't waste time
392+
* re-doing it here */
393+
orte_routed.update_routing_plan(rtmod);
394+
}
395+
/* routing is now possible */
396+
orte_routed_base.routing_enabled = true;
391397

392398
/* see if we have wiring info as well */
393399
cnt=1;

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -410,15 +410,6 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
410410
return;
411411
}
412412

413-
orte_process_info.num_procs = jdatorted->num_procs;
414-
415-
if (orte_process_info.max_procs < orte_process_info.num_procs) {
416-
orte_process_info.max_procs = orte_process_info.num_procs;
417-
}
418-
419-
/* ensure all routing plans are up-to-date */
420-
orte_routed.update_routing_plan(NULL);
421-
422413
/* If this job is being started by me, then there is nothing
423414
* further we need to do as any user directives (e.g., to tie
424415
* off IO to /dev/null) will have been included in the launch
@@ -2158,7 +2149,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
21582149
orte_process_info.max_procs = orte_process_info.num_procs;
21592150
}
21602151

2161-
/* ensure all routing plans are up-to-date */
2152+
/* ensure all routing plans are up-to-date - we need this
2153+
* so we know how to tree-spawn and/or xcast info */
21622154
orte_routed.update_routing_plan(NULL);
21632155
}
21642156

orte/mca/plm/rsh/plm_rsh_module.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -801,17 +801,20 @@ static int remote_spawn(opal_buffer_t *launch)
801801
goto cleanup;
802802
}
803803

804-
/* get the updated routing list */
805-
rtmod = orte_rml.get_routed(orte_coll_conduit);
806-
OBJ_CONSTRUCT(&coll, opal_list_t);
807-
orte_routed.get_routing_list(rtmod, &coll);
808-
809804
/* extract and update the daemon map */
810805
if (ORTE_SUCCESS != (rc = orte_util_decode_daemon_nodemap(launch))) {
811806
ORTE_ERROR_LOG(rc);
812807
goto cleanup;
813808
}
814809

810+
/* since we are tree-spawning, we need to update the routing plan */
811+
orte_routed.update_routing_plan(NULL);
812+
813+
/* get the updated routing list */
814+
rtmod = orte_rml.get_routed(orte_coll_conduit);
815+
OBJ_CONSTRUCT(&coll, opal_list_t);
816+
orte_routed.get_routing_list(rtmod, &coll);
817+
815818
/* if I have no children, just return */
816819
if (0 == opal_list_get_size(&coll)) {
817820
OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,

orte/mca/routed/base/base.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* Copyright (c) 2007-2013 Los Alamos National Security, LLC.
33
* All rights reserved.
4-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
4+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
55
* $COPYRIGHT$
66
*
77
* Additional copyrights may follow
@@ -41,6 +41,7 @@ OBJ_CLASS_DECLARATION(orte_routed_base_active_t);
4141

4242
typedef struct {
4343
opal_list_t actives;
44+
bool routing_enabled;
4445
} orte_routed_base_t;
4546
ORTE_DECLSPEC extern orte_routed_base_t orte_routed_base;
4647

orte/mca/routed/base/routed_base_fns.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -110,7 +110,7 @@ orte_process_name_t orte_routed_base_get_route(char *module, orte_process_name_t
110110
orte_routed_base_active_t *active;
111111

112112
/* a NULL module corresponds to direct */
113-
if (NULL == module) {
113+
if (!orte_routed_base.routing_enabled || NULL == module) {
114114
return *target;
115115
}
116116

@@ -178,6 +178,7 @@ void orte_routed_base_update_routing_plan(char *module)
178178
}
179179
}
180180
}
181+
181182
return;
182183
}
183184

orte/mca/routed/base/routed_base_frame.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* reserved.
1111
* Copyright (c) 2015 Research Organization for Information Science
1212
* and Technology (RIST). All rights reserved.
13-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
1414
* $COPYRIGHT$
1515
*
1616
* Additional copyrights may follow
@@ -58,6 +58,8 @@ static int orte_routed_base_open(mca_base_open_flag_t flags)
5858
{
5959
/* setup our list of actives */
6060
OBJ_CONSTRUCT(&orte_routed_base.actives, opal_list_t);
61+
/* start with routing DISABLED */
62+
orte_routed_base.routing_enabled = false;
6163

6264
/* Open up all available components */
6365
return mca_base_framework_components_open(&orte_routed_base_framework, flags);

orte/mca/routed/debruijn/routed_debruijn_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ static int orte_routed_debruijn_component_query(mca_base_module_t **module, int
4949
* systems, we will allow other options that have even fewer hops to
5050
* support wireup
5151
*/
52-
*priority = 70;
52+
*priority = 10;
5353
*module = (mca_base_module_t *) &orte_routed_debruijn_module;
5454
return ORTE_SUCCESS;
5555
}

orte/mca/routed/radix/routed_radix.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* reserved.
77
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
88
* reserved.
9-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
9+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1010
* $COPYRIGHT$
1111
*
1212
* Additional copyrights may follow
@@ -538,4 +538,3 @@ static int radix_ft_event(int state)
538538
return exit_status;
539539
}
540540
#endif
541-

orte/mca/routed/radix/routed_radix_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ static int orte_routed_radix_component_query(mca_base_module_t **module, int *pr
6969
return ORTE_ERR_BAD_PARAM;
7070
}
7171

72-
*priority = 50;
72+
*priority = 70;
7373
*module = (mca_base_module_t *) &orte_routed_radix_module;
7474
return ORTE_SUCCESS;
7575
}

0 commit comments

Comments
 (0)