Skip to content

Commit 410befd

Browse files
author
Ralph Castain
authored
Merge pull request #2864 from rhc54/topic/rsh
Repair rsh/ssh tree spawn
2 parents 3440b46 + 7c795f4 commit 410befd

24 files changed

+156
-140
lines changed

orte/mca/errmgr/base/help-errmgr-base.txt

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13-
# Copyright (c) 2014 Intel, Inc. All rights reserved.
13+
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1414
# $COPYRIGHT$
1515
#
1616
# Additional copyrights may follow
@@ -69,3 +69,30 @@ This is usually due to either a failure of the TCP network
6969
connection to the node, or possibly an internal failure of
7070
the daemon itself. We cannot recover from this failure, and
7171
therefore will terminate the job.
72+
#
73+
[no-path]
74+
ORTE does not know how to route a message to the specified daemon
75+
located on the indicated node:
76+
77+
my node: %s
78+
target node: %s
79+
80+
This is usually an internal programming error that should be
81+
reported to the developers. In the meantime, a workaround may
82+
be to set the MCA param routed=direct on the command line or
83+
in your environment. We apologize for the problem.
84+
#
85+
[no-connect]
86+
ORTE is unable to establish a communication connection to the
87+
specified daemon located on the indicated node:
88+
89+
my node: %s
90+
target node: %s
91+
92+
This is usually due to a lack of common network interfaces and/or
93+
no route found between them. Please check network connectivity (including
94+
firewalls and network routing requirements). If these look okay,
95+
then it could be an internal programming error that should be
96+
reported to the developers. In the meantime, a workaround may
97+
be to set the MCA param routed=direct on the command line or
98+
in your environment.

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* Copyright (c) 2011 Oracle and/or all its affiliates. All rights reserved.
1010
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1313
* $COPYRIGHT$
1414
*
1515
* Additional copyrights may follow
@@ -664,6 +664,48 @@ static void proc_errors(int fd, short args, void *cbdata)
664664
}
665665
break;
666666

667+
case ORTE_PROC_STATE_NO_PATH_TO_TARGET:
668+
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
669+
"%s errmgr:hnp: no message path to proc %s",
670+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
671+
ORTE_NAME_PRINT(proc)));
672+
orte_show_help("help-errmgr-base.txt", "no-path", true,
673+
orte_process_info.nodename, pptr->node->name);
674+
/* if this proc is one of my daemons, then we are truly
675+
* hosed - so just exit out
676+
*/
677+
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
678+
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
679+
break;
680+
}
681+
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
682+
/* abnormal termination - abort, but only do it once
683+
* to avoid creating a lot of confusion */
684+
default_hnp_abort(jdata);
685+
}
686+
break;
687+
688+
case ORTE_PROC_STATE_FAILED_TO_CONNECT:
689+
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
690+
"%s errmgr:hnp: cannot connect to proc %s",
691+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
692+
ORTE_NAME_PRINT(proc)));
693+
orte_show_help("help-errmgr-base.txt", "no-connect", true,
694+
orte_process_info.nodename, pptr->node->name);
695+
/* if this proc is one of my daemons, then we are truly
696+
* hosed - so just exit out
697+
*/
698+
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
699+
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
700+
break;
701+
}
702+
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
703+
/* abnormal termination - abort, but only do it once
704+
* to avoid creating a lot of confusion */
705+
default_hnp_abort(jdata);
706+
}
707+
break;
708+
667709
default:
668710
/* shouldn't get this, but terminate job if required */
669711
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -497,12 +497,6 @@ int orte_ess_base_orted_setup(char **hosts)
497497
goto error;
498498
}
499499

500-
/* be sure to update the routing tree so the initial "phone home"
501-
* to mpirun goes through the tree if static ports were enabled - still
502-
* need to do it anyway just to initialize things
503-
*/
504-
orte_routed.update_routing_plan(NULL);
505-
506500
/* if we are using static ports, then we need to setup
507501
* the daemon info so the RML can function properly
508502
* without requiring a wireup stage. This must be done
@@ -519,6 +513,12 @@ int orte_ess_base_orted_setup(char **hosts)
519513
error = "construct daemon map from static ports";
520514
goto error;
521515
}
516+
/* be sure to update the routing tree so the initial "phone home"
517+
* to mpirun goes through the tree if static ports were enabled
518+
*/
519+
orte_routed.update_routing_plan(NULL);
520+
/* routing can be enabled */
521+
orte_routed_base.routing_enabled = true;
522522
}
523523

524524
/* Now provide a chance for the PLM

orte/mca/grpcomm/direct/grpcomm_direct.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
#include "orte/mca/errmgr/errmgr.h"
2828
#include "orte/mca/rml/base/base.h"
2929
#include "orte/mca/rml/base/rml_contact.h"
30-
#include "orte/mca/routed/routed.h"
30+
#include "orte/mca/routed/base/base.h"
3131
#include "orte/mca/state/state.h"
3232
#include "orte/util/compress.h"
3333
#include "orte/util/name_fns.h"
@@ -386,8 +386,14 @@ static void xcast_recv(int status, orte_process_name_t* sender,
386386
goto relay;
387387
}
388388

389-
/* update the routing plan */
390-
orte_routed.update_routing_plan(rtmod);
389+
if (!ORTE_PROC_IS_HNP) {
390+
/* update the routing plan - the HNP already did
391+
* it when it computed the VM, so don't waste time
392+
* re-doing it here */
393+
orte_routed.update_routing_plan(rtmod);
394+
}
395+
/* routing is now possible */
396+
orte_routed_base.routing_enabled = true;
391397

392398
/* see if we have wiring info as well */
393399
cnt=1;

orte/mca/oob/base/oob_base_stubs.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
/*
33
* Copyright (c) 2012-2014 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
5+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
66
* $COPYRIGHT$
77
*
88
* Additional copyrights may follow
@@ -46,9 +46,18 @@ void orte_oob_base_send_nb(int fd, short args, void *cbdata)
4646
OBJ_RELEASE(cd);
4747

4848
opal_output_verbose(5, orte_oob_base_framework.framework_output,
49-
"%s oob:base:send to target %s",
49+
"%s oob:base:send to target %s - %u attempt",
5050
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
51-
ORTE_NAME_PRINT(&msg->dst));
51+
ORTE_NAME_PRINT(&msg->dst), msg->retries);
52+
53+
/* don't try forever - if we have reached the maximum number of retries,
54+
* then report this message as undeliverable even if someone continues
55+
* to think they could reach it */
56+
if (orte_rml_base.max_retries <= msg->retries) {
57+
msg->status = ORTE_ERR_NO_PATH_TO_TARGET;
58+
ORTE_RML_SEND_COMPLETE(msg);
59+
return;
60+
}
5261

5362
/* check if we have this peer in our hash table */
5463
memcpy(&ui64, (char*)&msg->dst, sizeof(uint64_t));

orte/mca/oob/tcp/oob_tcp.c

Lines changed: 0 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -76,14 +76,12 @@ static void accept_connection(const int accepted_fd,
7676
const struct sockaddr *addr);
7777
static void ping(const orte_process_name_t *proc);
7878
static void send_nb(orte_rml_send_t *msg);
79-
static void resend(struct mca_oob_tcp_msg_error_t *mop);
8079
static void ft_event(int state);
8180

8281
mca_oob_tcp_module_t mca_oob_tcp_module = {
8382
.accept_connection = accept_connection,
8483
.ping = ping,
8584
.send_nb = send_nb,
86-
.resend = resend,
8785
.ft_event = ft_event
8886
};
8987

@@ -242,68 +240,6 @@ static void send_nb(orte_rml_send_t *msg)
242240
}
243241
}
244242

245-
static void resend(struct mca_oob_tcp_msg_error_t *mpi)
246-
{
247-
mca_oob_tcp_msg_error_t *mp = (mca_oob_tcp_msg_error_t*)mpi;
248-
mca_oob_tcp_peer_t *peer;
249-
250-
opal_output_verbose(2, orte_oob_base_framework.framework_output,
251-
"%s:tcp processing resend to peer %s",
252-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
253-
ORTE_NAME_PRINT(&mp->hop));
254-
255-
/* do we know this peer? */
256-
if (NULL == (peer = mca_oob_tcp_peer_lookup(&mp->hop))) {
257-
/* push this back to the component so it can try
258-
* another module within this transport. If no
259-
* module can be found, the component can push back
260-
* to the framework so another component can try
261-
*/
262-
opal_output_verbose(2, orte_oob_base_framework.framework_output,
263-
"%s:[%s:%d] peer %s unknown",
264-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
265-
__FILE__, __LINE__,
266-
ORTE_NAME_PRINT(&mp->hop));
267-
ORTE_ACTIVATE_TCP_MSG_ERROR(mp->snd, NULL, &mp->hop, mca_oob_tcp_component_hop_unknown);
268-
return;
269-
}
270-
271-
/* should be impossible, but...has this peer had a progress thread assigned yet? */
272-
if (NULL == peer->ev_base) {
273-
/* nope - assign one */
274-
ORTE_OOB_TCP_NEXT_BASE(peer);
275-
}
276-
277-
/* add the msg to this peer's send queue */
278-
if (MCA_OOB_TCP_CONNECTED == peer->state) {
279-
opal_output_verbose(2, orte_oob_base_framework.framework_output,
280-
"%s tcp:resend: already connected to %s - queueing for send",
281-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
282-
ORTE_NAME_PRINT(&peer->name));
283-
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, true);
284-
return;
285-
}
286-
287-
if (MCA_OOB_TCP_CONNECTING != peer->state &&
288-
MCA_OOB_TCP_CONNECT_ACK != peer->state) {
289-
/* add the message to the queue for sending after the
290-
* connection is formed
291-
*/
292-
MCA_OOB_TCP_QUEUE_MSG(peer, mp->snd, false);
293-
/* we have to initiate the connection - again, we do not
294-
* want to block while the connection is created.
295-
* So throw us into an event that will create
296-
* the connection via a mini-state-machine :-)
297-
*/
298-
opal_output_verbose(2, orte_oob_base_framework.framework_output,
299-
"%s tcp:send_nb: initiating connection to %s",
300-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
301-
ORTE_NAME_PRINT(&peer->name));
302-
peer->state = MCA_OOB_TCP_CONNECTING;
303-
ORTE_ACTIVATE_TCP_CONN_STATE(peer, mca_oob_tcp_peer_try_connect);
304-
}
305-
}
306-
307243
/*
308244
* Event callback when there is data available on the registered
309245
* socket to recv. This is called for the listen sockets to accept an

orte/mca/oob/tcp/oob_tcp.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,12 @@ typedef void (*mca_oob_tcp_module_accept_connection_fn_t)(const int accepted_fd,
5959
const struct sockaddr *addr);
6060
typedef void (*mca_oob_tcp_module_ping_fn_t)(const orte_process_name_t *proc);
6161
typedef void (*mca_oob_tcp_module_send_nb_fn_t)(orte_rml_send_t *msg);
62-
typedef void (*mca_oob_tcp_module_resend_nb_fn_t)(struct mca_oob_tcp_msg_error_t *mop);
6362
typedef void (*mca_oob_tcp_module_ft_event_fn_t)(int state);
6463

6564
typedef struct {
6665
mca_oob_tcp_module_accept_connection_fn_t accept_connection;
6766
mca_oob_tcp_module_ping_fn_t ping;
6867
mca_oob_tcp_module_send_nb_fn_t send_nb;
69-
mca_oob_tcp_module_resend_nb_fn_t resend;
7068
mca_oob_tcp_module_ft_event_fn_t ft_event;
7169
} mca_oob_tcp_module_t;
7270
ORTE_MODULE_DECLSPEC extern mca_oob_tcp_module_t mca_oob_tcp_module;

orte/mca/oob/tcp/oob_tcp_component.c

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,14 +1149,9 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
11491149
/* report the error back to the OOB and let it try other components
11501150
* or declare a problem
11511151
*/
1152-
if (!orte_finalizing && !orte_abnormal_term_ordered) {
1153-
/* if this was a lifeline, then alert */
1154-
if (ORTE_SUCCESS != orte_routed.route_lost(mop->rmsg->routed, &mop->hop)) {
1155-
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
1156-
} else {
1157-
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
1158-
}
1159-
}
1152+
mop->rmsg->retries++;
1153+
/* activate the OOB send state */
1154+
ORTE_OOB_SEND(mop->rmsg);
11601155

11611156
OBJ_RELEASE(mop);
11621157
}
@@ -1219,6 +1214,7 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
12191214
*/
12201215
MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr);
12211216
snd = OBJ_NEW(orte_rml_send_t);
1217+
snd->retries = mop->rmsg->retries + 1;
12221218
snd->dst = mop->snd->hdr.dst;
12231219
snd->origin = mop->snd->hdr.origin;
12241220
snd->tag = mop->snd->hdr.tag;
@@ -1257,12 +1253,7 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
12571253
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
12581254
ORTE_NAME_PRINT(&pop->peer));
12591255

1260-
/* if this was a lifeline, then alert */
1261-
if (ORTE_SUCCESS != orte_routed.route_lost(pop->rtmod, &pop->peer)) {
1262-
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
1263-
} else {
1264-
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
1265-
}
1256+
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_FAILED_TO_CONNECT);
12661257
OBJ_RELEASE(pop);
12671258
}
12681259

orte/mca/oob/tcp/oob_tcp_sendrecv.h

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -291,23 +291,6 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
291291
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \
292292
} while(0);
293293

294-
#define ORTE_ACTIVATE_TCP_POST_RESEND(mop, cbfunc) \
295-
do { \
296-
mca_oob_tcp_msg_error_t *mp; \
297-
opal_output_verbose(5, orte_oob_base_framework.framework_output, \
298-
"%s:[%s:%d] post resend to %s", \
299-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
300-
__FILE__, __LINE__, \
301-
ORTE_NAME_PRINT(&((mop)->hop))); \
302-
mp = OBJ_NEW(mca_oob_tcp_msg_error_t); \
303-
mp->snd = (mop)->snd; \
304-
mp->hop = (mop)->hop; \
305-
opal_event_set(op->snd->peer->ev_base, &mp->ev, -1, \
306-
OPAL_EV_WRITE, (cbfunc), mp); \
307-
opal_event_set_priority(&mp->ev, ORTE_MSG_PRI); \
308-
opal_event_active(&mp->ev, OPAL_EV_WRITE, 1); \
309-
} while(0);
310-
311294
#define ORTE_ACTIVATE_TCP_NO_ROUTE(r, h, c) \
312295
do { \
313296
mca_oob_tcp_msg_error_t *mop; \
@@ -320,8 +303,9 @@ OBJ_CLASS_DECLARATION(mca_oob_tcp_msg_error_t);
320303
mop->rmsg = (r); \
321304
mop->hop.jobid = (h)->jobid; \
322305
mop->hop.vpid = (h)->vpid; \
323-
/* this goes to the OOB framework, so use that event base */ \
324-
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
306+
/* this goes to the component, so use the framework \
307+
* event base */ \
308+
opal_event_set(orte_oob_base.ev_base, &mop->ev, -1, \
325309
OPAL_EV_WRITE, (c), mop); \
326310
opal_event_set_priority(&mop->ev, ORTE_MSG_PRI); \
327311
opal_event_active(&mop->ev, OPAL_EV_WRITE, 1); \

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -410,15 +410,6 @@ void orte_plm_base_complete_setup(int fd, short args, void *cbdata)
410410
return;
411411
}
412412

413-
orte_process_info.num_procs = jdatorted->num_procs;
414-
415-
if (orte_process_info.max_procs < orte_process_info.num_procs) {
416-
orte_process_info.max_procs = orte_process_info.num_procs;
417-
}
418-
419-
/* ensure all routing plans are up-to-date */
420-
orte_routed.update_routing_plan(NULL);
421-
422413
/* If this job is being started by me, then there is nothing
423414
* further we need to do as any user directives (e.g., to tie
424415
* off IO to /dev/null) will have been included in the launch
@@ -2158,7 +2149,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata)
21582149
orte_process_info.max_procs = orte_process_info.num_procs;
21592150
}
21602151

2161-
/* ensure all routing plans are up-to-date */
2152+
/* ensure all routing plans are up-to-date - we need this
2153+
* so we know how to tree-spawn and/or xcast info */
21622154
orte_routed.update_routing_plan(NULL);
21632155
}
21642156

0 commit comments

Comments
 (0)