Skip to content

Commit df8ac7b

Browse files
author
Ralph Castain
committed
Properly mark a node as down and decrement the number of daemons so that any
subsequent grpcomm collectives can operate correctly. Note that only the direct grpcomm component knows how to handle down nodes.
1 parent 2a9f818 commit df8ac7b

File tree

2 files changed

+6
-1
lines changed

2 files changed

+6
-1
lines changed

orte/mca/errmgr/dvm/errmgr_dvm.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,10 @@ static void proc_errors(int fd, short args, void *cbdata)
331 331   }
332 332   /* mark the daemon as gone */
333 333   ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
    334 + /* update the state */
    335 + pptr->state = state;
    336 + /* adjust our num_procs */
    337 + --orte_process_info.num_procs;
334 338   /* if we have ordered orteds to terminate or abort
335 339   * is in progress, record it */
336 340   if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {

orte/mca/grpcomm/direct/grpcomm_direct.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
432 432   OBJ_RELEASE(item);
433 433   continue;
434 434   }
435     - if (ORTE_PROC_STATE_RUNNING < rec->state) {
    435 + if (ORTE_PROC_STATE_RUNNING < rec->state ||
    436 + !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
436 437   opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
437 438   ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
438 439   OBJ_RELEASE(rly);

0 commit comments

Comments
 (0)