Skip to content

Commit 4a65b19

Browse files
author
rhc54
authored
Merge pull request #2271 from rhc54/topic/ft
Properly mark a node as down and decrease the number of daemons so any […] (commit message truncated)
2 parents 900ae15 + df8ac7b commit 4a65b19

File tree

2 files changed: 6 additions and 1 deletion (+6 -1 lines changed)

orte/mca/errmgr/dvm/errmgr_dvm.c

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -331,6 +331,10 @@ static void proc_errors(int fd, short args, void *cbdata)
331 331   }
332 332   /* mark the daemon as gone */
333 333   ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
    334 + /* update the state */
    335 + pptr->state = state;
    336 + /* adjust our num_procs */
    337 + --orte_process_info.num_procs;
334 338   /* if we have ordered orteds to terminate or abort
335 339    * is in progress, record it */
if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {

orte/mca/grpcomm/direct/grpcomm_direct.c

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -432,7 +432,8 @@ static void xcast_recv(int status, orte_process_name_t* sender,
432 432   OBJ_RELEASE(item);
433 433   continue;
434 434   }
435     - if (ORTE_PROC_STATE_RUNNING < rec->state) {
    435 + if (ORTE_PROC_STATE_RUNNING < rec->state ||
    436 + !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
436 437   opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
437 438   ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
438 439   OBJ_RELEASE(rly);

0 commit comments

Comments
 (0)