Skip to content

Commit eba6c6b

Browse files
author
Ralph Castain
authored
Merge pull request #3301 from rhc54/topic/faults
Correctly identify the source of the event when notifying of abnormal termination by a process
2 parents 666386f + b526bca commit eba6c6b

File tree

2 files changed

+43
-11
lines changed

2 files changed

+43
-11
lines changed

orte/mca/state/base/state_base_fns.c

Lines changed: 36 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -460,6 +460,7 @@ void orte_state_base_report_progress(int fd, short argc, void *cbdata)
460460
}
461461

462462
static void _send_notification(int status,
463+
orte_proc_state_t state,
463464
orte_process_name_t *proc,
464465
orte_process_name_t *target)
465466
{
@@ -485,19 +486,43 @@ static void _send_notification(int status,
485486
return;
486487
}
487488

488-
/* the source is me */
489-
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
489+
/* the source is the proc */
490+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, proc, 1, ORTE_NAME))) {
490491
ORTE_ERROR_LOG(rc);
491492
OBJ_RELEASE(buf);
492493
return;
493494
}
494495

495-
/* we are going to pass three opal_value_t's */
496-
rc = 3;
497-
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
498-
ORTE_ERROR_LOG(rc);
499-
OBJ_RELEASE(buf);
500-
return;
496+
if (OPAL_ERR_PROC_ABORTED == status) {
497+
/* we will pass four opal_value_t's */
498+
rc = 4;
499+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
500+
ORTE_ERROR_LOG(rc);
501+
OBJ_RELEASE(buf);
502+
return;
503+
}
504+
/* pass along the affected proc(s) */
505+
OBJ_CONSTRUCT(&kv, opal_value_t);
506+
kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
507+
kv.type = OPAL_NAME;
508+
kv.data.name.jobid = proc->jobid;
509+
kv.data.name.vpid = proc->vpid;
510+
kvptr = &kv;
511+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
512+
ORTE_ERROR_LOG(rc);
513+
OBJ_DESTRUCT(&kv);
514+
OBJ_RELEASE(buf);
515+
return;
516+
}
517+
OBJ_DESTRUCT(&kv);
518+
} else {
519+
/* we are going to pass three opal_value_t's */
520+
rc = 3;
521+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
522+
ORTE_ERROR_LOG(rc);
523+
OBJ_RELEASE(buf);
524+
return;
525+
}
501526
}
502527

503528
/* pass along the affected proc(s) */
@@ -699,19 +724,19 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
699724
/* notify everyone who asked for it */
700725
target.jobid = jdata->jobid;
701726
target.vpid = ORTE_VPID_WILDCARD;
702-
_send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD);
727+
_send_notification(OPAL_ERR_JOB_TERMINATED, pdata->state, &target, ORTE_NAME_WILDCARD);
703728
} else {
704729
target.jobid = jdata->jobid;
705730
target.vpid = ORTE_VPID_WILDCARD;
706-
_send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent);
731+
_send_notification(OPAL_ERR_JOB_TERMINATED, pdata->state, &target, &parent);
707732
}
708733
}
709734
} else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
710735
!orte_job_term_ordered) {
711736
/* if this was an abnormal term, notify the other procs of the termination */
712737
parent.jobid = jdata->jobid;
713738
parent.vpid = ORTE_VPID_WILDCARD;
714-
_send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent);
739+
_send_notification(OPAL_ERR_PROC_ABORTED, pdata->state, &pdata->name, &parent);
715740
}
716741
}
717742

orte/util/nidmap.c

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -641,6 +641,13 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
641641
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
642642
while (NULL != (item = opal_list_remove_first(&topos))) {
643643
rng = (orte_regex_range_t*)item;
644+
if (NULL == rng->t) {
645+
/* when we pass thru here prior to launching the daemons, we
646+
* won't have topologies for them and so this entry might
647+
* be NULL - protect ourselves */
648+
OBJ_RELEASE(item);
649+
continue;
650+
}
644651
if (NULL == tmp) {
645652
asprintf(&tmp, "%d", rng->cnt);
646653
} else {

0 commit comments

Comments
 (0)