Skip to content

Commit c1b8538

Browse files
author
rhc54
authored
Merge pull request #2600 from rhc54/topic/dbg
Transfer debugger support changes
2 parents 54c4925 + c6f6f40 commit c1b8538

File tree

4 files changed

+76
-8
lines changed

4 files changed

+76
-8
lines changed

opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -455,8 +455,8 @@ static void _notify_client_event(int sd, short args, void *cbdata)
455455
bool matched;
456456

457457
pmix_output_verbose(2, pmix_globals.debug_output,
458-
"pmix_server: _notify_error notifying clients of error %d",
459-
cd->status);
458+
"pmix_server: _notify_error notifying clients of error %s",
459+
PMIx_Error_string(cd->status));
460460

461461
/* we cannot know if everyone who wants this notice has had a chance
462462
* to register for it - the notice may be coming too early. So cache

opal/mca/pmix/pmix2x/pmix/src/util/error.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
#endif
3333

3434
#include <pmix_common.h>
35-
35+
#include "src/include/pmix_globals.h"
3636
#include "src/util/error.h"
3737

3838
const char* PMIx_Error_string(pmix_status_t errnum)
@@ -151,8 +151,12 @@ const char* PMIx_Error_string(pmix_status_t errnum)
151151
return "PMIX_ERR_FILE_READ_FAILURE";
152152
case PMIX_ERR_PERM:
153153
return "PMIX_ERR_PERM";
154+
case PMIX_ERR_JOB_TERMINATED:
155+
return "PMIX_ERR_JOB_TERMINATED";
154156
case PMIX_SUCCESS:
155157
return "SUCCESS";
158+
case PMIX_MAX_ERR_CONSTANT:
159+
return "PMIX_ERR_WILDCARD";
156160
default:
157161
return "ERROR STRING NOT FOUND";
158162
}

orte/mca/state/base/state_base_fns.c

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -518,16 +518,74 @@ static void _send_notification(int status, orte_process_name_t *proc)
518518
OBJ_DESTRUCT(&buf);
519519
}
520520

521+
/*
 * Build a notification message and send it to the daemon that hosts
 * the given process.
 *
 * The message is packed in this order:
 *   1. the status code (OPAL_INT)
 *   2. this process's own name as the source (ORTE_NAME)
 *   3. the number of opal_value_t entries that follow, always 1 (OPAL_INT)
 *   4. one opal_value_t keyed OPAL_PMIX_EVENT_CUSTOM_RANGE that carries
 *      the name of the process to be notified (OPAL_VALUE)
 *
 * @param status  status code to deliver
 * @param proc    name of the process to be notified
 *
 * Any packing or send failure is logged via ORTE_ERROR_LOG and the
 * buffer is released; nothing is reported back to the caller.
 */
static void _send_direct_notify(int status, orte_process_name_t *proc)
{
    opal_buffer_t *msg = OBJ_NEW(opal_buffer_t);
    opal_value_t target;
    opal_value_t *target_ptr = &target;
    orte_process_name_t host_daemon;
    int num_values = 1;   /* exactly one opal_value_t follows the header */
    int ret;

    /* status goes first */
    ret = opal_dss.pack(msg, &status, 1, OPAL_INT);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        goto discard;
    }

    /* then the source of the notification, which is this process */
    ret = opal_dss.pack(msg, ORTE_PROC_MY_NAME, 1, ORTE_NAME);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        goto discard;
    }

    /* then the count of values describing who is to be notified */
    ret = opal_dss.pack(msg, &num_values, 1, OPAL_INT);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        goto discard;
    }

    /* describe the recipient as a custom-range name value */
    OBJ_CONSTRUCT(&target, opal_value_t);
    target.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
    target.type = OPAL_NAME;
    target.data.name.jobid = proc->jobid;
    target.data.name.vpid = proc->vpid;
    ret = opal_dss.pack(msg, &target_ptr, 1, OPAL_VALUE);
    /* the local value is no longer needed whether or not the pack worked */
    OBJ_DESTRUCT(&target);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
        goto discard;
    }

    /* the destination is the daemon (in our own job) hosting the proc.
     * NOTE(review): confirm orte_get_proc_daemon_vpid() yields a usable
     * vpid when proc is a wildcard name. */
    host_daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    host_daemon.vpid = orte_get_proc_daemon_vpid(proc);

    ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                  &host_daemon, msg,
                                  ORTE_RML_TAG_NOTIFICATION,
                                  orte_rml_send_callback, NULL);
    if (ORTE_SUCCESS == ret) {
        /* presumably the send callback now owns the buffer - verify */
        return;
    }
    ORTE_ERROR_LOG(ret);

discard:
    OBJ_RELEASE(msg);
}
578+
521579
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
522580
{
523581
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
524582
orte_process_name_t *proc = &caddy->name;
525-
orte_process_name_t wildcard_rank;
526583
orte_proc_state_t state = caddy->proc_state;
527584
orte_job_t *jdata;
528585
orte_proc_t *pdata;
529586
int i;
530587
char *rtmod;
588+
orte_process_name_t parent, *npptr;
531589

532590
opal_output_verbose(5, orte_state_base_framework.framework_output,
533591
"%s state:base:track_procs called for proc %s state %s",
@@ -636,9 +694,15 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
636694
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
637695
/* if they requested notification upon completion, provide it */
638696
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
639-
wildcard_rank.jobid = jdata->jobid;
640-
wildcard_rank.vpid = ORTE_VPID_WILDCARD;
641-
_send_notification(OPAL_ERR_JOB_TERMINATED, &wildcard_rank);
697+
/* notify_completion => notify the parent of the termination
698+
* of this child job. So get the parent jobid info */
699+
npptr = &parent;
700+
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) {
701+
/* notify everyone who asked for it */
702+
_send_direct_notify(OPAL_ERR_JOB_TERMINATED, ORTE_NAME_WILDCARD);
703+
} else {
704+
_send_direct_notify(OPAL_ERR_JOB_TERMINATED, &parent);
705+
}
642706
}
643707
} else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
644708
!orte_job_term_ordered) {

orte/orted/pmix/pmix_server_dyn.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor,
245245
} else if (0 == strcmp(info->key, OPAL_PMIX_NOTIFY_COMPLETION)) {
246246
if (OPAL_UNDEF == info->type || info->data.flag) {
247247
orte_set_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION,
248-
ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
248+
ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
249249
}
250250
} else if (0 == strcmp(info->key, OPAL_PMIX_DEBUG_STOP_ON_EXEC)) {
251251
/* we don't know how to do this */

0 commit comments

Comments
 (0)