Skip to content

Commit 0acdceb

Browse files
Author: rhc54
Merge pull request #2601 from rhc54/topic/dbgr
Transfer across final fixes from debugger attach work
2 parents (c1b8538 + 256b5ad); commit 0acdceb

File tree

5 files changed (+126 -80 lines changed)

5 files changed (+126 -80 lines changed)

opal/mca/pmix/pmix2x/pmix/src/event/pmix_event_notification.c

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -501,8 +501,9 @@ static void _notify_client_event(int sd, short args, void *cbdata)
501501
}
502502
}
503503
pmix_output_verbose(2, pmix_globals.debug_output,
504-
"pmix_server: notifying client %s:%d",
505-
pr->peer->info->nptr->nspace, pr->peer->info->rank);
504+
"pmix_server: notifying client %s:%d of status %s",
505+
pr->peer->info->nptr->nspace, pr->peer->info->rank,
506+
PMIx_Error_string(cd->status));
506507
PMIX_RETAIN(cd->buf);
507508
PMIX_SERVER_QUEUE_REPLY(pr->peer, 0, cd->buf);
508509
}
@@ -555,7 +556,7 @@ static pmix_status_t notify_client_of_event(pmix_status_t status,
555556
for (n=0; n < ninfo; n++) {
556557
if (0 == strncmp(info[n].key, PMIX_EVENT_NON_DEFAULT, PMIX_MAX_KEYLEN)) {
557558
cd->nondefault = true;
558-
} else if (strncmp(info[n].key, PMIX_EVENT_CUSTOM_RANGE, PMIX_MAX_KEYLEN)) {
559+
} else if (0 == strncmp(info[n].key, PMIX_EVENT_CUSTOM_RANGE, PMIX_MAX_KEYLEN)) {
559560
/* provides an array of pmix_proc_t identifying the procs
560561
* that are to receive this notification */
561562
if (PMIX_DATA_ARRAY == info[n].value.type &&
@@ -570,6 +571,7 @@ static pmix_status_t notify_client_of_event(pmix_status_t status,
570571
memcpy(cd->targets, info[n].value.data.proc, sizeof(pmix_proc_t));
571572
} else {
572573
/* this is an error */
574+
PMIX_ERROR_LOG(PMIX_ERR_BAD_PARAM);
573575
return PMIX_ERR_BAD_PARAM;
574576
}
575577
}

opal/mca/pmix/pmix2x/pmix2x.c

Lines changed: 30 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -733,6 +733,9 @@ pmix_persistence_t pmix2x_convert_opalpersist(opal_pmix_persistence_t persist)
733733
void pmix2x_value_load(pmix_value_t *v,
734734
opal_value_t *kv)
735735
{
736+
opal_pmix2x_jobid_trkr_t *job;
737+
bool found;
738+
736739
switch(kv->type) {
737740
case OPAL_UNDEF:
738741
v->type = PMIX_UNDEF;
@@ -829,7 +832,18 @@ void pmix2x_value_load(pmix_value_t *v,
829832
v->type = PMIX_PROC;
830833
/* have to stringify the jobid */
831834
PMIX_PROC_CREATE(v->data.proc, 1);
832-
(void)opal_snprintf_jobid(v->data.proc->nspace, PMIX_MAX_NSLEN, kv->data.name.vpid);
835+
/* see if this job is in our list of known nspaces */
836+
found = false;
837+
OPAL_LIST_FOREACH(job, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
838+
if (job->jobid == kv->data.name.jobid) {
839+
(void)strncpy(v->data.proc->nspace, job->nspace, PMIX_MAX_NSLEN);
840+
found = true;
841+
break;
842+
}
843+
}
844+
if (!found) {
845+
(void)opal_snprintf_jobid(v->data.proc->nspace, PMIX_MAX_NSLEN, kv->data.name.vpid);
846+
}
833847
v->data.proc->rank = pmix2x_convert_opalrank(kv->data.name.vpid);
834848
break;
835849
case OPAL_BYTE_OBJECT:
@@ -875,7 +889,8 @@ int pmix2x_value_unload(opal_value_t *kv,
875889
const pmix_value_t *v)
876890
{
877891
int rc=OPAL_SUCCESS;
878-
892+
bool found;
893+
opal_pmix2x_jobid_trkr_t *job;
879894

880895
switch(v->type) {
881896
case PMIX_UNDEF:
@@ -969,8 +984,19 @@ int pmix2x_value_unload(opal_value_t *kv,
969984
break;
970985
case PMIX_PROC:
971986
kv->type = OPAL_NAME;
972-
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&kv->data.name.jobid, v->data.proc->nspace))) {
973-
return pmix2x_convert_opalrc(rc);
987+
/* see if this job is in our list of known nspaces */
988+
found = false;
989+
OPAL_LIST_FOREACH(job, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
990+
if (0 == strncmp(job->nspace, v->data.proc->nspace, PMIX_MAX_NSLEN)) {
991+
kv->data.name.jobid = job->jobid;
992+
found = true;
993+
break;
994+
}
995+
}
996+
if (!found) {
997+
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&kv->data.name.jobid, v->data.proc->nspace))) {
998+
return pmix2x_convert_opalrc(rc);
999+
}
9741000
}
9751001
kv->data.name.vpid = pmix2x_convert_rank(v->data.proc->rank);
9761002
break;

opal/mca/pmix/pmix2x/pmix2x_server_north.c

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -920,13 +920,22 @@ static void toolcbfunc(int status,
920920
pmix2x_opalcaddy_t *opalcaddy = (pmix2x_opalcaddy_t*)cbdata;
921921
pmix_status_t rc;
922922
pmix_proc_t p;
923+
opal_pmix2x_jobid_trkr_t *job;
923924

924925
/* convert the status */
925926
rc = pmix2x_convert_opalrc(status);
926927

927-
/* convert the process name */
928-
(void)opal_snprintf_jobid(p.nspace, PMIX_MAX_NSLEN, proc.jobid);
929-
p.rank = pmix2x_convert_opalrank(proc.vpid);
928+
memset(&p, 0, sizeof(pmix_proc_t));
929+
if (OPAL_SUCCESS == status) {
930+
/* convert the process name */
931+
(void)opal_snprintf_jobid(p.nspace, PMIX_MAX_NSLEN, proc.jobid);
932+
p.rank = pmix2x_convert_opalrank(proc.vpid);
933+
/* store this job in our list of known nspaces */
934+
job = OBJ_NEW(opal_pmix2x_jobid_trkr_t);
935+
(void)strncpy(job->nspace, p.nspace, PMIX_MAX_NSLEN);
936+
job->jobid = proc.jobid;
937+
opal_list_append(&mca_pmix_pmix2x_component.jobids, &job->super);
938+
}
930939

931940
/* pass it down */
932941
if (NULL != opalcaddy->toolcbfunc) {

opal/mca/pmix/pmix2x/pmix2x_server_south.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -499,6 +499,7 @@ int pmix2x_server_notify_event(int status,
499499
OPAL_LIST_FOREACH(kv, info, opal_value_t) {
500500
(void)strncpy(pinfo[n].key, kv->key, PMIX_MAX_KEYLEN);
501501
pmix2x_value_load(&pinfo[n].value, kv);
502+
++n;
502503
}
503504
} else {
504505
sz = 0;

orte/mca/state/base/state_base_fns.c

Lines changed: 78 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -459,100 +459,82 @@ void orte_state_base_report_progress(int fd, short argc, void *cbdata)
459459
OBJ_RELEASE(caddy);
460460
}
461461

462-
static void _send_notification(int status, orte_process_name_t *proc)
462+
static void _send_notification(int status,
463+
orte_process_name_t *proc,
464+
orte_process_name_t *target)
463465
{
464-
opal_buffer_t buf;
466+
opal_buffer_t *buf;
465467
orte_grpcomm_signature_t sig;
466468
int rc;
467469
opal_value_t kv, *kvptr;
470+
orte_process_name_t daemon;
471+
472+
buf = OBJ_NEW(opal_buffer_t);
468473

469-
OBJ_CONSTRUCT(&buf, opal_buffer_t);
474+
opal_output_verbose(5, orte_state_base_framework.framework_output,
475+
"%s state:base:sending notification %s proc %s target %s",
476+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
477+
ORTE_ERROR_NAME(status),
478+
ORTE_NAME_PRINT(proc),
479+
ORTE_NAME_PRINT(target));
470480

471481
/* pack the status */
472-
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &status, 1, OPAL_INT))) {
482+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &status, 1, OPAL_INT))) {
473483
ORTE_ERROR_LOG(rc);
474-
OBJ_DESTRUCT(&buf);
484+
OBJ_RELEASE(buf);
475485
return;
476486
}
477487

478488
/* the source is me */
479-
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
489+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
480490
ORTE_ERROR_LOG(rc);
481-
OBJ_DESTRUCT(&buf);
491+
OBJ_RELEASE(buf);
482492
return;
483493
}
484494

485-
/* pass along the affected proc (one opal_value_t) */
486-
rc = 1;
487-
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rc, 1, OPAL_INT))) {
495+
/* we are going to pass three opal_value_t's */
496+
rc = 3;
497+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
488498
ORTE_ERROR_LOG(rc);
489-
OBJ_DESTRUCT(&buf);
499+
OBJ_RELEASE(buf);
490500
return;
491501
}
502+
503+
/* pass along the affected proc(s) */
492504
OBJ_CONSTRUCT(&kv, opal_value_t);
493505
kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC);
494506
kv.type = OPAL_NAME;
495507
kv.data.name.jobid = proc->jobid;
496508
kv.data.name.vpid = proc->vpid;
497509
kvptr = &kv;
498-
if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &kvptr, 1, OPAL_VALUE))) {
510+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
499511
ORTE_ERROR_LOG(rc);
500512
OBJ_DESTRUCT(&kv);
501-
OBJ_DESTRUCT(&buf);
502-
return;
503-
}
504-
OBJ_DESTRUCT(&kv);
505-
506-
507-
/* xcast it to everyone */
508-
OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
509-
sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
510-
sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
511-
sig.signature[0].vpid = ORTE_VPID_WILDCARD;
512-
sig.sz = 1;
513-
514-
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, &buf))) {
515-
ORTE_ERROR_LOG(rc);
516-
}
517-
OBJ_DESTRUCT(&sig);
518-
OBJ_DESTRUCT(&buf);
519-
}
520-
521-
static void _send_direct_notify(int status, orte_process_name_t *proc)
522-
{
523-
opal_buffer_t *buf;
524-
int rc;
525-
opal_value_t kv, *kvptr;
526-
orte_process_name_t daemon;
527-
528-
buf = OBJ_NEW(opal_buffer_t);
529-
530-
/* pack the status */
531-
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &status, 1, OPAL_INT))) {
532-
ORTE_ERROR_LOG(rc);
533513
OBJ_RELEASE(buf);
534514
return;
535515
}
516+
OBJ_DESTRUCT(&kv);
536517

537-
/* the source is me */
538-
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
518+
/* pass along the proc(s) to be notified */
519+
OBJ_CONSTRUCT(&kv, opal_value_t);
520+
kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
521+
kv.type = OPAL_NAME;
522+
kv.data.name.jobid = target->jobid;
523+
kv.data.name.vpid = target->vpid;
524+
kvptr = &kv;
525+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
539526
ORTE_ERROR_LOG(rc);
527+
OBJ_DESTRUCT(&kv);
540528
OBJ_RELEASE(buf);
541529
return;
542530
}
531+
OBJ_DESTRUCT(&kv);
543532

544-
/* pass along the proc to be notified (one opal_value_t) */
545-
rc = 1;
546-
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) {
547-
ORTE_ERROR_LOG(rc);
548-
OBJ_RELEASE(buf);
549-
return;
550-
}
533+
/* mark this as intended for non-default event handlers */
551534
OBJ_CONSTRUCT(&kv, opal_value_t);
552-
kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE);
553-
kv.type = OPAL_NAME;
554-
kv.data.name.jobid = proc->jobid;
555-
kv.data.name.vpid = proc->vpid;
535+
kv.key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
536+
kv.type = OPAL_BOOL;
537+
kv.data.flag = true;
556538
kvptr = &kv;
557539
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) {
558540
ORTE_ERROR_LOG(rc);
@@ -562,17 +544,37 @@ static void _send_direct_notify(int status, orte_process_name_t *proc)
562544
}
563545
OBJ_DESTRUCT(&kv);
564546

547+
/* if the targets are a wildcard, then xcast it to everyone */
548+
if (ORTE_VPID_WILDCARD == target->vpid) {
549+
OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t);
550+
sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
551+
sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
552+
sig.signature[0].vpid = ORTE_VPID_WILDCARD;
553+
sig.sz = 1;
565554

566-
/* get the daemon hosting the proc to be notified */
567-
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
568-
daemon.vpid = orte_get_proc_daemon_vpid(proc);
569-
/* send the notification to that daemon */
570-
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
571-
&daemon, buf,
572-
ORTE_RML_TAG_NOTIFICATION,
573-
orte_rml_send_callback, NULL))) {
574-
ORTE_ERROR_LOG(rc);
555+
if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, buf))) {
556+
ORTE_ERROR_LOG(rc);
557+
}
558+
OBJ_DESTRUCT(&sig);
575559
OBJ_RELEASE(buf);
560+
} else {
561+
/* get the daemon hosting the proc to be notified */
562+
daemon.jobid = ORTE_PROC_MY_NAME->jobid;
563+
daemon.vpid = orte_get_proc_daemon_vpid(target);
564+
/* send the notification to that daemon */
565+
opal_output_verbose(5, orte_state_base_framework.framework_output,
566+
"%s state:base:sending notification %s to proc %s at daemon %s",
567+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
568+
ORTE_ERROR_NAME(status),
569+
ORTE_NAME_PRINT(target),
570+
ORTE_NAME_PRINT(&daemon));
571+
if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
572+
&daemon, buf,
573+
ORTE_RML_TAG_NOTIFICATION,
574+
orte_rml_send_callback, NULL))) {
575+
ORTE_ERROR_LOG(rc);
576+
OBJ_RELEASE(buf);
577+
}
576578
}
577579
}
578580

@@ -585,7 +587,7 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
585587
orte_proc_t *pdata;
586588
int i;
587589
char *rtmod;
588-
orte_process_name_t parent, *npptr;
590+
orte_process_name_t parent, target, *npptr;
589591

590592
opal_output_verbose(5, orte_state_base_framework.framework_output,
591593
"%s state:base:track_procs called for proc %s state %s",
@@ -699,15 +701,21 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
699701
npptr = &parent;
700702
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) {
701703
/* notify everyone who asked for it */
702-
_send_direct_notify(OPAL_ERR_JOB_TERMINATED, ORTE_NAME_WILDCARD);
704+
target.jobid = jdata->jobid;
705+
target.vpid = ORTE_VPID_WILDCARD;
706+
_send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD);
703707
} else {
704-
_send_direct_notify(OPAL_ERR_JOB_TERMINATED, &parent);
708+
target.jobid = jdata->jobid;
709+
target.vpid = ORTE_VPID_WILDCARD;
710+
_send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent);
705711
}
706712
}
707713
} else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
708714
!orte_job_term_ordered) {
709715
/* if this was an abnormal term, notify the other procs of the termination */
710-
_send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name);
716+
parent.jobid = jdata->jobid;
717+
parent.vpid = ORTE_VPID_WILDCARD;
718+
_send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent);
711719
}
712720
}
713721

0 commit comments

Comments
 (0)