@@ -518,16 +518,74 @@ static void _send_notification(int status, orte_process_name_t *proc)
518518 OBJ_DESTRUCT (& buf );
519519}
520520
521+ static void _send_direct_notify (int status , orte_process_name_t * proc )
522+ {
523+ opal_buffer_t * buf ;
524+ int rc ;
525+ opal_value_t kv , * kvptr ;
526+ orte_process_name_t daemon ;
527+
528+ buf = OBJ_NEW (opal_buffer_t );
529+
530+ /* pack the status */
531+ if (ORTE_SUCCESS != (rc = opal_dss .pack (buf , & status , 1 , OPAL_INT ))) {
532+ ORTE_ERROR_LOG (rc );
533+ OBJ_RELEASE (buf );
534+ return ;
535+ }
536+
537+ /* the source is me */
538+ if (ORTE_SUCCESS != (rc = opal_dss .pack (buf , ORTE_PROC_MY_NAME , 1 , ORTE_NAME ))) {
539+ ORTE_ERROR_LOG (rc );
540+ OBJ_RELEASE (buf );
541+ return ;
542+ }
543+
544+ /* pass along the proc to be notified (one opal_value_t) */
545+ rc = 1 ;
546+ if (ORTE_SUCCESS != (rc = opal_dss .pack (buf , & rc , 1 , OPAL_INT ))) {
547+ ORTE_ERROR_LOG (rc );
548+ OBJ_RELEASE (buf );
549+ return ;
550+ }
551+ OBJ_CONSTRUCT (& kv , opal_value_t );
552+ kv .key = strdup (OPAL_PMIX_EVENT_CUSTOM_RANGE );
553+ kv .type = OPAL_NAME ;
554+ kv .data .name .jobid = proc -> jobid ;
555+ kv .data .name .vpid = proc -> vpid ;
556+ kvptr = & kv ;
557+ if (ORTE_SUCCESS != (rc = opal_dss .pack (buf , & kvptr , 1 , OPAL_VALUE ))) {
558+ ORTE_ERROR_LOG (rc );
559+ OBJ_DESTRUCT (& kv );
560+ OBJ_RELEASE (buf );
561+ return ;
562+ }
563+ OBJ_DESTRUCT (& kv );
564+
565+
566+ /* get the daemon hosting the proc to be notified */
567+ daemon .jobid = ORTE_PROC_MY_NAME -> jobid ;
568+ daemon .vpid = orte_get_proc_daemon_vpid (proc );
569+ /* send the notification to that daemon */
570+ if (ORTE_SUCCESS != (rc = orte_rml .send_buffer_nb (orte_mgmt_conduit ,
571+ & daemon , buf ,
572+ ORTE_RML_TAG_NOTIFICATION ,
573+ orte_rml_send_callback , NULL ))) {
574+ ORTE_ERROR_LOG (rc );
575+ OBJ_RELEASE (buf );
576+ }
577+ }
578+
521579void orte_state_base_track_procs (int fd , short argc , void * cbdata )
522580{
523581 orte_state_caddy_t * caddy = (orte_state_caddy_t * )cbdata ;
524582 orte_process_name_t * proc = & caddy -> name ;
525- orte_process_name_t wildcard_rank ;
526583 orte_proc_state_t state = caddy -> proc_state ;
527584 orte_job_t * jdata ;
528585 orte_proc_t * pdata ;
529586 int i ;
530587 char * rtmod ;
588+ orte_process_name_t parent , * npptr ;
531589
532590 opal_output_verbose (5 , orte_state_base_framework .framework_output ,
533591 "%s state:base:track_procs called for proc %s state %s" ,
@@ -636,9 +694,15 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
636694 ORTE_ACTIVATE_JOB_STATE (jdata , ORTE_JOB_STATE_TERMINATED );
637695 /* if they requested notification upon completion, provide it */
638696 if (orte_get_attribute (& jdata -> attributes , ORTE_JOB_NOTIFY_COMPLETION , NULL , OPAL_BOOL )) {
639- wildcard_rank .jobid = jdata -> jobid ;
640- wildcard_rank .vpid = ORTE_VPID_WILDCARD ;
641- _send_notification (OPAL_ERR_JOB_TERMINATED , & wildcard_rank );
697+ /* notify_completion => notify the parent of the termination
698+ * of this child job. So get the parent jobid info */
699+ npptr = & parent ;
700+ if (!orte_get_attribute (& jdata -> attributes , ORTE_JOB_LAUNCH_PROXY , (void * * )& npptr , OPAL_NAME )) {
701+ /* notify everyone who asked for it */
702+ _send_direct_notify (OPAL_ERR_JOB_TERMINATED , ORTE_NAME_WILDCARD );
703+ } else {
704+ _send_direct_notify (OPAL_ERR_JOB_TERMINATED , & parent );
705+ }
642706 }
643707 } else if (ORTE_PROC_STATE_TERMINATED < pdata -> state &&
644708 !orte_job_term_ordered ) {
0 commit comments