@@ -518,16 +518,74 @@ static void _send_notification(int status, orte_process_name_t *proc)
518
518
OBJ_DESTRUCT (& buf );
519
519
}
520
520
521
+ static void _send_direct_notify (int status , orte_process_name_t * proc )
522
+ {
523
+ opal_buffer_t * buf ;
524
+ int rc ;
525
+ opal_value_t kv , * kvptr ;
526
+ orte_process_name_t daemon ;
527
+
528
+ buf = OBJ_NEW (opal_buffer_t );
529
+
530
+ /* pack the status */
531
+ if (ORTE_SUCCESS != (rc = opal_dss .pack (buf , & status , 1 , OPAL_INT ))) {
532
+ ORTE_ERROR_LOG (rc );
533
+ OBJ_RELEASE (buf );
534
+ return ;
535
+ }
536
+
537
+ /* the source is me */
538
+ if (ORTE_SUCCESS != (rc = opal_dss .pack (buf , ORTE_PROC_MY_NAME , 1 , ORTE_NAME ))) {
539
+ ORTE_ERROR_LOG (rc );
540
+ OBJ_RELEASE (buf );
541
+ return ;
542
+ }
543
+
544
+ /* pass along the proc to be notified (one opal_value_t) */
545
+ rc = 1 ;
546
+ if (ORTE_SUCCESS != (rc = opal_dss .pack (buf , & rc , 1 , OPAL_INT ))) {
547
+ ORTE_ERROR_LOG (rc );
548
+ OBJ_RELEASE (buf );
549
+ return ;
550
+ }
551
+ OBJ_CONSTRUCT (& kv , opal_value_t );
552
+ kv .key = strdup (OPAL_PMIX_EVENT_CUSTOM_RANGE );
553
+ kv .type = OPAL_NAME ;
554
+ kv .data .name .jobid = proc -> jobid ;
555
+ kv .data .name .vpid = proc -> vpid ;
556
+ kvptr = & kv ;
557
+ if (ORTE_SUCCESS != (rc = opal_dss .pack (buf , & kvptr , 1 , OPAL_VALUE ))) {
558
+ ORTE_ERROR_LOG (rc );
559
+ OBJ_DESTRUCT (& kv );
560
+ OBJ_RELEASE (buf );
561
+ return ;
562
+ }
563
+ OBJ_DESTRUCT (& kv );
564
+
565
+
566
+ /* get the daemon hosting the proc to be notified */
567
+ daemon .jobid = ORTE_PROC_MY_NAME -> jobid ;
568
+ daemon .vpid = orte_get_proc_daemon_vpid (proc );
569
+ /* send the notification to that daemon */
570
+ if (ORTE_SUCCESS != (rc = orte_rml .send_buffer_nb (orte_mgmt_conduit ,
571
+ & daemon , buf ,
572
+ ORTE_RML_TAG_NOTIFICATION ,
573
+ orte_rml_send_callback , NULL ))) {
574
+ ORTE_ERROR_LOG (rc );
575
+ OBJ_RELEASE (buf );
576
+ }
577
+ }
578
+
521
579
void orte_state_base_track_procs (int fd , short argc , void * cbdata )
522
580
{
523
581
orte_state_caddy_t * caddy = (orte_state_caddy_t * )cbdata ;
524
582
orte_process_name_t * proc = & caddy -> name ;
525
- orte_process_name_t wildcard_rank ;
526
583
orte_proc_state_t state = caddy -> proc_state ;
527
584
orte_job_t * jdata ;
528
585
orte_proc_t * pdata ;
529
586
int i ;
530
587
char * rtmod ;
588
+ orte_process_name_t parent , * npptr ;
531
589
532
590
opal_output_verbose (5 , orte_state_base_framework .framework_output ,
533
591
"%s state:base:track_procs called for proc %s state %s" ,
@@ -636,9 +694,15 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
636
694
ORTE_ACTIVATE_JOB_STATE (jdata , ORTE_JOB_STATE_TERMINATED );
637
695
/* if they requested notification upon completion, provide it */
638
696
if (orte_get_attribute (& jdata -> attributes , ORTE_JOB_NOTIFY_COMPLETION , NULL , OPAL_BOOL )) {
639
- wildcard_rank .jobid = jdata -> jobid ;
640
- wildcard_rank .vpid = ORTE_VPID_WILDCARD ;
641
- _send_notification (OPAL_ERR_JOB_TERMINATED , & wildcard_rank );
697
+ /* notify_completion => notify the parent of the termination
698
+ * of this child job. So get the parent jobid info */
699
+ npptr = & parent ;
700
+ if (!orte_get_attribute (& jdata -> attributes , ORTE_JOB_LAUNCH_PROXY , (void * * )& npptr , OPAL_NAME )) {
701
+ /* notify everyone who asked for it */
702
+ _send_direct_notify (OPAL_ERR_JOB_TERMINATED , ORTE_NAME_WILDCARD );
703
+ } else {
704
+ _send_direct_notify (OPAL_ERR_JOB_TERMINATED , & parent );
705
+ }
642
706
}
643
707
} else if (ORTE_PROC_STATE_TERMINATED < pdata -> state &&
644
708
!orte_job_term_ordered ) {
0 commit comments