@@ -361,12 +361,10 @@ bcast_kary_tree_top(void *buff, int count,
361361 /* Divide buffer into segments */
362362 if (seg <= nb_long ) length = seg_size + 1 ;
363363 else length = seg_size ;
364- opal_output_verbose (10 , ompi_coll_base_framework .framework_output ,
365- "bcast with k-ary tree : segment of size %ld" , length );
366364
367365 /* compute the triggering threshold to send data to the children */
368- trig_thr = ( rank == root ) ? ( segment_nb ) :
369- ( segment_nb + seg ) ;
366+ trig_thr = segment_nb + seg - 1 ; /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */
367+ if ( rank != root ) trig_thr ++ ;
370368
371369 /*
372370 ** Send Data to children
@@ -389,6 +387,17 @@ bcast_kary_tree_top(void *buff, int count,
389387 }
390388 }
391389
390+ if (rank == root ) {
391+ trig_thr = segment_nb ;
392+ ct_inc .success = segment_nb ;
393+ ct_inc .failure = 0 ;
394+
395+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
396+ request -> u .bcast .trig_ct_h , trig_thr )) != 0 ) {
397+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
398+ }
399+ }
400+
392401 ack_thr = child_nb ;
393402
394403 if (is_sync ) {
@@ -419,7 +428,26 @@ bcast_kary_tree_top(void *buff, int count,
419428 if (rank != root ) {
420429 trig_thr = segment_nb ;
421430 if (is_sync ) {
422- if ((ret = PtlCTWait (request -> u .bcast .trig_ct_h , trig_thr , & ct )) != 0 ) {
431+ /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
432+ Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
433+
434+ This is necessary because portals4 does not ensure the order of the triggered operations associated
435+ with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
436+
437+ ct_inc .success = 1 ;
438+ ct_inc .failure = 0 ;
439+
440+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
441+ request -> u .bcast .trig_ct_h , trig_thr )) != 0 ) {
442+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
443+ }
444+
445+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
446+ request -> u .bcast .trig_ct_h , trig_thr + 1 )) != 0 ) {
447+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
448+ }
449+
450+ if ((ret = PtlCTWait (request -> u .bcast .trig_ct_h , trig_thr + 2 , & ct )) != 0 ) {
423451 opal_stderr ("PtlCTWait failed" , __FILE__ , __LINE__ , ret );
424452 }
425453 }
@@ -659,12 +687,10 @@ bcast_pipeline_top(void *buff, int count,
659687 /* Divide buffer into segments */
660688 if (seg <= nb_long ) length = seg_size + 1 ;
661689 else length = seg_size ;
662- opal_output_verbose (10 , ompi_coll_base_framework .framework_output ,
663- "bcast with pipeline : segment of size %ld \n" , length );
664690
665691 /* compute the triggering threshold to send data to the children */
666- trig_thr = ( rank == root ) ? ( segment_nb ) :
667- ( segment_nb + seg ) ;
692+ trig_thr = segment_nb + seg - 1 ; /* To be sure the PtlTriggeredPut will be executed in order */
693+ if ( rank != root ) trig_thr ++ ;
668694
669695 /*
670696 ** Send Data to children
@@ -684,6 +710,16 @@ bcast_pipeline_top(void *buff, int count,
684710 }
685711 }
686712 }
713+ if (rank == root ) {
714+ trig_thr = segment_nb ;
715+ ct_inc .success = segment_nb ;
716+ ct_inc .failure = 0 ;
717+
718+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
719+ request -> u .bcast .trig_ct_h , trig_thr )) != 0 ) {
720+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
721+ }
722+ }
687723
688724 if (is_sync ) {
689725 if ((ret = PtlCTWait (request -> u .bcast .ack_ct_h , 1 , & ct )) != 0 ) {
@@ -713,8 +749,28 @@ bcast_pipeline_top(void *buff, int count,
713749
714750 if (rank != root ) {
715751 trig_thr = segment_nb ;
752+
716753 if (is_sync ) {
717- if ((ret = PtlCTWait (request -> u .bcast .trig_ct_h , trig_thr , & ct )) != 0 ) {
754+ /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
755+ Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
756+
757+ This is necessary because portals4 does not ensure the order of the triggered operations associated
758+ with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
759+
760+ ct_inc .success = 1 ;
761+ ct_inc .failure = 0 ;
762+
763+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
764+ request -> u .bcast .trig_ct_h , trig_thr )) != 0 ) {
765+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
766+ }
767+
768+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
769+ request -> u .bcast .trig_ct_h , trig_thr + 1 )) != 0 ) {
770+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
771+ }
772+
773+ if ((ret = PtlCTWait (request -> u .bcast .trig_ct_h , trig_thr + 2 , & ct )) != 0 ) {
718774 opal_stderr ("PtlCTWait failed" , __FILE__ , __LINE__ , ret );
719775 }
720776 }
0 commit comments