@@ -89,12 +89,20 @@ static int prepare_bcast_data (struct ompi_communicator_t *comm,
8989 }
9090
9191 /* Number of segments */
92- request -> u .bcast .segment_nb = (request -> u .bcast .tmpsize > COLL_PORTALS4_MAX_BW ) ?
93- (((request -> u .bcast .tmpsize + COLL_PORTALS4_MAX_BW - 1 ) / COLL_PORTALS4_MAX_BW ) < COLL_PORTALS4_MAX_SEGMENT ?
94- ((request -> u .bcast .tmpsize + COLL_PORTALS4_MAX_BW - 1 ) / COLL_PORTALS4_MAX_BW ) :
95- COLL_PORTALS4_MAX_SEGMENT ) :
92+ {
93+ size_t max_msg_size = (COLL_PORTALS4_MAX_BW > mca_coll_portals4_component .ni_limits .max_msg_size ) ?
94+ mca_coll_portals4_component .ni_limits .max_msg_size :
95+ COLL_PORTALS4_MAX_BW ;
96+
97+ //TODO : Either make compatible Portals size limits and COLL_PORTALS4_MAX_SEGMENT or remove COLL_PORTALS4_MAX_SEGMENT
98+ request -> u .bcast .segment_nb = (request -> u .bcast .tmpsize > max_msg_size ) ?
99+ (((request -> u .bcast .tmpsize + max_msg_size - 1 ) / max_msg_size ) < COLL_PORTALS4_MAX_SEGMENT ?
100+ ((request -> u .bcast .tmpsize + max_msg_size - 1 ) / max_msg_size ) : COLL_PORTALS4_MAX_SEGMENT ) :
96101 1 ;
97102
103+ OPAL_OUTPUT_VERBOSE ((10 , ompi_coll_base_framework .framework_output ,
104+ "seg_number=%d , seg_size_max=%lu" , request -> u .bcast .segment_nb , max_msg_size ));
105+ }
98106 if (request -> u .bcast .segment_nb > COLL_PORTALS4_BCAST_ALGO_THRESHOLD ) {
99107 request -> u .bcast .algo = OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO ;
100108 }
@@ -137,9 +145,9 @@ bcast_kary_tree_top(void *buff, int count,
137145 mca_coll_portals4_module_t * portals4_module )
138146{
139147 bool is_sync = request -> is_sync ;
140- int ret , seg ;
141- unsigned int i ;
142- int segment_nb = request -> u .bcast .segment_nb ;
148+ int ret ;
149+ unsigned int i , seg , seg_size , nb_long ;
150+ unsigned int segment_nb = request -> u .bcast .segment_nb ;
143151 unsigned int child_nb ;
144152 int size = ompi_comm_size (comm );
145153 int rank = ompi_comm_rank (comm );
@@ -201,15 +209,22 @@ bcast_kary_tree_top(void *buff, int count,
201209 COLL_PORTALS4_SET_BITS (match_bits , ompi_comm_get_cid (comm ), 0 , 0 ,
202210 COLL_PORTALS4_BCAST , 0 , internal_count );
203211
212+ /* The data will be cut in segment_nb segments.
213+ * nb_long segments will have a size of (seg_size + 1)
214+ * and (segment_nb - nb_long) segments will have a size of seg_size
215+ */
216+ seg_size = request -> u .bcast .tmpsize / segment_nb ;
217+ nb_long = request -> u .bcast .tmpsize % segment_nb ;
218+ opal_output_verbose (10 , ompi_coll_base_framework .framework_output , "seg_size=%d nb_long=%d segment_nb=%d" , seg_size , nb_long , segment_nb );
219+
204220 if (rank != root ) {
205221 for (seg = 1 , offset = 0 , length = 0 ;
206222 seg <= segment_nb ;
207223 seg ++ , offset += length ) {
208224
209225 /* Divide buffer into segments */
210- length = (seg < segment_nb ) ?
211- (request -> u .bcast .tmpsize + segment_nb - 1 ) / segment_nb :
212- request -> u .bcast .tmpsize - ((request -> u .bcast .tmpsize + segment_nb - 1 ) / segment_nb ) * (segment_nb - 1 );
226+ if (seg <= nb_long ) length = seg_size + 1 ;
227+ else length = seg_size ;
213228
214229 /*
215230 ** Prepare Data ME
@@ -352,13 +367,14 @@ bcast_kary_tree_top(void *buff, int count,
352367 seg ++ , offset += length ) {
353368
354369 /* Divide buffer into segments */
355- length = (seg < segment_nb ) ?
356- (request -> u .bcast .tmpsize + segment_nb - 1 ) / segment_nb :
357- request -> u .bcast .tmpsize - ((request -> u .bcast .tmpsize + segment_nb - 1 ) / segment_nb ) * (segment_nb - 1 );
370+ if (seg <= nb_long ) length = seg_size + 1 ;
371+ else length = seg_size ;
372+ opal_output_verbose (10 , ompi_coll_base_framework .framework_output ,
373+ "bcast with k-ary tree : segment of size %ld" , length );
358374
359375 /* compute the triggering threshold to send data to the children */
360- trig_thr = ( rank == root ) ? ( segment_nb ) :
361- ( segment_nb + seg ) ;
376+ trig_thr = segment_nb + seg - 1 ; /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */
377+ if ( rank != root ) trig_thr ++ ;
362378
363379 /*
364380 ** Send Data to children
@@ -381,6 +397,17 @@ bcast_kary_tree_top(void *buff, int count,
381397 }
382398 }
383399
400+ if (rank == root ) {
401+ trig_thr = segment_nb ;
402+ ct_inc .success = segment_nb ;
403+ ct_inc .failure = 0 ;
404+
405+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
406+ request -> u .bcast .trig_ct_h , trig_thr )) != 0 ) {
407+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
408+ }
409+ }
410+
384411 ack_thr = child_nb ;
385412
386413 if (is_sync ) {
@@ -409,9 +436,28 @@ bcast_kary_tree_top(void *buff, int count,
409436 */
410437
411438 if (rank != root ) {
412- ack_thr = segment_nb ;
439+ trig_thr = segment_nb ;
413440 if (is_sync ) {
414- if ((ret = PtlCTWait (request -> u .bcast .trig_ct_h , ack_thr , & ct )) != 0 ) {
441+ /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
442+ Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
443+
444+ This is necessary because portals4 does not insure the order in the triggered operations associated
445+ with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
446+
447+ ct_inc .success = 1 ;
448+ ct_inc .failure = 0 ;
449+
450+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
451+ request -> u .bcast .trig_ct_h , trig_thr )) != 0 ) {
452+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
453+ }
454+
455+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
456+ request -> u .bcast .trig_ct_h , trig_thr + 1 )) != 0 ) {
457+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
458+ }
459+
460+ if ((ret = PtlCTWait (request -> u .bcast .trig_ct_h , trig_thr + 2 , & ct )) != 0 ) {
415461 opal_stderr ("PtlCTWait failed" , __FILE__ , __LINE__ , ret );
416462 }
417463 }
@@ -421,7 +467,7 @@ bcast_kary_tree_top(void *buff, int count,
421467 mca_coll_portals4_component .finish_pt_idx ,
422468 0 , 0 , NULL , (uintptr_t ) request ,
423469 request -> u .bcast .trig_ct_h ,
424- ack_thr )) != 0 ) {
470+ trig_thr )) != 0 ) {
425471 return opal_stderr ("PtlTriggeredPut failed" , __FILE__ , __LINE__ , ret );
426472 }
427473
@@ -440,8 +486,9 @@ bcast_pipeline_top(void *buff, int count,
440486 mca_coll_portals4_module_t * portals4_module )
441487{
442488 bool is_sync = request -> is_sync ;
443- int ret , seg ;
444- int segment_nb = request -> u .bcast .segment_nb ;
489+ int ret ;
490+ unsigned int seg , seg_size , nb_long ;
491+ unsigned int segment_nb = request -> u .bcast .segment_nb ;
445492 int size = ompi_comm_size (comm );
446493 int rank = ompi_comm_rank (comm );
447494 ptl_rank_t parent , child ;
@@ -492,16 +539,22 @@ bcast_pipeline_top(void *buff, int count,
492539
493540 COLL_PORTALS4_SET_BITS (match_bits , ompi_comm_get_cid (comm ), 0 , 0 ,
494541 COLL_PORTALS4_BCAST , 0 , internal_count );
542+ /* The data will be cut in segment_nb segments.
543+ * nb_long segments will have a size of (seg_size + 1)
544+ * and (segment_nb - nb_long) segments will have a size of seg_size
545+ */
546+ seg_size = request -> u .bcast .tmpsize / segment_nb ;
547+ nb_long = request -> u .bcast .tmpsize % segment_nb ;
548+ opal_output_verbose (10 , ompi_coll_base_framework .framework_output , "seg_size=%d nb_long=%d" , seg_size , nb_long );
495549
496550 if (rank != root ) {
497551 for (seg = 1 , offset = 0 , length = 0 ;
498552 seg <= segment_nb ;
499553 seg ++ , offset += length ) {
500554
501555 /* Divide buffer into segments */
502- length = (seg < segment_nb ) ?
503- (request -> u .bcast .tmpsize + segment_nb - 1 ) / segment_nb :
504- request -> u .bcast .tmpsize - ((request -> u .bcast .tmpsize + segment_nb - 1 ) / segment_nb ) * (segment_nb - 1 );
556+ if (seg <= nb_long ) length = seg_size + 1 ;
557+ else length = seg_size ;
505558
506559 /*
507560 ** Prepare Data ME
@@ -642,13 +695,14 @@ bcast_pipeline_top(void *buff, int count,
642695 seg ++ , offset += length ) {
643696
644697 /* Divide buffer into segments */
645- length = (seg < segment_nb ) ?
646- (request -> u .bcast .tmpsize + segment_nb - 1 ) / segment_nb :
647- request -> u .bcast .tmpsize - ((request -> u .bcast .tmpsize + segment_nb - 1 ) / segment_nb ) * (segment_nb - 1 );
698+ if (seg <= nb_long ) length = seg_size + 1 ;
699+ else length = seg_size ;
700+ opal_output_verbose (10 , ompi_coll_base_framework .framework_output ,
701+ "bcast with pipeline : segment of size %ld \n" , length );
648702
649703 /* compute the triggering threshold to send data to the children */
650- trig_thr = ( rank == root ) ? ( segment_nb ) :
651- ( segment_nb + seg ) ;
704+ trig_thr = segment_nb + seg - 1 ; /* To be sure the PtlTriggeredPut will be executed in order */
705+ if ( rank != root ) trig_thr ++ ;
652706
653707 /*
654708 ** Send Data to children
@@ -668,6 +722,16 @@ bcast_pipeline_top(void *buff, int count,
668722 }
669723 }
670724 }
725+ if (rank == root ) {
726+ trig_thr = segment_nb ;
727+ ct_inc .success = segment_nb ;
728+ ct_inc .failure = 0 ;
729+
730+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
731+ request -> u .bcast .trig_ct_h , trig_thr )) != 0 ) {
732+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
733+ }
734+ }
671735
672736 if (is_sync ) {
673737 if ((ret = PtlCTWait (request -> u .bcast .ack_ct_h , 1 , & ct )) != 0 ) {
@@ -696,8 +760,29 @@ bcast_pipeline_top(void *buff, int count,
696760 */
697761
698762 if (rank != root ) {
763+ trig_thr = segment_nb ;
764+
699765 if (is_sync ) {
700- if ((ret = PtlCTWait (request -> u .bcast .trig_ct_h , segment_nb , & ct )) != 0 ) {
766+ /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
767+ Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
768+
769+ This is necessary because portals4 does not insure the order in the triggered operations associated
770+ with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
771+
772+ ct_inc .success = 1 ;
773+ ct_inc .failure = 0 ;
774+
775+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
776+ request -> u .bcast .trig_ct_h , trig_thr )) != 0 ) {
777+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
778+ }
779+
780+ if ((ret = PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
781+ request -> u .bcast .trig_ct_h , trig_thr + 1 )) != 0 ) {
782+ return opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
783+ }
784+
785+ if ((ret = PtlCTWait (request -> u .bcast .trig_ct_h , trig_thr + 2 , & ct )) != 0 ) {
701786 opal_stderr ("PtlCTWait failed" , __FILE__ , __LINE__ , ret );
702787 }
703788 }
@@ -707,7 +792,7 @@ bcast_pipeline_top(void *buff, int count,
707792 mca_coll_portals4_component .finish_pt_idx ,
708793 0 , 0 , NULL , (uintptr_t ) request ,
709794 request -> u .bcast .trig_ct_h ,
710- segment_nb )) != 0 ) {
795+ trig_thr )) != 0 ) {
711796 return opal_stderr ("PtlTriggeredPut failed" , __FILE__ , __LINE__ , ret );
712797 }
713798 }
@@ -831,7 +916,7 @@ ompi_coll_portals4_ibcast_intra(void *buff, int count,
831916 return OMPI_ERROR ;
832917 }
833918
834- puts ( "ibcast " );
919+ opal_output_verbose ( 10 , ompi_coll_base_framework . framework_output , "ibcast_intra " );
835920 return (OMPI_SUCCESS );
836921}
837922
@@ -860,5 +945,6 @@ ompi_coll_portals4_ibcast_intra_fini(ompi_coll_portals4_request_t *request)
860945 ompi_request_complete (& request -> super , true);
861946 OPAL_THREAD_UNLOCK (& ompi_request_lock );
862947
948+ opal_output_verbose (10 , ompi_coll_base_framework .framework_output , "ibcast_intra_fini" );
863949 return (OMPI_SUCCESS );
864950}
0 commit comments