@@ -89,12 +89,20 @@ static int prepare_bcast_data (struct ompi_communicator_t *comm,
8989    }
9090
9191    /* Number of segments */ 
92-     request -> u .bcast .segment_nb  =   (request -> u .bcast .tmpsize  >  COLL_PORTALS4_MAX_BW ) ?
93-             (((request -> u .bcast .tmpsize  +  COLL_PORTALS4_MAX_BW  - 1 )  / COLL_PORTALS4_MAX_BW ) <  COLL_PORTALS4_MAX_SEGMENT  ?
94-                     ((request -> u .bcast .tmpsize  +  COLL_PORTALS4_MAX_BW  - 1 )  / COLL_PORTALS4_MAX_BW ) :
95-                     COLL_PORTALS4_MAX_SEGMENT ) :
92+     {
93+         size_t  max_msg_size  =  (COLL_PORTALS4_MAX_BW  >   mca_coll_portals4_component .ni_limits .max_msg_size ) ?
94+             mca_coll_portals4_component .ni_limits .max_msg_size  :
95+             COLL_PORTALS4_MAX_BW ;
96+ 
97+         //TODO : Either make compatible Portals size limits and COLL_PORTALS4_MAX_SEGMENT or remove COLL_PORTALS4_MAX_SEGMENT 
98+         request -> u .bcast .segment_nb  =   (request -> u .bcast .tmpsize  >  max_msg_size ) ?
99+             (((request -> u .bcast .tmpsize  +  max_msg_size  - 1 )  / max_msg_size ) <  COLL_PORTALS4_MAX_SEGMENT  ?
100+                 ((request -> u .bcast .tmpsize  +  max_msg_size  - 1 )  / max_msg_size ) : COLL_PORTALS4_MAX_SEGMENT ) :
96101                    1 ;
97102
103+         OPAL_OUTPUT_VERBOSE ((10 , ompi_coll_base_framework .framework_output ,
104+                 "seg_number=%d , seg_size_max=%lu" , request -> u .bcast .segment_nb , max_msg_size ));
105+     }
98106    if  (request -> u .bcast .segment_nb  >  COLL_PORTALS4_BCAST_ALGO_THRESHOLD ) {
99107        request -> u .bcast .algo  =  OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO ;
100108    }
@@ -137,9 +145,9 @@ bcast_kary_tree_top(void *buff, int count,
137145        mca_coll_portals4_module_t  * portals4_module )
138146{
139147    bool  is_sync  =  request -> is_sync ;
140-     int  ret ,  seg ;
141-     unsigned int   i ;
142-     int  segment_nb  =  request -> u .bcast .segment_nb ;
148+     int  ret ;
149+     unsigned int   i ,  seg ,  seg_size ,  nb_long ;
150+     unsigned  int segment_nb  =  request -> u .bcast .segment_nb ;
143151    unsigned int   child_nb ;
144152    int  size  =  ompi_comm_size (comm );
145153    int  rank  =  ompi_comm_rank (comm );
@@ -201,15 +209,22 @@ bcast_kary_tree_top(void *buff, int count,
201209    COLL_PORTALS4_SET_BITS (match_bits , ompi_comm_get_cid (comm ), 0 , 0 ,
202210            COLL_PORTALS4_BCAST , 0 , internal_count );
203211
212+     /* The data will be cut in segment_nb segments. 
213+      * nb_long segments will have a size of (seg_size + 1) 
214+      * and (segment_nb - nb_long) segments will have a size of seg_size 
215+      */ 
216+     seg_size  =  request -> u .bcast .tmpsize  / segment_nb ;
217+     nb_long  =  request -> u .bcast .tmpsize  % segment_nb ;
218+     opal_output_verbose (10 , ompi_coll_base_framework .framework_output , "seg_size=%d nb_long=%d segment_nb=%d" , seg_size , nb_long , segment_nb );
219+ 
204220    if  (rank  !=  root ) {
205221        for  (seg  =  1 , offset  =  0 , length  =  0  ;
206222                seg  <= segment_nb  ;
207223                seg ++ , offset  +=  length ) {
208224
209225            /* Divide buffer into segments */ 
210-             length  =  (seg  <  segment_nb ) ?
211-                     (request -> u .bcast .tmpsize  +  segment_nb  -  1 ) / segment_nb  :
212-                     request -> u .bcast .tmpsize  -  ((request -> u .bcast .tmpsize  +  segment_nb  -  1 ) / segment_nb ) *  (segment_nb  -  1 );
226+             if  (seg  <= nb_long ) length  =  seg_size  +  1 ;
227+             else  length  =  seg_size ;
213228
214229            /* 
215230             ** Prepare Data ME 
@@ -352,13 +367,14 @@ bcast_kary_tree_top(void *buff, int count,
352367                seg ++ , offset  +=  length ) {
353368
354369            /* Divide buffer into segments */ 
355-             length  =  (seg  <  segment_nb ) ?
356-                     (request -> u .bcast .tmpsize  +  segment_nb  -  1 ) / segment_nb  :
357-                     request -> u .bcast .tmpsize  -  ((request -> u .bcast .tmpsize  +  segment_nb  -  1 ) / segment_nb ) *  (segment_nb  -  1 );
370+             if  (seg  <= nb_long ) length  =  seg_size  +  1 ;
371+             else  length  =  seg_size ;
372+             opal_output_verbose (10 , ompi_coll_base_framework .framework_output ,
373+                 "bcast with k-ary tree : segment of size %ld" , length );
358374
359375            /* compute the triggering threshold to send data to the children */ 
360-             trig_thr  =  ( rank   ==   root ) ? ( segment_nb ) : 
361-                     ( segment_nb   +   seg ) ;
376+             trig_thr  =  segment_nb   +   seg   -   1 ;  /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */ 
377+             if  ( rank   !=   root )  trig_thr   ++ ;
362378
363379            /* 
364380             ** Send Data to children 
@@ -381,6 +397,17 @@ bcast_kary_tree_top(void *buff, int count,
381397            }
382398        }
383399
400+         if  (rank  ==  root ) {
401+             trig_thr  =  segment_nb ;
402+             ct_inc .success  =  segment_nb ;
403+             ct_inc .failure  =  0 ;
404+ 
405+             if  ((ret  =  PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
406+                    request -> u .bcast .trig_ct_h , trig_thr )) !=  0 ) {
407+                 return  opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
408+             }
409+         }
410+ 
384411        ack_thr  =  child_nb ;
385412
386413        if  (is_sync ) {
@@ -409,9 +436,28 @@ bcast_kary_tree_top(void *buff, int count,
409436         */ 
410437
411438        if  (rank  !=  root ) {
412-             ack_thr  =  segment_nb ;
439+             trig_thr  =  segment_nb ;
413440            if  (is_sync ) {
414-                 if  ((ret  =  PtlCTWait (request -> u .bcast .trig_ct_h , ack_thr , & ct )) !=  0 ) {
441+                 /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice. 
442+                    Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered. 
443+ 
444+                    This is necessary because portals4 does not insure the order in the triggered operations associated 
445+                    with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */ 
446+ 
447+                 ct_inc .success  =  1 ;
448+                 ct_inc .failure  =  0 ;
449+ 
450+                 if  ((ret  =  PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
451+                         request -> u .bcast .trig_ct_h , trig_thr )) !=  0 ) {
452+                     return  opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
453+                 }
454+ 
455+                 if  ((ret  =  PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
456+                         request -> u .bcast .trig_ct_h , trig_thr  +  1 )) !=  0 ) {
457+                     return  opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
458+                 }
459+ 
460+                 if  ((ret  =  PtlCTWait (request -> u .bcast .trig_ct_h , trig_thr  +  2 , & ct )) !=  0 ) {
415461                    opal_stderr ("PtlCTWait failed" , __FILE__ , __LINE__ , ret );
416462                }
417463            }
@@ -421,7 +467,7 @@ bcast_kary_tree_top(void *buff, int count,
421467                        mca_coll_portals4_component .finish_pt_idx ,
422468                        0 , 0 , NULL , (uintptr_t ) request ,
423469                        request -> u .bcast .trig_ct_h ,
424-                         ack_thr )) !=  0 ) {
470+                         trig_thr )) !=  0 ) {
425471                    return  opal_stderr ("PtlTriggeredPut failed" , __FILE__ , __LINE__ , ret );
426472                }
427473
@@ -440,8 +486,9 @@ bcast_pipeline_top(void *buff, int count,
440486        mca_coll_portals4_module_t  * portals4_module )
441487{
442488    bool  is_sync  =  request -> is_sync ;
443-     int  ret , seg ;
444-     int  segment_nb  =  request -> u .bcast .segment_nb ;
489+     int  ret ;
490+     unsigned int   seg , seg_size , nb_long ;
491+     unsigned int   segment_nb  =  request -> u .bcast .segment_nb ;
445492    int  size  =  ompi_comm_size (comm );
446493    int  rank  =  ompi_comm_rank (comm );
447494    ptl_rank_t  parent , child ;
@@ -492,16 +539,22 @@ bcast_pipeline_top(void *buff, int count,
492539
493540    COLL_PORTALS4_SET_BITS (match_bits , ompi_comm_get_cid (comm ), 0 , 0 ,
494541            COLL_PORTALS4_BCAST , 0 , internal_count );
542+     /* The data will be cut in segment_nb segments. 
543+      * nb_long segments will have a size of (seg_size + 1) 
544+      * and (segment_nb - nb_long) segments will have a size of seg_size 
545+      */ 
546+     seg_size  =  request -> u .bcast .tmpsize  / segment_nb ;
547+     nb_long  =  request -> u .bcast .tmpsize  % segment_nb ;
548+     opal_output_verbose (10 , ompi_coll_base_framework .framework_output , "seg_size=%d nb_long=%d" , seg_size , nb_long );
495549
496550    if  (rank  !=  root ) {
497551        for  (seg  =  1 , offset  =  0 , length  =  0  ;
498552                seg  <= segment_nb  ;
499553                seg ++ , offset  +=  length ) {
500554
501555            /* Divide buffer into segments */ 
502-             length  =  (seg  <  segment_nb ) ?
503-                     (request -> u .bcast .tmpsize  +  segment_nb  -  1 ) / segment_nb  :
504-                     request -> u .bcast .tmpsize  -  ((request -> u .bcast .tmpsize  +  segment_nb  -  1 ) / segment_nb ) *  (segment_nb  -  1 );
556+             if  (seg  <= nb_long ) length  =  seg_size  +  1 ;
557+             else  length  =  seg_size ;
505558
506559            /* 
507560             ** Prepare Data ME 
@@ -642,13 +695,14 @@ bcast_pipeline_top(void *buff, int count,
642695                seg ++ , offset  +=  length ) {
643696
644697            /* Divide buffer into segments */ 
645-             length  =  (seg  <  segment_nb ) ?
646-                     (request -> u .bcast .tmpsize  +  segment_nb  -  1 ) / segment_nb  :
647-                     request -> u .bcast .tmpsize  -  ((request -> u .bcast .tmpsize  +  segment_nb  -  1 ) / segment_nb ) *  (segment_nb  -  1 );
698+             if  (seg  <= nb_long ) length  =  seg_size  +  1 ;
699+             else  length  =  seg_size ;
700+             opal_output_verbose (10 , ompi_coll_base_framework .framework_output ,
701+                 "bcast with pipeline  :  segment of size %ld \n" , length );
648702
649703            /* compute the triggering threshold to send data to the children */ 
650-             trig_thr  =  ( rank   ==   root ) ? ( segment_nb ) : 
651-                     ( segment_nb   +   seg ) ;
704+             trig_thr  =  segment_nb   +   seg   -   1 ;  /* To be sure the PtlTriggeredPut will be executed in order */ 
705+             if  ( rank   !=   root )  trig_thr   ++ ;
652706
653707            /* 
654708             ** Send Data to children 
@@ -668,6 +722,16 @@ bcast_pipeline_top(void *buff, int count,
668722                }
669723            }
670724        }
725+         if  (rank  ==  root ) {
726+             trig_thr  =  segment_nb ;
727+             ct_inc .success  =  segment_nb ;
728+             ct_inc .failure  =  0 ;
729+ 
730+             if  ((ret  =  PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
731+                    request -> u .bcast .trig_ct_h , trig_thr )) !=  0 ) {
732+                 return  opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
733+             }
734+         }
671735
672736        if  (is_sync ) {
673737            if  ((ret  =  PtlCTWait (request -> u .bcast .ack_ct_h , 1 , & ct )) !=  0 ) {
@@ -696,8 +760,29 @@ bcast_pipeline_top(void *buff, int count,
696760         */ 
697761
698762        if  (rank  !=  root ) {
763+             trig_thr  =  segment_nb ;
764+ 
699765            if  (is_sync ) {
700-                 if  ((ret  =  PtlCTWait (request -> u .bcast .trig_ct_h , segment_nb , & ct )) !=  0 ) {
766+                 /* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice. 
767+                    Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered. 
768+ 
769+                    This is necessary because portals4 does not insure the order in the triggered operations associated 
770+                    with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */ 
771+ 
772+                 ct_inc .success  =  1 ;
773+                 ct_inc .failure  =  0 ;
774+ 
775+                 if  ((ret  =  PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
776+                         request -> u .bcast .trig_ct_h , trig_thr )) !=  0 ) {
777+                     return  opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
778+                 }
779+ 
780+                 if  ((ret  =  PtlTriggeredCTInc (request -> u .bcast .trig_ct_h , ct_inc ,
781+                         request -> u .bcast .trig_ct_h , trig_thr  +  1 )) !=  0 ) {
782+                     return  opal_stderr ("PtlTriggeredCTInc failed" , __FILE__ , __LINE__ , ret );
783+                 }
784+ 
785+                 if  ((ret  =  PtlCTWait (request -> u .bcast .trig_ct_h , trig_thr  +  2 , & ct )) !=  0 ) {
701786                    opal_stderr ("PtlCTWait failed" , __FILE__ , __LINE__ , ret );
702787                }
703788            }
@@ -707,7 +792,7 @@ bcast_pipeline_top(void *buff, int count,
707792                        mca_coll_portals4_component .finish_pt_idx ,
708793                        0 , 0 , NULL , (uintptr_t ) request ,
709794                        request -> u .bcast .trig_ct_h ,
710-                         segment_nb )) !=  0 ) {
795+                         trig_thr )) !=  0 ) {
711796                    return  opal_stderr ("PtlTriggeredPut failed" , __FILE__ , __LINE__ , ret );
712797                }
713798            }
@@ -831,7 +916,7 @@ ompi_coll_portals4_ibcast_intra(void *buff, int count,
831916        return  OMPI_ERROR ;
832917    }
833918
834-     puts ( "ibcast "
919+     opal_output_verbose ( 10 ,  ompi_coll_base_framework . framework_output ,  "ibcast_intra "
835920    return  (OMPI_SUCCESS );
836921}
837922
@@ -860,5 +945,6 @@ ompi_coll_portals4_ibcast_intra_fini(ompi_coll_portals4_request_t *request)
860945    ompi_request_complete (& request -> super , true);
861946    OPAL_THREAD_UNLOCK (& ompi_request_lock );
862947
948+     opal_output_verbose (10 , ompi_coll_base_framework .framework_output , "ibcast_intra_fini" );
863949    return  (OMPI_SUCCESS );
864950}
0 commit comments