1212 * Copyright (c) 2006-2010 University of Houston. All rights reserved.
1313 * Copyright (c) 2015-2017 Research Organization for Information Science
1414 * and Technology (RIST). All rights reserved.
15+ * Copyright (c) 2022 IBM Corporation. All rights reserved.
1516 * $COPYRIGHT$
1617 *
1718 * Additional copyrights may follow
@@ -48,9 +49,10 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
4849 struct ompi_communicator_t * comm ,
4950 mca_coll_base_module_t * module )
5051{
51- int rank , root = 0 , size , rsize , err = OMPI_SUCCESS ;
52+ int rank , root = 0 , size , rsize , err = OMPI_SUCCESS , i ;
5253 char * ptmp_free = NULL , * ptmp = NULL ;
5354 ptrdiff_t gap , span ;
55+ void * rbuf_ptr ;
5456
5557 rank = ompi_comm_rank (comm );
5658 size = ompi_comm_size (comm -> c_local_comm );
@@ -76,9 +78,9 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
7678
7779 if (rank == root ) {
7880 /* Do a send-recv between the two root procs. to avoid deadlock */
79- err = ompi_coll_base_sendrecv_actual (ptmp , scount * size , sdtype , 0 ,
81+ err = ompi_coll_base_sendrecv_actual (ptmp , scount * ( size_t ) size , sdtype , 0 ,
8082 MCA_COLL_BASE_TAG_ALLGATHER ,
81- rbuf , rcount * rsize , rdtype , 0 ,
83+ rbuf , rcount * ( size_t ) rsize , rdtype , 0 ,
8284 MCA_COLL_BASE_TAG_ALLGATHER ,
8385 comm , MPI_STATUS_IGNORE );
8486 if (OMPI_SUCCESS != err ) {
@@ -87,12 +89,28 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
8789 }
8890 /* bcast the message to all the local processes */
8991 if ( rcount > 0 ) {
90- err = comm -> c_local_comm -> c_coll -> coll_bcast (rbuf , rcount * rsize , rdtype ,
91- root , comm -> c_local_comm ,
92- comm -> c_local_comm -> c_coll -> coll_bcast_module );
93- if (OMPI_SUCCESS != err ) {
94- goto exit ;
95- }
92+ if ( OPAL_UNLIKELY (rcount * (size_t )rsize > INT_MAX ) ) {
93+ // Sending the message in the coll_bcast as "rcount*rsize" would exceed
94+ // the 'int count' parameter in the coll_bcast() function. Instead broadcast
95+ // the result in "rcount" chunks to the local group.
96+ span = opal_datatype_span (& rdtype -> super , rcount , & gap );
97+ for ( i = 0 ; i < rsize ; ++ i ) {
98+ rbuf_ptr = (char * )rbuf + span * (size_t )i ;
99+ err = comm -> c_local_comm -> c_coll -> coll_bcast (rbuf_ptr , rcount , rdtype ,
100+ root , comm -> c_local_comm ,
101+ comm -> c_local_comm -> c_coll -> coll_bcast_module );
102+ if (OMPI_SUCCESS != err ) {
103+ goto exit ;
104+ }
105+ }
106+ } else {
107+ err = comm -> c_local_comm -> c_coll -> coll_bcast (rbuf , rcount * rsize , rdtype ,
108+ root , comm -> c_local_comm ,
109+ comm -> c_local_comm -> c_coll -> coll_bcast_module );
110+ if (OMPI_SUCCESS != err ) {
111+ goto exit ;
112+ }
113+ }
96114 }
97115
98116 exit :
0 commit comments