3939#include "ompi/mca/coll/base/coll_base_functions.h"
4040#include "coll_base_topo.h"
4141#include "coll_base_util.h"
42+ #include "opal/util/minmax.h"
4243
4344/*
4445 * We want to minimize the amount of temporary memory needed while allowing as many ranks
4546 * to exchange data simultaneously. We use a variation of the ring algorithm, where in a
46- * single step a process echange the data with both neighbors at distance k (on the left
47+ * single step a process exchanges data with both neighbors at distance k (on the left
4748 * and the right on a logical ring topology). With this approach we need to pack the data
4849 * for only one of the two neighbors, as we can then use the original buffer (and datatype
4950 * and count) to send the data to the other.
@@ -58,16 +59,22 @@ mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts
5859 ptrdiff_t extent ;
5960 ompi_request_t * req = MPI_REQUEST_NULL ;
6061 char * tmp_buffer ;
61- size_t packed_size = 0 , max_size ;
62+ size_t packed_size = 0 , max_size , type_size ;
6263 opal_convertor_t convertor ;
6364
6465 /* Initialize. */
6566
6667 size = ompi_comm_size (comm );
6768 rank = ompi_comm_rank (comm );
69+ ompi_datatype_type_size (rdtype , & type_size );
6870
69- ompi_datatype_type_size (rdtype , & max_size );
70- max_size *= rcounts [rank ];
71+ for (i = 0 , max_size = 0 ; i < size ; ++ i ) {
72+ if (i == rank ) {
73+ continue ;
74+ }
75+ packed_size = rcounts [i ] * type_size ;
76+ max_size = opal_max (packed_size , max_size );
77+ }
7178
7279 /* Easy way out */
7380 if ((1 == size ) || (0 == max_size ) ) {
0 commit comments