2020 */
2121#include "nbc_internal.h"
2222
/* Forward declarations of the two schedule builders that are defined further
 * down in this file: the linear variant and the recursive-doubling variant.
 * Both share the same parameter list (rank, communicator size, schedule,
 * send buffer/count/type, receive buffer/count/type) and return an int
 * status code. */
23+ static inline int allgather_sched_linear (
24+ int rank , int comm_size , NBC_Schedule * schedule , const void * sendbuf ,
25+ int scount , struct ompi_datatype_t * sdtype , void * recvbuf , int rcount ,
26+ struct ompi_datatype_t * rdtype );
27+ static inline int allgather_sched_recursivedoubling (
28+ int rank , int comm_size , NBC_Schedule * schedule , const void * sbuf ,
29+ int scount , struct ompi_datatype_t * sdtype , void * rbuf , int rcount ,
30+ struct ompi_datatype_t * rdtype );
31+
2332#ifdef NBC_CACHE_SCHEDULE
2433/* tree comparison function for schedule cache */
2534int NBC_Allgather_args_compare (NBC_Allgather_args * a , NBC_Allgather_args * b , void * param ) {
@@ -40,27 +49,38 @@ int NBC_Allgather_args_compare(NBC_Allgather_args *a, NBC_Allgather_args *b, voi
4049}
4150#endif
4251
43- /* simple linear MPI_Iallgather
44- * the algorithm uses p-1 rounds
45- * each node sends the packet it received last round (or has in round 0) to it's right neighbor (modulo p)
46- * each node receives from it's left (modulo p) neighbor */
4752static int nbc_allgather_init (const void * sendbuf , int sendcount , MPI_Datatype sendtype , void * recvbuf , int recvcount ,
4853 MPI_Datatype recvtype , struct ompi_communicator_t * comm , ompi_request_t * * request ,
4954 struct mca_coll_base_module_2_3_0_t * module , bool persistent )
5055{
5156 int rank , p , res ;
5257 MPI_Aint rcvext ;
5358 NBC_Schedule * schedule ;
54- char * rbuf , * sbuf , inplace ;
59+ char * rbuf , inplace ;
5560#ifdef NBC_CACHE_SCHEDULE
5661 NBC_Allgather_args * args , * found , search ;
5762#endif
63+ enum { NBC_ALLGATHER_LINEAR , NBC_ALLGATHER_RDBL } alg ;
5864 ompi_coll_libnbc_module_t * libnbc_module = (ompi_coll_libnbc_module_t * ) module ;
5965
6066 NBC_IN_PLACE (sendbuf , recvbuf , inplace );
6167
6268 rank = ompi_comm_rank (comm );
6369 p = ompi_comm_size (comm );
70+ int is_commsize_pow2 = !(p & (p - 1 ));
71+
72+ if (libnbc_iallgather_algorithm == 0 ) {
73+ alg = NBC_ALLGATHER_LINEAR ;
74+ } else {
75+ /* user forced dynamic decision */
76+ if (libnbc_iallgather_algorithm == 1 ) {
77+ alg = NBC_ALLGATHER_LINEAR ;
78+ } else if (libnbc_iallgather_algorithm == 2 && is_commsize_pow2 ) {
79+ alg = NBC_ALLGATHER_RDBL ;
80+ } else {
81+ alg = NBC_ALLGATHER_LINEAR ;
82+ }
83+ }
6484
6585 res = ompi_datatype_type_extent (recvtype , & rcvext );
6686 if (MPI_SUCCESS != res ) {
@@ -98,36 +118,34 @@ static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype s
98118 return OMPI_ERR_OUT_OF_RESOURCE ;
99119 }
100120
101- sbuf = (char * )recvbuf + rank * recvcount * rcvext ;
102-
103- if (persistent && !inplace ) { /* for nonblocking, data has been copied already */
121+ if (persistent && !inplace ) {
122+ /* for nonblocking, data has been copied already */
104123 /* copy my data to receive buffer (= send buffer of NBC_Sched_send) */
105- res = NBC_Sched_copy ((void * )sendbuf , false, sendcount , sendtype ,
106- sbuf , false, recvcount , recvtype , schedule , true);
124+ rbuf = (char * )recvbuf + rank * recvcount * rcvext ;
125+ res = NBC_Sched_copy ((void * )sendbuf , false, sendcount , sendtype ,
126+ rbuf , false, recvcount , recvtype , schedule , true);
107127 if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
108128 OBJ_RELEASE (schedule );
109129 return res ;
110130 }
111131 }
112132
113- /* do p-1 rounds */
114- for (int r = 0 ; r < p ; ++ r ) {
115- if (r != rank ) {
116- /* recv from rank r */
117- rbuf = (char * )recvbuf + r * recvcount * rcvext ;
118- res = NBC_Sched_recv (rbuf , false, recvcount , recvtype , r , schedule , false);
119- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
120- OBJ_RELEASE (schedule );
121- return res ;
122- }
133+ switch (alg ) {
134+ case NBC_ALLGATHER_LINEAR :
135+ if (rank == 0 ) printf ("MK: LINEAR\n" );
136+ res = allgather_sched_linear (rank , p , schedule , sendbuf , sendcount , sendtype ,
137+ recvbuf , recvcount , recvtype );
138+ break ;
139+ case NBC_ALLGATHER_RDBL :
140+ if (rank == 0 ) printf ("MK: RDBL\n" );
141+ res = allgather_sched_recursivedoubling (rank , p , schedule , sendbuf , sendcount ,
142+ sendtype , recvbuf , recvcount , recvtype );
143+ break ;
144+ }
123145
124- /* send to rank r - not from the sendbuf to optimize MPI_IN_PLACE */
125- res = NBC_Sched_send (sbuf , false, recvcount , recvtype , r , schedule , false);
126- if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
127- OBJ_RELEASE (schedule );
128- return res ;
129- }
130- }
146+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) {
147+ OBJ_RELEASE (schedule );
148+ return res ;
131149 }
132150
133151 res = NBC_Sched_commit (schedule );
@@ -270,6 +288,109 @@ int ompi_coll_libnbc_iallgather_inter(const void* sendbuf, int sendcount, MPI_Da
270288 return OMPI_SUCCESS ;
271289}
272290
291+ /*
292+ * allgather_sched_linear
293+ *
294+ * Description: an implementation of Iallgather using linear algorithm
295+ *
296+ * Time: O(comm_size)
297+ * Schedule length (rounds): O(comm_size)
298+ */
299+ static inline int allgather_sched_linear (
300+ int rank , int comm_size , NBC_Schedule * schedule , const void * sendbuf ,
301+ int scount , struct ompi_datatype_t * sdtype , void * recvbuf , int rcount ,
302+ struct ompi_datatype_t * rdtype )
303+ {
304+ int res = OMPI_SUCCESS ;
305+ ptrdiff_t rlb , rext ;
306+
307+ res = ompi_datatype_get_extent (rdtype , & rlb , & rext );
308+ char * sbuf = (char * )recvbuf + rank * rcount * rext ;
309+
310+ for (int remote = 0 ; remote < comm_size ; ++ remote ) {
311+ if (remote != rank ) {
312+ /* Recv from rank remote */
313+ char * rbuf = (char * )recvbuf + remote * rcount * rext ;
314+ res = NBC_Sched_recv (rbuf , false, rcount , rdtype , remote , schedule , false);
315+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
316+
317+ /* Send to rank remote - not from the sendbuf to optimize MPI_IN_PLACE */
318+ res = NBC_Sched_send (sbuf , false, rcount , rdtype , remote , schedule , false);
319+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
320+ }
321+ }
322+
323+ cleanup_and_return :
324+ return res ;
325+ }
326+
327+ /*
328+ * allgather_sched_recursivedoubling
329+ *
330+ * Description: an implementation of Iallgather using recursive doubling algorithm
331+ * Limitation: power-of-two number of processes only
332+ * Time: O(log(comm_size))
333+ * Schedule length (rounds): O(log(comm_size))
334+ * Memory: no additional memory requirements beyond user-supplied buffers.
335+ *
336+ * Example on 4 nodes:
337+ * Initialization: everyone has its own buffer at location rank in rbuf
338+ * # 0 1 2 3
339+ * [0] [ ] [ ] [ ]
340+ * [ ] [1] [ ] [ ]
341+ * [ ] [ ] [2] [ ]
342+ * [ ] [ ] [ ] [3]
343+ * Step 0: exchange data with (rank ^ 2^0)
344+ * # 0 1 2 3
345+ * [0] [0] [ ] [ ]
346+ * [1] [1] [ ] [ ]
347+ * [ ] [ ] [2] [2]
348+ * [ ] [ ] [3] [3]
349+ * Step 1: exchange data with (rank ^ 2^1) (if you can)
350+ * # 0 1 2 3
351+ * [0] [0] [0] [0]
352+ * [1] [1] [1] [1]
353+ * [2] [2] [2] [2]
354+ * [3] [3] [3] [3]
355+ *
356+ */
357+ static inline int allgather_sched_recursivedoubling (
358+ int rank , int comm_size , NBC_Schedule * schedule , const void * sbuf ,
359+ int scount , struct ompi_datatype_t * sdtype , void * rbuf , int rcount ,
360+ struct ompi_datatype_t * rdtype )
361+ {
362+ int res = OMPI_SUCCESS ;
363+ ptrdiff_t rlb , rext ;
364+ char * tmpsend = NULL , * tmprecv = NULL ;
365+
366+ res = ompi_datatype_get_extent (rdtype , & rlb , & rext );
367+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
368+
369+ int sendblocklocation = rank ;
370+ for (int distance = 1 ; distance < comm_size ; distance <<= 1 ) {
371+ int remote = rank ^ distance ;
372+
373+ tmpsend = (char * )rbuf + (ptrdiff_t )sendblocklocation * (ptrdiff_t )rcount * rext ;
374+ if (rank < remote ) {
375+ tmprecv = (char * )rbuf + (ptrdiff_t )(sendblocklocation + distance ) * (ptrdiff_t )rcount * rext ;
376+ } else {
377+ tmprecv = (char * )rbuf + (ptrdiff_t )(sendblocklocation - distance ) * (ptrdiff_t )rcount * rext ;
378+ sendblocklocation -= distance ;
379+ }
380+
381+ res = NBC_Sched_send (tmpsend , false, (ptrdiff_t )distance * (ptrdiff_t )rcount ,
382+ rdtype , remote , schedule , false);
383+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
384+
385+ res = NBC_Sched_recv (tmprecv , false, (ptrdiff_t )distance * (ptrdiff_t )rcount ,
386+ rdtype , remote , schedule , true);
387+ if (OPAL_UNLIKELY (OMPI_SUCCESS != res )) { goto cleanup_and_return ; }
388+ }
389+
390+ cleanup_and_return :
391+ return res ;
392+ }
393+
273394int ompi_coll_libnbc_allgather_init (const void * sendbuf , int sendcount , MPI_Datatype sendtype , void * recvbuf , int recvcount ,
274395 MPI_Datatype recvtype , struct ompi_communicator_t * comm , MPI_Info info , ompi_request_t * * request ,
275396 struct mca_coll_base_module_2_3_0_t * module ) {
0 commit comments