@@ -341,7 +341,7 @@ def local_shapes(self):
             return self._nccl_local_shapes(False)
         else:
             return self._allgather(self.base_comm,
-                                    self.base_comm_nccl,
+                                    self.base_comm_nccl,
                                     self.local_shape)

     @property
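For orientation (not part of the commit itself): local_shapes gathers every rank's local_shape tuple. The MPI branch above is conceptually similar to the following standalone sketch using plain mpi4py; the shapes and script name are hypothetical, and this is not the pylops-mpi API.

# Sketch only: gather each rank's local shape with plain mpi4py
# (run with e.g. `mpiexec -n 4 python sketch.py`).
from mpi4py import MPI

comm = MPI.COMM_WORLD
local_shape = (10 + comm.Get_rank(), 5)      # hypothetical per-rank shape
local_shapes = comm.allgather(local_shape)   # list of tuples, one per rank
if comm.Get_rank() == 0:
    print(local_shapes)                      # e.g. [(10, 5), (11, 5), (12, 5), (13, 5)]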
@@ -383,7 +383,7 @@ def asarray(self, masked: bool = False):
             final_array = self._allgather_subcomm(self.local_array)
         else:
             final_array = self._allgather(self.base_comm,
-                                          self.base_comm_nccl,
+                                          self.base_comm_nccl,
                                           self.local_array,
                                           engine=self.engine)
         return np.concatenate(final_array, axis=self.axis)
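As a reference for the hunk above: asarray gathers each rank's local chunk and concatenates the pieces along the distributed axis. A minimal sketch of that gather-and-concatenate step with plain mpi4py/NumPy (hypothetical data, not the library call itself):

# Sketch only: reproduce the gather-and-concatenate step of asarray.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
axis = 0                                         # axis along which data is split
local_array = np.full((2, 3), comm.Get_rank())   # hypothetical local chunk
pieces = comm.allgather(local_array)             # list of ndarrays, one per rank
global_array = np.concatenate(pieces, axis=axis) # identical result on every rank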
@@ -484,7 +484,7 @@ def _nccl_local_shapes(self, masked: bool):
             all_tuples = self._allgather_subcomm(self.local_shape).get()
         else:
             all_tuples = self._allgather(self.base_comm,
-                                         self.base_comm_nccl,
+                                         self.base_comm_nccl,
                                          self.local_shape).get()
         # NCCL returns the flat array that packs every tuple as 1-dimensional array
         # unpack each tuple from each rank
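The comment above notes that the NCCL-style allgather returns one flat buffer packing every rank's shape tuple. A host-side sketch of the unpacking it describes, with hypothetical values and no NCCL dependency:

# Sketch only: unpack a flat allgather result of nranks * ndim entries
# back into one shape tuple per rank.
import numpy as np

nranks, ndim = 3, 2
flat = np.array([10, 5, 11, 5, 12, 5])   # packed shapes from 3 ranks (hypothetical)
local_shapes = [tuple(int(v) for v in row) for row in flat.reshape(nranks, ndim)]
print(local_shapes)                       # [(10, 5), (11, 5), (12, 5)]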
@@ -625,12 +625,12 @@ def _compute_vector_norm(self, local_array: NDArray,
                 # CuPy + non-CUDA-aware MPI: This will call non-buffered communication
                 # which return a list of object - must be copied back to a GPU memory.
                 recv_buf = self._allreduce_subcomm(self.sub_comm, self.base_comm_nccl,
-                                                   send_buf.get(), recv_buf.get(),
+                                                   send_buf.get(), recv_buf.get(),
                                                    op=MPI.MAX, engine=self.engine)
                 recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
             else:
                 recv_buf = self._allreduce_subcomm(self.sub_comm, self.base_comm_nccl,
-                                                   send_buf, recv_buf, op=MPI.MAX,
+                                                   send_buf, recv_buf, op=MPI.MAX,
                                                    engine=self.engine)
             # TODO (tharitt): In current implementation, there seems to be a semantic difference between Buffered MPI and NCCL
             # the (1, size) is collapsed to (size, ) with buffered MPI while NCCL retains it.
@@ -643,18 +643,18 @@ def _compute_vector_norm(self, local_array: NDArray,
             send_buf = ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64)
             if self.engine == "cupy" and self.base_comm_nccl is None and not deps.cuda_aware_mpi_enabled:
                 recv_buf = self._allreduce_subcomm(self.sub_comm, self.base_comm_nccl,
-                                                   send_buf.get(), recv_buf.get(),
+                                                   send_buf.get(), recv_buf.get(),
                                                    op=MPI.MIN, engine=self.engine)
                 recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
             else:
                 recv_buf = self._allreduce_subcomm(self.sub_comm, self.base_comm_nccl,
-                                                   send_buf, recv_buf,
+                                                   send_buf, recv_buf,
                                                    op=MPI.MIN, engine=self.engine)
                 if self.base_comm_nccl:
                     recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
         else:
             recv_buf = self._allreduce_subcomm(self.sub_comm, self.base_comm_nccl,
-                                               ncp.sum(ncp.abs(ncp.float_power(local_array, ord)), axis=axis),
+                                               ncp.sum(ncp.abs(ncp.float_power(local_array, ord)), axis=axis),
                                                engine=self.engine)
             recv_buf = ncp.power(recv_buf, 1.0 / ord)
         return recv_buf
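For context on the hunk above: the norm computation reduces per-rank partial results across workers, using MAX for the inf-norm, MIN for the -inf-norm, and a SUM of |x|**ord followed by a 1/ord power otherwise. A CPU-only sketch of that reduction pattern with plain mpi4py/NumPy (no CuPy/NCCL branches; the data and the ord_ variable are hypothetical):

# Sketch only: distributed vector-norm reduction pattern.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
local_array = np.random.default_rng(comm.Get_rank()).standard_normal(100)
ord_ = 2

if ord_ == np.inf:
    norm = comm.allreduce(np.max(np.abs(local_array)), op=MPI.MAX)
elif ord_ == -np.inf:
    norm = comm.allreduce(np.min(np.abs(local_array)), op=MPI.MIN)
else:
    partial = np.sum(np.abs(local_array) ** ord_)            # local sum of |x|^ord
    norm = comm.allreduce(partial, op=MPI.SUM) ** (1.0 / ord_)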