|
1 | 1 | __all__ = [ |
2 | | - # "mpi_allgather", |
| 2 | + "mpi_allgather", |
3 | 3 | "mpi_allreduce", |
4 | 4 | # "mpi_bcast", |
5 | 5 | # "mpi_asarray", |
6 | 6 | "mpi_send", |
7 | | - # "mpi_recv", |
| 7 | + "mpi_recv", |
| 8 | + "_prepare_allgather_inputs", |
| 9 | + "_unroll_allgather_recv" |
8 | 10 | ] |
9 | 11 |
|
10 | | -from typing import Optional |
| 12 | +from typing import Optional, Tuple |
11 | 13 |
|
12 | 14 | import numpy as np |
13 | 15 | from mpi4py import MPI |
14 | 16 | from pylops.utils.backend import get_module |
15 | 17 | from pylops_mpi.utils import deps |
16 | 18 |
|
| 19 | +# TODO: return type annotation for both cupy and numpy |
| 20 | +def _prepare_allgather_inputs(send_buf, send_buf_shapes, engine): |
| 21 | + r""" Prepare send_buf and recv_buf for NCCL allgather (nccl_allgather) |
| 22 | +
|
| 23 | + Buffered Allgather (MPI and NCCL) requires the send buffer to have the same size on every rank.
| 24 | + Therefore, padding is required when the array is not evenly partitioned across
| 25 | + all the ranks. The padding is applied such that each dimension of the send buffer
| 26 | + equals the maximum size of that dimension across all ranks.
| 27 | +
|
| 28 | + Similarly, the receive buffer (recv_buf) is created with size :math:`n_{rank} \cdot m`, where :math:`m` is the size of the padded send_buf.
| 29 | +
|
| 30 | + Parameters |
| 31 | + ---------- |
| 32 | + send_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray` or array-like
| 33 | + The data buffer of the local rank to be sent in the Allgather.
| 34 | + send_buf_shapes : :obj:`list`
| 35 | + A list with the shape of send_buf on each rank (used to calculate the padding size)
| 36 | + engine : :obj:`str` |
| 37 | + Engine used to store array (``numpy`` or ``cupy``) |
| 38 | +
|
| 39 | + Returns |
| 40 | + ------- |
| 41 | + send_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
| 42 | + A buffer containing the data and padding elements to be sent by this rank.
| 43 | + recv_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
| 44 | + An empty, padded buffer to gather data from all ranks.
| 45 | + """ |
| 46 | + ncp = get_module(engine) |
| 47 | + sizes_each_dim = list(zip(*send_buf_shapes)) |
| 48 | + send_shape = tuple(map(max, sizes_each_dim)) |
| 49 | + pad_size = [ |
| 50 | + (0, s_shape - l_shape) for s_shape, l_shape in zip(send_shape, send_buf.shape) |
| 51 | + ] |
| 52 | + |
| 53 | + send_buf = ncp.pad( |
| 54 | + send_buf, pad_size, mode="constant", constant_values=0 |
| 55 | + ) |
| 56 | + |
| 57 | + ndev = len(send_buf_shapes) |
| 58 | + recv_buf = ncp.zeros(ndev * send_buf.size, dtype=send_buf.dtype) |
| 59 | + |
| 60 | + return send_buf, recv_buf |
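A minimal sketch of the padding behaviour for unevenly partitioned 1-D arrays (single-process illustration; the import path below is hypothetical and depends on where this module lives):

```python
import numpy as np

# hypothetical import path for the helper defined above
from pylops_mpi.utils._mpi import _prepare_allgather_inputs

# rank 0 holds 3 elements, rank 1 holds 2: both send buffers are padded to length 3
shapes = [(3,), (2,)]
local_chunk = np.array([4., 5.])  # the chunk owned by rank 1

padded_send, recv_buf = _prepare_allgather_inputs(local_chunk, shapes, engine="numpy")
print(padded_send)     # [4. 5. 0.] -> zero-padded to the max shape (3,)
print(recv_buf.shape)  # (6,)      -> n_rank * padded send size
```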
| 61 | + |
| 62 | + |
| 63 | +def _unroll_allgather_recv(recv_buf, padded_send_buf_shape, send_buf_shapes) -> list: |
| 64 | + r"""Unrolll recv_buf after Buffered Allgather (MPI and NCCL) |
| 65 | +
|
| 66 | + Remove the padding elements from recv_buf, extract the individual array of each rank, and return them as a list of arrays.
| 67 | + Each rank may send an array with a different shape, so the return type has to be a list of arrays
| 68 | + instead of a single concatenated array.
| 69 | +
|
| 70 | + Parameters |
| 71 | + ---------- |
| 72 | + recv_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray` or array-like
| 73 | + The data buffer returned from the Allgather call
| 74 | + padded_send_buf_shape : :obj:`tuple`
| 75 | + The shape of send_buf after the padding applied before the Allgather
| 76 | + send_buf_shapes : :obj:`list`
| 77 | + A list with the original (unpadded) shape of send_buf on each rank
| 78 | +
|
| 79 | + Returns |
| 80 | + ------- |
| 81 | + chunks : :obj:`list`
| 82 | + A list with one array per rank, with the padding elements removed
| 83 | + """ |
| 84 | + ndev = len(send_buf_shapes) |
| 85 | + # extract an individual array from each device |
| 86 | + chunk_size = np.prod(padded_send_buf_shape) |
| 87 | + chunks = [ |
| 88 | + recv_buf[i * chunk_size:(i + 1) * chunk_size] for i in range(ndev) |
| 89 | + ] |
| 90 | + |
| 91 | + # Remove padding from each chunk: the padding values may sit in the middle
| 92 | + # of the flattened buffer, so each chunk is reshaped and sliced along every dimension
| 93 | + for i in range(ndev): |
| 94 | + slicing = tuple(slice(0, end) for end in send_buf_shapes[i]) |
| 95 | + chunks[i] = chunks[i].reshape(padded_send_buf_shape)[slicing] |
| 96 | + |
| 97 | + return chunks |
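To see how the two helpers fit together, a sketch that mimics a 2-rank buffered Allgather by concatenating the padded buffers locally (no MPI involved; import path hypothetical):

```python
import numpy as np

# hypothetical import path for the helpers defined above
from pylops_mpi.utils._mpi import _prepare_allgather_inputs, _unroll_allgather_recv

shapes = [(3,), (2,)]                                    # original per-rank shapes
chunks_in = [np.array([1., 2., 3.]), np.array([4., 5.])]

# pad every chunk to the common shape, as each rank would do before the Allgather
padded = [_prepare_allgather_inputs(x, shapes, engine="numpy")[0] for x in chunks_in]

# emulate the flat recv_buf that a buffered Allgather would produce
recv_buf = np.concatenate(padded)

chunks_out = _unroll_allgather_recv(recv_buf, padded[0].shape, shapes)
print(chunks_out)  # [array([1., 2., 3.]), array([4., 5.])] -> padding removed
```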
17 | 98 |
|
18 | 99 | def mpi_allreduce(base_comm: MPI.Comm, |
19 | 100 | send_buf, recv_buf=None, |
@@ -57,7 +138,27 @@ def mpi_allreduce(base_comm: MPI.Comm, |
57 | 138 | # For MIN and MAX which require recv_buf |
58 | 139 | base_comm.Allreduce(send_buf, recv_buf, op) |
59 | 140 | return recv_buf |
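For context, the MIN/MAX path relies on the buffered mpi4py reduction, which writes the result into a caller-provided receive buffer, e.g.:

```python
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
send = np.array([float(comm.Get_rank())])
recv = np.zeros_like(send)              # buffered Allreduce needs an explicit recv_buf
comm.Allreduce(send, recv, op=MPI.MIN)
print(recv)                             # [0.] on every rank (the global minimum)
```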
60 | | - |
| 141 | + |
| 142 | + |
| 143 | +def mpi_allgather(base_comm: MPI.Comm, |
| 144 | + send_buf, recv_buf=None, |
| 145 | + engine: Optional[str] = "numpy", |
| 146 | + ) -> np.ndarray: |
| 147 | + |
| 148 | + if deps.cuda_aware_mpi_enabled or engine == "numpy": |
| 149 | + send_shapes = base_comm.allgather(send_buf.shape) |
| 150 | + padded_send, padded_recv = _prepare_allgather_inputs(send_buf, send_shapes, engine=engine)
| 151 | + recv_buffer_to_use = recv_buf if recv_buf is not None else padded_recv
| 152 | + base_comm.Allgather(padded_send, recv_buffer_to_use) |
| 153 | + return _unroll_allgather_recv(recv_buffer_to_use, padded_send.shape, send_shapes) |
| 154 | + |
| 155 | + else: |
| 156 | + # CuPy with non-CUDA-aware MPI |
| 157 | + if recv_buf is None: |
| 158 | + return base_comm.allgather(send_buf) |
| 159 | + base_comm.Allgather(send_buf, recv_buf) |
| 160 | + return recv_buf |
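A usage sketch of the new mpi_allgather wrapper with unevenly sized local arrays (run under mpiexec; the import path is hypothetical):

```python
# run e.g. with: mpiexec -n 2 python allgather_example.py
import numpy as np
from mpi4py import MPI

# hypothetical import path for the wrapper added in this diff
from pylops_mpi.utils._mpi import mpi_allgather

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# rank 0 owns 3 elements, the other ranks own 2: shapes differ across ranks
local = np.arange(3, dtype=np.float64) if rank == 0 else np.arange(2, dtype=np.float64)

# with numpy (or CUDA-aware MPI) the buffered path is taken and a list with
# one array per rank is returned, padding already stripped
gathered = mpi_allgather(comm, local, engine="numpy")
if rank == 0:
    print([g.shape for g in gathered])  # [(3,), (2,)] for two ranks
```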
| 161 | + |
61 | 162 |
|
62 | 163 | def mpi_send(base_comm: MPI.Comm, |
63 | 164 | send_buf, dest, count, tag=0, |
|