 __all__ = [
+    "_prepare_nccl_allgather_inputs",
+    "_unroll_nccl_allgather_recv",
     "initialize_nccl_comm",
     "nccl_split",
     "nccl_allgather",
     "nccl_asarray",
     "nccl_send",
     "nccl_recv",
-    "_prepare_nccl_allgather_inputs",
-    "_unroll_nccl_allgather_recv"
 ]
 
 from enum import IntEnum
+from typing import Tuple
 from mpi4py import MPI
 import os
 import numpy as np
 import cupy as cp
 import cupy.cuda.nccl as nccl
 
+
 cupy_to_nccl_dtype = {
     "float32": nccl.NCCL_FLOAT32,
     "float64": nccl.NCCL_FLOAT64,
@@ -61,6 +63,85 @@ def _nccl_buf_size(buf, count=None):
     return count if count else buf.size
 
 
+def _prepare_nccl_allgather_inputs(send_buf, send_buf_shapes) -> Tuple[cp.ndarray, cp.ndarray]:
+    r"""Prepare send_buf and recv_buf for NCCL allgather (nccl_allgather)
+
+    NCCL's allGather requires the sending buffer to have the same size for every device.
+    Therefore, padding is required when the array is not evenly partitioned across
+    all the ranks. The padding is applied such that each dimension of the sending buffer
+    equals the maximum size of that dimension across all ranks.
+
+    Similarly, each receiving buffer (recv_buf) is created with size equal to
+    :math:`n_{rank} \cdot send\_buf.size`
+
+    Parameters
+    ----------
+    send_buf : :obj:`cupy.ndarray` or array-like
+        The data buffer from the local GPU to be sent for allgather.
+    send_buf_shapes : :obj:`list`
+        A list of shapes for each GPU's send_buf (used to calculate the padding size)
+
+    Returns
+    -------
+    send_buf : :obj:`cupy.ndarray`
+        A buffer containing the data and padded elements to be sent by this rank.
+    recv_buf : :obj:`cupy.ndarray`
+        An empty, padded buffer to gather data from all GPUs.
+    """
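+    # Common send shape: the maximum extent of every dimension across all ranks
+    # (e.g. local shapes (3, 5) and (2, 5) give a common send shape of (3, 5))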
+    sizes_each_dim = list(zip(*send_buf_shapes))
+    send_shape = tuple(map(max, sizes_each_dim))
+    pad_size = [
+        (0, s_shape - l_shape) for s_shape, l_shape in zip(send_shape, send_buf.shape)
+    ]
+
+    send_buf = cp.pad(
+        send_buf, pad_size, mode="constant", constant_values=0
+    )
+
+    # NCCL recommends using one MPI process per GPU, so the size of the receiving
+    # buffer can be inferred
+    ndev = len(send_buf_shapes)
+    recv_buf = cp.zeros(ndev * send_buf.size, dtype=send_buf.dtype)
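+    # recv_buf is flat (1-D); _unroll_nccl_allgather_recv later reshapes each
+    # per-rank segment back to the padded shape and strips the padding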
+
+    return send_buf, recv_buf
+
+
+def _unroll_nccl_allgather_recv(recv_buf, padded_send_buf_shape, send_buf_shapes) -> list:
+    """Unroll recv_buf after NCCL allgather (nccl_allgather)
+
+    Remove the padded elements in recv_buf, extract the individual array from each device,
+    and return them as a list of arrays. Each GPU may send an array with a different shape,
+    so the return type has to be a list of arrays instead of a single concatenated array.
+
+    Parameters
+    ----------
+    recv_buf : :obj:`cupy.ndarray` or array-like
+        The data buffer returned from the nccl_allgather call
+    padded_send_buf_shape : :obj:`tuple` of :obj:`int`
+        The shape of send_buf after the padding applied in nccl_allgather
+    send_buf_shapes : :obj:`list`
+        A list of original shapes for each GPU's send_buf prior to padding
+
+    Returns
+    -------
+    chunks : :obj:`list`
+        A list of :obj:`cupy.ndarray` from each GPU with the padded elements removed
+    """
+
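+    # Example: with padded_send_buf_shape (3, 5) and send_buf_shapes [(3, 5), (2, 5)],
+    # recv_buf holds 2 * 15 elements; each chunk of 15 is reshaped to (3, 5) and the
+    # second chunk is then sliced back down to (2, 5)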
+    ndev = len(send_buf_shapes)
+    # extract an individual array from each device
+    chunk_size = np.prod(padded_send_buf_shape)
+    chunks = [
+        recv_buf[i * chunk_size:(i + 1) * chunk_size] for i in range(ndev)
+    ]
+
+    # Remove padding from each array: the padded values may appear somewhere in the
+    # middle of the flat array, so a reshape followed by slicing along each dimension
+    # is required
+    for i in range(ndev):
+        slicing = tuple(slice(0, end) for end in send_buf_shapes[i])
+        chunks[i] = chunks[i].reshape(padded_send_buf_shape)[slicing]
+
+    return chunks
+
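+# The two helpers above are intended to wrap nccl_allgather, as done in
+# nccl_asarray below (a minimal sketch):
+#
+#   send_buf, recv_buf = _prepare_nccl_allgather_inputs(local_array, local_shapes)
+#   nccl_allgather(nccl_comm, send_buf, recv_buf)
+#   chunks = _unroll_nccl_allgather_recv(recv_buf, send_buf.shape, local_shapes)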
+
 def mpi_op_to_nccl(mpi_op) -> NcclOp:
     """ Map MPI reduction operation to NCCL equivalent
 
@@ -253,83 +334,6 @@ def nccl_bcast(nccl_comm, local_array, index, value) -> None:
     )
 
 
-def _prepare_nccl_allgather_inputs(send_buf, send_buf_shapes) -> tuple[cp.ndarray, cp.ndarray]:
-    """ Preparing the send_buf and recv_buf for the NCCL allgather (nccl_allgather)
-
-    NCCL's allGather requires the sending buffer to have the same size for every device.
-    Therefore, the padding is required when the array is not evenly partitioned across
-    all the ranks. The padding is applied such that the sending buffer has the size of
-    each dimension corresponding to the max possible size of that dimension.
-
-    Receiver buff (recv_buf) will have the size n_rank * send_buf.size
-
-    Parameters
-    ----------
-    send_buf : :obj:`cupy.ndarray` or array-like
-        The data buffer from the local GPU to be sent for allgather.
-    send_buf_shapes: :obj:`list`
-        A list of shapes for each GPU send_buf (used to calculate padding size)
-
-    Returns
-    -------
-    tuple[send_buf, recv_buf]: :obj:`tuple`
-        A tuple of (send_buf, recv_buf) will an appropriate size, shape and dtype for NCCL allgather
-
-    """
-    sizes_each_dim = list(zip(*send_buf_shapes))
-    send_shape = tuple(map(max, sizes_each_dim))
-    pad_size = [
-        (0, s_shape - l_shape) for s_shape, l_shape in zip(send_shape, send_buf.shape)
-    ]
-
-    send_buf = cp.pad(
-        send_buf, pad_size, mode="constant", constant_values=0
-    )
-
-    # NCCL recommends to use one MPI Process per GPU and so size of receiving buffer can be inferred
-    ndev = len(send_buf_shapes)
-    recv_buf = cp.zeros(ndev * send_buf.size, dtype=send_buf.dtype)
-
-    return (send_buf, recv_buf)
-
-
-def _unroll_nccl_allgather_recv(recv_buf, padded_send_buf_shape, send_buf_shapes) -> list:
-    """ Remove the padded elements in recv_buff, extract an individual array from each device and return them as a list of arrays
-
-    Each GPU may send array with a different shape, so the return type has to be a list of array
-    instead of the concatenated array.
-
-    Parameters
-    ----------
-    recv_buf: :obj:`cupy.ndarray` or array-like
-        The data buffer returned from nccl_allgather call
-    padded_send_buf_shape: :obj:`tuple`:int
-        The size of send_buf after padding used in nccl_allgather
-    send_buf_shapes: :obj:`list`
-        A list of original shapes for each GPU send_buf prior to padding
-
-    Returns
-    -------
-    chunks: :obj:`list`
-        A list of `cupy.ndarray` from each GPU with the padded element removed
-    """
-
-    ndev = len(send_buf_shapes)
-    # extract an individual array from each device
-    chunk_size = np.prod(padded_send_buf_shape)
-    chunks = [
-        recv_buf[i * chunk_size:(i + 1) * chunk_size] for i in range(ndev)
-    ]
-
-    # Remove padding from each array: the padded value may appear somewhere
-    # in the middle of the flat array and thus the reshape and slicing for each dimension is required
-    for i in range(ndev):
-        slicing = tuple(slice(0, end) for end in send_buf_shapes[i])
-        chunks[i] = chunks[i].reshape(padded_send_buf_shape)[slicing]
-
-    return chunks
-
-
 def nccl_asarray(nccl_comm, local_array, local_shapes, axis) -> cp.ndarray:
     """Global view of the array
 
@@ -352,7 +356,7 @@ def nccl_asarray(nccl_comm, local_array, local_shapes, axis) -> cp.ndarray:
         Global array gathered from all GPUs and concatenated along `axis`.
     """
 
-    (send_buf, recv_buf) = _prepare_nccl_allgather_inputs(local_array, local_shapes)
+    send_buf, recv_buf = _prepare_nccl_allgather_inputs(local_array, local_shapes)
     nccl_allgather(nccl_comm, send_buf, recv_buf)
     chunks = _unroll_nccl_allgather_recv(recv_buf, send_buf.shape, local_shapes)
 