
Commit b567f86

support nccl in add_ghost_cells and NCCL-VStack
1 parent 3585f36 commit b567f86

3 files changed: 156 additions & 5 deletions
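
For orientation, below is a minimal usage sketch of the NCCL-enabled MPIVStack introduced by this commit. It assumes CuPy and NCCL are installed and one MPI process is launched per GPU, and it mirrors the pattern exercised in tests_nccl/test_stack_nccl.py further down; it is a sketch, not part of the commit itself.

# Sketch: NCCL-backed MPIVStack (assumes CuPy + NCCL, one MPI rank per GPU)
import cupy as cp
import numpy as np
from mpi4py import MPI

import pylops
import pylops_mpi
from pylops_mpi.utils._nccl import initialize_nccl_comm

nccl_comm = initialize_nccl_comm()   # one NCCL communicator per MPI rank/GPU
rank = MPI.COMM_WORLD.Get_rank()

# each rank contributes one block to the vertical stack
A = (rank + 1) * cp.ones((101, 101), dtype=np.float64)
Op = pylops.MatrixMult(A)
VStack = pylops_mpi.MPIVStack(ops=[Op], base_comm_nccl=nccl_comm)

# NCCL-backed, CuPy-engined model vector broadcast to all ranks
x = pylops_mpi.DistributedArray(global_shape=101, base_comm_nccl=nccl_comm,
                                partition=pylops_mpi.Partition.BROADCAST,
                                engine="cupy", dtype=np.float64)
x[:] = cp.ones(101, dtype=np.float64)

y = VStack @ x        # forward: per-rank block products, no reduction needed
xadj = VStack.H @ y   # adjoint: local rmatvec followed by NCCL allreduce (this commit)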

pylops_mpi/DistributedArray.py

Lines changed: 14 additions & 2 deletions

@@ -748,7 +748,13 @@ def add_ghost_cells(self, cells_front: Optional[int] = None,
         """
         ghosted_array = self.local_array.copy()
         if cells_front is not None:
-            total_cells_front = self._allgather(cells_front) + [0]
+            # TODO: these are metadata (small size). Under the current API this
+            # will call the NCCL allgather; should we force it to always use MPI?
+            cells_fronts = self._allgather(cells_front)
+            if deps.nccl_enabled and getattr(self, "base_comm_nccl"):
+                total_cells_front = cells_fronts.tolist() + [0]
+            else:
+                total_cells_front = cells_fronts + [0]
             # Read cells_front which needs to be sent to rank + 1 (cells_front for rank + 1)
             cells_front = total_cells_front[self.rank + 1]
             if self.rank != 0:
@@ -761,10 +767,16 @@ def add_ghost_cells(self, cells_front: Optional[int] = None,
                                      f"{self.local_shape[self.axis]} < {cells_front}; "
                                      f"to achieve this use NUM_PROCESSES <= "
                                      f"{max(1, self.global_shape[self.axis] // cells_front)}")
+                # TODO: this array may be large. Currently it will always use MPI.
+                # Should we enable NCCL point-to-point here?
                 self.base_comm.send(np.take(self.local_array, np.arange(-cells_front, 0), axis=self.axis),
                                     dest=self.rank + 1, tag=1)
         if cells_back is not None:
-            total_cells_back = self._allgather(cells_back) + [0]
+            cells_backs = self._allgather(cells_back)
+            if deps.nccl_enabled and getattr(self, "base_comm_nccl"):
+                total_cells_back = cells_backs.tolist() + [0]
+            else:
+                total_cells_back = cells_backs + [0]
             # Read cells_back which needs to be sent to rank - 1 (cells_back for rank - 1)
             cells_back = total_cells_back[self.rank - 1]
             if self.rank != 0:
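
The branching above exists because the two allgather backends return different types: mpi4py's allgather of a scalar yields a Python list, so `+ [0]` appends an element, while the NCCL path returns a device array that must be converted with .tolist() first. A small illustrative sketch (assuming a CuPy-capable environment; the literal values are made up):

# Why the NCCL branch needs .tolist() before appending the trailing 0
import cupy as cp

mpi_style = [3, 3, 3]                  # MPI allgather of a scalar -> Python list
print(mpi_style + [0])                 # [3, 3, 3, 0], list concatenation

nccl_style = cp.asarray([3, 3, 3])     # NCCL allgather -> device (CuPy) array
print(nccl_style.tolist() + [0])       # [3, 3, 3, 0], same result after .tolist()
# without .tolist(), `nccl_style + [0]` would not append a trailing zero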

pylops_mpi/basicoperators/VStack.py

Lines changed: 20 additions & 3 deletions

@@ -6,6 +6,7 @@
 from pylops import LinearOperator
 from pylops.utils import DTypeLike
 from pylops.utils.backend import get_module
+from pylops.utils import deps as pylops_deps  # avoid namespace crashes with pylops_mpi.utils

 from pylops_mpi import (
     MPILinearOperator,
@@ -15,6 +16,14 @@
     StackedDistributedArray
 )
 from pylops_mpi.utils.decorators import reshaped
+from pylops_mpi.DistributedArray import NcclCommunicatorType
+from pylops_mpi.utils import deps
+
+cupy_message = pylops_deps.cupy_import("the DistributedArray module")
+nccl_message = deps.nccl_import("the DistributedArray module")
+
+if nccl_message is None and cupy_message is None:
+    from pylops_mpi.utils._nccl import nccl_allreduce


 class MPIVStack(MPILinearOperator):
@@ -31,6 +40,8 @@ class MPIVStack(MPILinearOperator):
         One or more :class:`pylops.LinearOperator` to be vertically stacked.
     base_comm : :obj:`mpi4py.MPI.Comm`, optional
         Base MPI Communicator. Defaults to ``mpi4py.MPI.COMM_WORLD``.
+    base_comm_nccl : :obj:`cupy.cuda.nccl.NcclCommunicator`, optional
+        NCCL Communicator over which operators and arrays are distributed.
     dtype : :obj:`str`, optional
         Type of elements in input array.

@@ -99,8 +110,10 @@ class MPIVStack(MPILinearOperator):

     def __init__(self, ops: Sequence[LinearOperator],
                  base_comm: MPI.Comm = MPI.COMM_WORLD,
+                 base_comm_nccl: NcclCommunicatorType = None,
                  dtype: Optional[DTypeLike] = None):
         self.ops = ops
+        self.base_comm_nccl = base_comm_nccl
         nops = np.zeros(len(self.ops), dtype=np.int64)
         for iop, oper in enumerate(self.ops):
             nops[iop] = oper.shape[0]
@@ -121,7 +134,8 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
         if x.partition not in [Partition.BROADCAST, Partition.UNSAFE_BROADCAST]:
             raise ValueError(f"x should have partition={Partition.BROADCAST},{Partition.UNSAFE_BROADCAST}"
                              f"Got {x.partition} instead...")
-        y = DistributedArray(global_shape=self.shape[0], local_shapes=self.local_shapes_n,
+        # the output y should use NCCL if the operand x uses it
+        y = DistributedArray(global_shape=self.shape[0], base_comm_nccl=x.base_comm_nccl, local_shapes=self.local_shapes_n,
                              engine=x.engine, dtype=self.dtype)
         y1 = []
         for iop, oper in enumerate(self.ops):
@@ -132,13 +146,16 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
     @reshaped(forward=False, stacking=True)
     def _rmatvec(self, x: DistributedArray) -> DistributedArray:
         ncp = get_module(x.engine)
-        y = DistributedArray(global_shape=self.shape[1], partition=Partition.BROADCAST,
+        y = DistributedArray(global_shape=self.shape[1], base_comm_nccl=x.base_comm_nccl, partition=Partition.BROADCAST,
                              engine=x.engine, dtype=self.dtype)
         y1 = []
         for iop, oper in enumerate(self.ops):
             y1.append(oper.rmatvec(x.local_array[self.nnops[iop]: self.nnops[iop + 1]]))
         y1 = ncp.sum(ncp.vstack(y1), axis=0)
-        y[:] = self.base_comm.allreduce(y1, op=MPI.SUM)
+        if deps.nccl_enabled and self.base_comm_nccl:
+            y[:] = nccl_allreduce(self.base_comm_nccl, y1, op=MPI.SUM)
+        else:
+            y[:] = self.base_comm.allreduce(y1, op=MPI.SUM)
         return y
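
The adjoint now chooses the reduction backend at run time: NCCL when the operator carries an NCCL communicator (the partial sums are then CuPy arrays on the GPU), plain mpi4py allreduce otherwise. A standalone sketch of that dispatch, where the helper name reduce_partials and its arguments are illustrative rather than library API:

# Sketch of the reduction dispatch used in _rmatvec above (hypothetical helper)
from mpi4py import MPI

def reduce_partials(partial_sum, nccl_comm=None, mpi_comm=MPI.COMM_WORLD):
    """Sum per-rank partial adjoints across all ranks."""
    if nccl_comm is not None:
        # NCCL path: partial_sum is a CuPy array living on this rank's GPU
        from pylops_mpi.utils._nccl import nccl_allreduce
        return nccl_allreduce(nccl_comm, partial_sum, op=MPI.SUM)
    # MPI fallback: the behaviour prior to this commit
    return mpi_comm.allreduce(partial_sum, op=MPI.SUM)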

tests_nccl/test_stack_nccl.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
"""Test the stacking classes
    Designed to run with n GPUs (with 1 MPI process per GPU)
    $ mpiexec -n 10 pytest test_stack_nccl.py --with-mpi

    This file employs the same test sets as test_stack under NCCL environment
"""
import numpy as np
import cupy as cp
from numpy.testing import assert_allclose
from mpi4py import MPI
import pytest

import pylops
import pylops_mpi
from pylops_mpi.utils.dottest import dottest
from pylops_mpi.utils._nccl import initialize_nccl_comm

nccl_comm = initialize_nccl_comm()

# imag part is left to future complex-number support
par1 = {'ny': 101, 'nx': 101, 'imag': 0, 'dtype': np.float64}
par2 = {'ny': 301, 'nx': 101, 'imag': 0, 'dtype': np.float64}


@pytest.mark.mpi(min_size=2)
@pytest.mark.parametrize("par", [(par1), (par2)])
def test_vstack_nccl(par):
    """Test the MPIVStack operator with NCCL"""
    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    A_gpu = cp.ones(shape=(par['ny'], par['nx'])) + par['imag'] * cp.ones(shape=(par['ny'], par['nx']))
    Op = pylops.MatrixMult(A=((rank + 1) * A_gpu).astype(par['dtype']))
    VStack_MPI = pylops_mpi.MPIVStack(ops=[Op, ], base_comm_nccl=nccl_comm)

    # Broadcasted DistributedArray(global_shape == local_shape)
    x = pylops_mpi.DistributedArray(global_shape=par['nx'],
                                    base_comm_nccl=nccl_comm,
                                    partition=pylops_mpi.Partition.BROADCAST,
                                    dtype=par['dtype'],
                                    engine="cupy")
    x[:] = cp.ones(shape=par['nx'], dtype=par['dtype'])
    x_global = x.asarray()

    # Scattered DistributedArray
    y = pylops_mpi.DistributedArray(global_shape=size * par['ny'],
                                    base_comm_nccl=nccl_comm,
                                    partition=pylops_mpi.Partition.SCATTER,
                                    dtype=par['dtype'],
                                    engine="cupy")
    y[:] = cp.ones(shape=par['ny'], dtype=par['dtype'])
    y_global = y.asarray()

    # Forward
    x_mat = VStack_MPI @ x
    # Adjoint
    y_rmat = VStack_MPI.H @ y
    assert isinstance(x_mat, pylops_mpi.DistributedArray)
    assert isinstance(y_rmat, pylops_mpi.DistributedArray)
    # Dot test
    dottest(VStack_MPI, x, y, size * par['ny'], par['nx'])

    x_mat_mpi = x_mat.asarray()
    y_rmat_mpi = y_rmat.asarray()

    if rank == 0:
        A = A_gpu.get()
        ops = [pylops.MatrixMult(A=((i + 1) * A).astype(par['dtype'])) for i in range(size)]
        VStack = pylops.VStack(ops=ops)
        x_mat_np = VStack @ x_global.get()
        y_rmat_np = VStack.H @ y_global.get()
        assert_allclose(x_mat_mpi.get(), x_mat_np, rtol=1e-14)
        assert_allclose(y_rmat_mpi.get(), y_rmat_np, rtol=1e-14)


@pytest.mark.mpi(min_size=2)
@pytest.mark.parametrize("par", [(par1), (par2)])
def test_stacked_vstack_nccl(par):
    """Test the MPIStackedVStack operator with NCCL"""
    size = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    A_gpu = cp.ones(shape=(par['ny'], par['nx'])) + par['imag'] * cp.ones(shape=(par['ny'], par['nx']))
    Op = pylops.MatrixMult(A=((rank + 1) * A_gpu).astype(par['dtype']))
    VStack_MPI = pylops_mpi.MPIVStack(ops=[Op, ], base_comm_nccl=nccl_comm)
    StackedVStack_MPI = pylops_mpi.MPIStackedVStack([VStack_MPI, VStack_MPI])

    # Broadcasted DistributedArray(global_shape == local_shape)
    x = pylops_mpi.DistributedArray(global_shape=par['nx'],
                                    base_comm_nccl=nccl_comm,
                                    partition=pylops_mpi.Partition.BROADCAST,
                                    dtype=par['dtype'],
                                    engine="cupy")
    x[:] = cp.ones(shape=par['nx'], dtype=par['dtype'])
    x_global = x.asarray()

    # Stacked DistributedArray
    dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], base_comm_nccl=nccl_comm, dtype=par['dtype'], engine="cupy")
    dist1[:] = cp.ones(dist1.local_shape, dtype=par['dtype'])
    dist2 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], base_comm_nccl=nccl_comm, dtype=par['dtype'], engine="cupy")
    dist2[:] = cp.ones(dist1.local_shape, dtype=par['dtype'])
    y = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2])
    y_global = y.asarray()

    x_mat = StackedVStack_MPI @ x
    y_rmat = StackedVStack_MPI.H @ y
    assert isinstance(x_mat, pylops_mpi.StackedDistributedArray)
    assert isinstance(y_rmat, pylops_mpi.DistributedArray)

    x_mat_mpi = x_mat.asarray()
    y_rmat_mpi = y_rmat.asarray()

    if rank == 0:
        A = A_gpu.get()
        ops = [pylops.MatrixMult(A=((i + 1) * A).astype(par['dtype'])) for i in range(size)]
        VStack = pylops.VStack(ops=ops)
        VStack_final = pylops.VStack(ops=[VStack, VStack])
        x_mat_np = VStack_final @ x_global.get()
        y_rmat_np = VStack_final.H @ y_global.get()
        assert_allclose(x_mat_mpi.get(), x_mat_np, rtol=1e-14)
        assert_allclose(y_rmat_mpi.get(), y_rmat_np, rtol=1e-14)


# TODO: Test of HStack
