
Commit ca558fd

feat: WIP DistributedMix
A new DistributedMixIn class is created with the aim of simplifying and unifying all communication calls in both DistributedArray and the operators (further hiding away all implementation details).
1 parent 31068f9 commit ca558fd

File tree

4 files changed: +68, -63 lines changed


pylops_mpi/Distributed.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+from typing import Any, NewType
+
+from mpi4py import MPI
+from pylops.utils import deps as pylops_deps  # avoid namespace crashes with pylops_mpi.utils
+from pylops_mpi.utils._mpi import mpi_allreduce
+from pylops_mpi.utils import deps
+
+cupy_message = pylops_deps.cupy_import("the DistributedArray module")
+nccl_message = deps.nccl_import("the DistributedArray module")
+
+if nccl_message is None and cupy_message is None:
+    from pylops_mpi.utils._nccl import (
+        nccl_allgather, nccl_allreduce,
+        nccl_asarray, nccl_bcast, nccl_split, nccl_send, nccl_recv,
+        _prepare_nccl_allgather_inputs, _unroll_nccl_allgather_recv
+    )
+
+
+class DistributedMixIn:
+    r"""Distributed Mixin class
+
+    This class implements all methods associated with communication primitives
+    from MPI and NCCL. It is mostly in charge of identifying which communicator
+    to use and whether the buffered or object MPI primitives should be used
+    (the former in the case of NumPy arrays, or CuPy arrays when a CUDA-aware
+    MPI installation is available; the latter with CuPy arrays when a CUDA-aware
+    MPI installation is not available).
+    """
+    def _allreduce(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
+        """Allreduce operation
+        """
+        if deps.nccl_enabled and getattr(self, "base_comm_nccl"):
+            return nccl_allreduce(self.base_comm_nccl, send_buf, recv_buf, op)
+        else:
+            return mpi_allreduce(self.base_comm, send_buf,
+                                 recv_buf, self.engine, op)
+
+    def _allreduce_subcomm(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
+        """Allreduce operation with subcommunicator
+        """
+        if deps.nccl_enabled and getattr(self, "base_comm_nccl"):
+            return nccl_allreduce(self.sub_comm, send_buf, recv_buf, op)
+        else:
+            return mpi_allreduce(self.sub_comm, send_buf,
+                                 recv_buf, self.engine, op)

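As a reference for how the new mixin is meant to be consumed, here is a minimal sketch (not part of this commit): a host class exposes the attributes the mixin reads (base_comm, base_comm_nccl, sub_comm, engine) and then simply calls the inherited _allreduce. The LocalSum class name is purely illustrative.

# Minimal usage sketch (not from the commit): a host class provides the
# attributes DistributedMixIn expects and delegates the reduction to it.
import numpy as np
from mpi4py import MPI

from pylops_mpi.Distributed import DistributedMixIn


class LocalSum(DistributedMixIn):
    """Toy container that sums a local NumPy array across all ranks."""

    def __init__(self, local_array, base_comm=MPI.COMM_WORLD):
        self.local_array = np.asarray(local_array)
        self.base_comm = base_comm    # MPI communicator used by _allreduce
        self.base_comm_nccl = None    # no NCCL communicator in this sketch
        self.sub_comm = base_comm     # communicator used by _allreduce_subcomm
        self.engine = "numpy"         # selects the buffered MPI code path

    def global_sum(self):
        # _allreduce is inherited from DistributedMixIn and dispatches to
        # NCCL or MPI depending on base_comm_nccl
        return self._allreduce(self.local_array, op=MPI.SUM)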
pylops_mpi/DistributedArray.py

Lines changed: 9 additions & 57 deletions
@@ -3,12 +3,13 @@
 from typing import Any, List, Optional, Tuple, Union, NewType

 import numpy as np
-import os
 from mpi4py import MPI
+from pylops_mpi.Distributed import DistributedMixIn
 from pylops.utils import DTypeLike, NDArray
 from pylops.utils import deps as pylops_deps  # avoid namespace crashes with pylops_mpi.utils
 from pylops.utils._internal import _value_or_sized_to_tuple
 from pylops.utils.backend import get_array_module, get_module, get_module_name
+from pylops_mpi.utils._mpi import mpi_allreduce, mpi_send
 from pylops_mpi.utils import deps

 cupy_message = pylops_deps.cupy_import("the DistributedArray module")
@@ -22,10 +23,6 @@

 NcclCommunicatorType = NewType("NcclCommunicator", NcclCommunicator)

-if int(os.environ.get("PYLOPS_MPI_CUDA_AWARE", 0)):
-    is_cuda_aware_mpi = True
-else:
-    is_cuda_aware_mpi = False

 class Partition(Enum):
     r"""Enum class
@@ -104,7 +101,7 @@ def subcomm_split(mask, comm: Optional[Union[MPI.Comm, NcclCommunicatorType]] = MPI.COMM_WORLD):
     return sub_comm


-class DistributedArray:
+class DistributedArray(DistributedMixIn):
     r"""Distributed Numpy Arrays

     Multidimensional NumPy-like distributed arrays.
@@ -477,44 +474,6 @@ def _check_mask(self, dist_array):
         if not np.array_equal(self.mask, dist_array.mask):
             raise ValueError("Mask of both the arrays must be same")

-    def _allreduce(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
-        """Allreduce operation
-        """
-        if deps.nccl_enabled and getattr(self, "base_comm_nccl"):
-            return nccl_allreduce(self.base_comm_nccl, send_buf, recv_buf, op)
-        else:
-            if is_cuda_aware_mpi or self.engine == "numpy":
-                ncp = get_module(self.engine)
-                recv_buf = ncp.zeros(send_buf.size, dtype=send_buf.dtype)
-                self.base_comm.Allreduce(send_buf, recv_buf, op)
-                return recv_buf
-            else:
-                # CuPy with non-CUDA-aware MPI
-                if recv_buf is None:
-                    return self.base_comm.allreduce(send_buf, op)
-                # For MIN and MAX which require recv_buf
-                self.base_comm.Allreduce(send_buf, recv_buf, op)
-                return recv_buf
-
-    def _allreduce_subcomm(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
-        """Allreduce operation with subcommunicator
-        """
-        if deps.nccl_enabled and getattr(self, "base_comm_nccl"):
-            return nccl_allreduce(self.sub_comm, send_buf, recv_buf, op)
-        else:
-            if is_cuda_aware_mpi or self.engine == "numpy":
-                ncp = get_module(self.engine)
-                recv_buf = ncp.zeros(send_buf.size, dtype=send_buf.dtype)
-                self.sub_comm.Allreduce(send_buf, recv_buf, op)
-                return recv_buf
-            else:
-                # CuPy with non-CUDA-aware MPI
-                if recv_buf is None:
-                    return self.sub_comm.allreduce(send_buf, op)
-                # For MIN and MAX which require recv_buf
-                self.sub_comm.Allreduce(send_buf, recv_buf, op)
-                return recv_buf
-
     def _allgather(self, send_buf, recv_buf=None):
         """Allgather operation
         """
@@ -556,16 +515,9 @@ def _send(self, send_buf, dest, count=None, tag=0):
                 count = send_buf.size
             nccl_send(self.base_comm_nccl, send_buf, dest, count)
         else:
-            if is_cuda_aware_mpi or self.engine == "numpy":
-                # Determine MPI type based on array dtype
-                mpi_type = MPI._typedict[send_buf.dtype.char]
-                if count is None:
-                    count = send_buf.size
-                self.base_comm.Send([send_buf, count, mpi_type], dest=dest, tag=tag)
-            else:
-                # Uses CuPy without CUDA-aware MPI
-                self.base_comm.send(send_buf, dest, tag)
-
+            mpi_send(self.base_comm,
+                     send_buf, dest, count, tag=tag,
+                     engine=self.engine)

     def _recv(self, recv_buf=None, source=0, count=None, tag=0):
         """Receive operation
@@ -579,7 +531,7 @@ def _recv(self, recv_buf=None, source=0, count=None, tag=0):
             return recv_buf
         else:
             # NumPy + MPI will benefit from buffered communication regardless of MPI installation
-            if is_cuda_aware_mpi or self.engine == "numpy":
+            if deps.cuda_aware_mpi_enabled or self.engine == "numpy":
                 ncp = get_module(self.engine)
                 if recv_buf is None:
                     if count is None:
@@ -734,7 +686,7 @@ def _compute_vector_norm(self, local_array: NDArray,
             # CuPy + non-CUDA-aware MPI does not work well with buffered communication, particularly
             # with MAX, MIN operator. Here we copy the array back to CPU, transfer, and copy them back to GPUs
             send_buf = ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64)
-            if self.engine == "cupy" and self.base_comm_nccl is None and not is_cuda_aware_mpi:
+            if self.engine == "cupy" and self.base_comm_nccl is None and not deps.cuda_aware_mpi_enabled:
                 # CuPy + non-CUDA-aware MPI: This will call non-buffered communication
                 # which return a list of object - must be copied back to a GPU memory.
                 recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MAX)
@@ -750,7 +702,7 @@ def _compute_vector_norm(self, local_array: NDArray,
             # Calculate min followed by min reduction
             # See the comment above in +infinity norm
             send_buf = ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64)
-            if self.engine == "cupy" and self.base_comm_nccl is None and not is_cuda_aware_mpi:
+            if self.engine == "cupy" and self.base_comm_nccl is None and not deps.cuda_aware_mpi_enabled:
                 recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MIN)
                 recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
             else:
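The buffered-versus-object branching removed from _allreduce, _allreduce_subcomm and _send now lives in the pylops_mpi.utils._mpi helpers (mpi_allreduce, mpi_send), which are imported above but whose implementation is not part of this diff. Based on the removed code and on the new call site, an mpi_send helper could look roughly like the sketch below; treat the body as an assumption, not the actual pylops_mpi.utils._mpi code.

# Hedged sketch of an mpi_send helper, reconstructed from the logic removed
# from DistributedArray._send; names and signature are assumptions.
from mpi4py import MPI

from pylops_mpi.utils import deps


def mpi_send(base_comm, send_buf, dest, count=None, tag=0, engine="numpy"):
    """Send send_buf to rank dest, using buffered or object MPI primitives."""
    if deps.cuda_aware_mpi_enabled or engine == "numpy":
        # NumPy arrays (or CuPy arrays with CUDA-aware MPI): buffered send
        mpi_type = MPI._typedict[send_buf.dtype.char]
        if count is None:
            count = send_buf.size
        base_comm.Send([send_buf, count, mpi_type], dest=dest, tag=tag)
    else:
        # CuPy arrays without CUDA-aware MPI: object (pickle-based) send
        base_comm.send(send_buf, dest, tag)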

pylops_mpi/basicoperators/VStack.py

Lines changed: 10 additions & 6 deletions
@@ -15,6 +15,7 @@
     Partition,
     StackedDistributedArray
 )
+from pylops_mpi.Distributed import DistributedMixIn
 from pylops_mpi.utils.decorators import reshaped
 from pylops_mpi.utils import deps

@@ -25,7 +26,7 @@
     from pylops_mpi.utils._nccl import nccl_allreduce


-class MPIVStack(MPILinearOperator):
+class MPIVStack(DistributedMixIn, MPILinearOperator):
     r"""MPI VStack Operator

     Create a vertical stack of a set of linear operators using MPI. Each rank must
@@ -141,16 +142,19 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
     @reshaped(forward=False, stacking=True)
     def _rmatvec(self, x: DistributedArray) -> DistributedArray:
         ncp = get_module(x.engine)
-        y = DistributedArray(global_shape=self.shape[1], base_comm=x.base_comm, base_comm_nccl=x.base_comm_nccl, partition=Partition.BROADCAST,
+        # TODO: consider adding base_comm, base_comm_nccl, engine to the
+        # input parameters of _allreduce instead of relying on self
+        self.base_comm, self.base_comm_nccl, self.engine = \
+            x.base_comm, x.base_comm_nccl, x.engine
+        y = DistributedArray(global_shape=self.shape[1], base_comm=x.base_comm,
+                             base_comm_nccl=x.base_comm_nccl,
+                             partition=Partition.BROADCAST,
                              engine=x.engine, dtype=self.dtype)
         y1 = []
         for iop, oper in enumerate(self.ops):
             y1.append(oper.rmatvec(x.local_array[self.nnops[iop]: self.nnops[iop + 1]]))
         y1 = ncp.sum(ncp.vstack(y1), axis=0)
-        if deps.nccl_enabled and x.base_comm_nccl:
-            y[:] = nccl_allreduce(x.base_comm_nccl, y1, op=MPI.SUM)
-        else:
-            y[:] = self.base_comm.allreduce(y1, op=MPI.SUM)
+        y[:] = self._allreduce(y1, op=MPI.SUM)
         return y

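The TODO left in _rmatvec suggests passing the communicators and engine to _allreduce explicitly instead of copying them onto the operator. A hypothetical variant of the mixin method along those lines (not part of this commit) could be:

# Hypothetical reworking suggested by the TODO above: _allreduce accepts the
# communicators and engine as arguments, falling back to the attributes set by
# the host class when they are not provided. A sketch, not committed code.
from mpi4py import MPI

from pylops_mpi.utils._mpi import mpi_allreduce
from pylops_mpi.utils import deps


def _allreduce(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM,
               base_comm=None, base_comm_nccl=None, engine=None):
    """Allreduce operation (intended as a method of DistributedMixIn)."""
    base_comm = self.base_comm if base_comm is None else base_comm
    base_comm_nccl = self.base_comm_nccl if base_comm_nccl is None else base_comm_nccl
    engine = self.engine if engine is None else engine
    if deps.nccl_enabled and base_comm_nccl is not None:
        # guarded import, mirroring the conditional import in Distributed.py
        from pylops_mpi.utils._nccl import nccl_allreduce
        return nccl_allreduce(base_comm_nccl, send_buf, recv_buf, op)
    return mpi_allreduce(base_comm, send_buf, recv_buf, engine, op)


# MPIVStack._rmatvec would then call
#     y[:] = self._allreduce(y1, op=MPI.SUM, base_comm=x.base_comm,
#                            base_comm_nccl=x.base_comm_nccl, engine=x.engine)
# and the self.base_comm / self.base_comm_nccl / self.engine assignments
# would no longer be needed.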
pylops_mpi/utils/deps.py

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,10 @@ def nccl_import(message: Optional[str] = None) -> str:
     return nccl_message


+cuda_aware_mpi_enabled: bool = (
+    True if int(os.getenv("PYLOPS_MPI_CUDA_AWARE", 1)) == 1 else False
+)
+
 nccl_enabled: bool = (
     True if (nccl_import() is None and int(os.getenv("NCCL_PYLOPS_MPI", 1)) == 1) else False
 )