
Commit 6d0895e

committed
change protected import pattern (backend.py -> _nccl.py) and docs style from PR suggestions
1 parent f1238cb commit 6d0895e
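
For context, the "protected import" referred to in the commit message is the dependency-guarded import introduced in pylops_mpi/DistributedArray.py below: the NCCL helpers are only imported when both CuPy and NCCL are importable, and otherwise the communicator type degrades to a placeholder so type hints still resolve. A condensed sketch of that pattern (taken from the hunk below, not additional code in this commit):

from typing import Any

from pylops.utils import deps
from pylops_mpi.utils import deps as pylops_mpi_deps

cupy_message = deps.cupy_import("the DistributedArray module")
nccl_message = pylops_mpi_deps.nccl_import("the DistributedArray module")

if nccl_message is None and cupy_message is None:
    # CuPy and NCCL available: import the real NCCL helpers and communicator type
    from pylops_mpi.utils._nccl import nccl_allreduce  # plus the other nccl_* helpers
    from cupy.cuda.nccl import NcclCommunicator
else:
    # CPU-only install: keep annotations valid without importing cupy
    NcclCommunicator = Any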

7 files changed: +322, -308 lines

Makefile

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ PIP := $(shell command -v pip3 2> /dev/null || command which pip 2> /dev/null)
 PYTHON := $(shell command -v python3 2> /dev/null || command which python 2> /dev/null)
 NUM_PROCESSES = 3
 
-.PHONY: install dev-install install_conda dev-install_conda tests tests_nccl doc docupdate run_examples run_tutorials
+.PHONY: install dev-install install_conda dev-install_conda tests doc docupdate run_examples run_tutorials
 
 pipcheck:
 ifndef PIP

pylops_mpi/DistributedArray.py

Lines changed: 21 additions & 12 deletions
@@ -1,13 +1,22 @@
 from enum import Enum
 from numbers import Integral
-from typing import List, Optional, Tuple, Union
+from typing import Any, List, Optional, Tuple, Union
 
 import numpy as np
 from mpi4py import MPI
-from pylops.utils import DTypeLike, NDArray
+from pylops.utils import DTypeLike, NDArray, deps
 from pylops.utils._internal import _value_or_sized_to_tuple
 from pylops.utils.backend import get_array_module, get_module, get_module_name
-from pylops_mpi.utils.backend import nccl_split, nccl_allgather, nccl_allreduce, nccl_bcast, nccl_asarray
+from pylops_mpi.utils import deps as pylops_mpi_deps
+
+cupy_message = deps.cupy_import("the DistributedArray module")
+nccl_message = pylops_mpi_deps.nccl_import("the DistributedArray module")
+
+if nccl_message is None and cupy_message is None:
+    from pylops_mpi.utils._nccl import nccl_allgather, nccl_allreduce, nccl_asarray, nccl_bcast, nccl_split
+    from cupy.cuda.nccl import NcclCommunicator
+else:
+    NcclCommunicator = Any
 
 
 class Partition(Enum):
@@ -62,9 +71,10 @@ def subcomm_split(mask, base_comm: MPI.Comm = MPI.COMM_WORLD):
 
 def subcomm_split(mask, base_comm: MPI.Comm = MPI.COMM_WORLD):
     """Create new communicators based on mask
+
     This method creates new NCCL communicators based on ``mask``.
-    Contrary to MPI, NCCL does not provide support for splitting of a communicator in multiple subcommunicators;
-    this is therefore handled explicitly by this method.
+    Contrary to MPI, NCCL does not provide support for splitting of a communicator
+    in multiple subcommunicators; this is therefore handled explicitly by this method.
 
     Parameters
     ----------
@@ -78,7 +88,8 @@ def subcomm_split(mask, base_comm: MPI.Comm = MPI.COMM_WORLD):
 
     Returns:
     -------
-    Union[mpi4py.MPI.Comm, cupy.cuda.nccl.NcclCommunicator]]: a subcommunicator according to mask
+    sub_comm : :obj:`mpi4py.MPI.Comm` or :obj:`cupy.cuda.nccl.NcclCommunicator`
+        Subcommunicator according to mask
     """
     if isinstance(base_comm, MPI.Comm):
         comm = MPI.COMM_WORLD
@@ -128,9 +139,8 @@ class DistributedArray:
         Type of elements in input array. Defaults to ``numpy.float64``.
     """
 
-    # TODO: Type Annotation for base_comm without NCCL import
     def __init__(self, global_shape: Union[Tuple, Integral],
-                 base_comm=MPI.COMM_WORLD,
+                 base_comm: Optional[Union[MPI.Comm, NcclCommunicator]] = MPI.COMM_WORLD,
                  partition: Partition = Partition.SCATTER, axis: int = 0,
                  local_shapes: Optional[List[Union[Tuple, Integral]]] = None,
                  mask: Optional[List[Integral]] = None,
@@ -320,11 +330,11 @@ def local_shapes(self):
         if self.base_comm is MPI.COMM_WORLD:
             return self._allgather(self.local_shape)
         else:
-            # NCCL allgather returns the 1-Dimensional array
-            # of shapes from every rank
-            tuple_len = len(self.local_shape)
+            # gather tuple of shapes from every rank and copy from GPU to CPU
             all_tuples = self._allgather(self.local_shape).get()
+            # NCCL returns the flat array that packs every tuple as 1-dimensional array
             # unpack each tuple from each rank
+            tuple_len = len(self.local_shape)
             return [tuple(all_tuples[i : i + tuple_len]) for i in range(0, len(all_tuples), tuple_len)]
 
     @property
@@ -455,7 +465,6 @@ def _allreduce(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
         return nccl_allreduce(self.base_comm, send_buf, recv_buf, op)
 
     def _allreduce_subcomm(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
-
         """Allreduce operation with subcommunicator
         """
 
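A note on the local_shapes hunk above: with an NCCL base_comm the allgather returns one flat GPU array that packs every rank's shape tuple back to back, so the property copies it to host and slices it in chunks of tuple_len. A minimal, CPU-only sketch of just that unpacking step (a plain Python list stands in for the gathered buffer; not part of the commit):

# pretend three ranks each reported a 2-D local shape; allgather packs them flat
all_tuples = [10, 5, 10, 5, 8, 5]      # ranks reported (10, 5), (10, 5), (8, 5)
tuple_len = 2                          # len(self.local_shape) on every rank

local_shapes = [tuple(all_tuples[i: i + tuple_len])
                for i in range(0, len(all_tuples), tuple_len)]
print(local_shapes)                    # [(10, 5), (10, 5), (8, 5)]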

pylops_mpi/utils/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -1,6 +1,5 @@
 # isort: skip_file
 
 # currently dottest create circular dependency with DistributedArray.py
-# from .dottest import *
+# from .dottest import *
 from .deps import *
-from .backend import *

pylops_mpi/utils/_nccl.py

Lines changed: 288 additions & 0 deletions
@@ -0,0 +1,288 @@
+__all__ = [
+    "initialize_nccl_comm",
+    "nccl_split",
+    "nccl_allgather",
+    "nccl_allreduce",
+    "nccl_bcast",
+    "nccl_asarray"
+]
+
+from enum import IntEnum
+from mpi4py import MPI
+import os
+import numpy as np
+import cupy as cp
+import cupy.cuda.nccl as nccl
+
+cupy_to_nccl_dtype = {
+    "float32": nccl.NCCL_FLOAT32,
+    "float64": nccl.NCCL_FLOAT64,
+    "int32": nccl.NCCL_INT32,
+    "int64": nccl.NCCL_INT64,
+    "uint8": nccl.NCCL_UINT8,
+    "int8": nccl.NCCL_INT8,
+    "uint32": nccl.NCCL_UINT32,
+    "uint64": nccl.NCCL_UINT64,
+}
+
+
+class NcclOp(IntEnum):
+    SUM = nccl.NCCL_SUM
+    PROD = nccl.NCCL_PROD
+    MAX = nccl.NCCL_MAX
+    MIN = nccl.NCCL_MIN
+
+
+def mpi_op_to_nccl(mpi_op) -> NcclOp:
+    """ Map MPI reduction operation to NCCL equivalent
+
+    Parameters
+    ----------
+    mpi_op : :obj:`MPI.Op`
+        A MPI reduction operation (e.g., MPI.SUM, MPI.PROD, MPI.MAX, MPI.MIN).
+
+    Returns:
+    -------
+    NcclOp : :obj:`IntEnum`
+        A corresponding NCCL reduction operation.
+    """
+    if mpi_op is MPI.SUM:
+        return NcclOp.SUM
+    elif mpi_op is MPI.PROD:
+        return NcclOp.PROD
+    elif mpi_op is MPI.MAX:
+        return NcclOp.MAX
+    elif mpi_op is MPI.MIN:
+        return NcclOp.MIN
+    else:
+        raise ValueError(f"Unsupported MPI.Op for NCCL: {mpi_op}")
+
+
+def initialize_nccl_comm() -> nccl.NcclCommunicator:
+    """ Initialize NCCL world communicator for every GPU device
+
+    Each GPU must be managed by exactly one MPI process.
+    i.e. the number of MPI process launched must be equal to
+    number of GPUs in communications
+
+    Returns:
+    -------
+    nccl_comm : :obj:`cupy.cuda.nccl.NcclCommunicator`
+        A corresponding NCCL communicator
+    """
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % cp.cuda.runtime.getDeviceCount()
+    )
+    cp.cuda.Device(device_id).use()
+
+    if rank == 0:
+        with cp.cuda.Device(device_id):
+            nccl_id_bytes = nccl.get_unique_id()
+    else:
+        nccl_id_bytes = None
+    nccl_id_bytes = comm.bcast(nccl_id_bytes, root=0)
+
+    nccl_comm = nccl.NcclCommunicator(size, nccl_id_bytes, rank)
+    return nccl_comm
+
+
+def nccl_split(mask) -> nccl.NcclCommunicator:
+    """ NCCL-equivalent of MPI.Split()
+
+    Splitting the communicator into multiple NCCL subcommunicators
+
+    Parameters
+    ----------
+    mask : :obj:`list`
+        Mask defining subsets of ranks to consider when performing 'global'
+        operations on the distributed array such as dot product or norm.
+
+    Returns:
+    -------
+    sub_comm : :obj:`cupy.cuda.nccl.NcclCommunicator`
+        Subcommunicator according to mask
+    """
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    sub_comm = comm.Split(color=mask[rank], key=rank)
+
+    sub_rank = sub_comm.Get_rank()
+    sub_size = sub_comm.Get_size()
+
+    if sub_rank == 0:
+        nccl_id_bytes = nccl.get_unique_id()
+    else:
+        nccl_id_bytes = None
+    nccl_id_bytes = sub_comm.bcast(nccl_id_bytes, root=0)
+    sub_comm = nccl.NcclCommunicator(sub_size, nccl_id_bytes, sub_rank)
+
+    return sub_comm
+
+
+def nccl_allgather(nccl_comm, send_buf, recv_buf=None) -> cp.ndarray:
+    """ NCCL equivalent of MPI_Allgather. Gathers data from all GPUs
+    and distributes the concatenated result to all participants.
+
+    Parameters
+    ----------
+    nccl_comm : :obj:`cupy.cuda.nccl.NcclCommunicator`
+        The NCCL communicator over which data will be gathered.
+    send_buf : :obj:`cupy.ndarray` or array-like
+        The data buffer from the local GPU to be sent.
+    recv_buf : :obj:`cupy.ndarray`, optional
+        The buffer to receive data from all GPUs. If None, a new
+        buffer will be allocated with the appropriate shape.
+
+    Returns
+    -------
+    recv_buf : :obj:`cupy.ndarray`
+        A buffer containing the gathered data from all GPUs.
+    """
+    send_buf = (
+        send_buf if isinstance(send_buf, cp.ndarray) else cp.asarray(send_buf)
+    )
+    if recv_buf is None:
+        recv_buf = cp.zeros(
+            MPI.COMM_WORLD.Get_size() * send_buf.size,
+            dtype=send_buf.dtype,
+        )
+    nccl_comm.allGather(
+        send_buf.data.ptr,
+        recv_buf.data.ptr,
+        send_buf.size,
+        cupy_to_nccl_dtype[str(send_buf.dtype)],
+        cp.cuda.Stream.null.ptr,
+    )
+    return recv_buf
+
+
+def nccl_allreduce(nccl_comm, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM) -> cp.ndarray:
+    """ NCCL equivalent of MPI_Allreduce. Applies a reduction operation
+    (e.g., sum, max) across all GPUs and distributes the result.
+
+    Parameters
+    ----------
+    nccl_comm : :obj:`cupy.cuda.nccl.NcclCommunicator`
+        The NCCL communicator used for collective communication.
+    send_buf : :obj:`cupy.ndarray` or array-like
+        The data buffer from the local GPU to be reduced.
+    recv_buf : :obj:`cupy.ndarray`, optional
+        The buffer to store the result of the reduction. If None,
+        a new buffer will be allocated with the appropriate shape.
+    op : :obj:mpi4py.MPI.Op, optional
+        The reduction operation to apply. Defaults to MPI.SUM.
+
+    Returns
+    -------
+    recv_buf : :obj:`cupy.ndarray`
+        A buffer containing the result of the reduction, broadcasted
+        to all GPUs.
+    """
+    send_buf = (
+        send_buf if isinstance(send_buf, cp.ndarray) else cp.asarray(send_buf)
+    )
+    if recv_buf is None:
+        recv_buf = cp.zeros(send_buf.size, dtype=send_buf.dtype)
+
+    nccl_comm.allReduce(
+        send_buf.data.ptr,
+        recv_buf.data.ptr,
+        send_buf.size,
+        cupy_to_nccl_dtype[str(send_buf.dtype)],
+        mpi_op_to_nccl(op),
+        cp.cuda.Stream.null.ptr,
+    )
+    return recv_buf
+
+
+def nccl_bcast(nccl_comm, local_array, index, value) -> None:
+    """ NCCL equivalent of MPI_Bcast. Broadcasts a single value at the given index
+    from the root GPU (rank 0) to all other GPUs.
+
+    Parameters
+    ----------
+    nccl_comm : :obj:`cupy.cuda.nccl.NcclCommunicator`
+        The NCCL communicator used for collective communication.
+    local_array : :obj:`cupy.ndarray`
+        The local array on each GPU. The value at `index` will be broadcasted.
+    index : :obj:`int`
+        The index in the array to be broadcasted.
+    value : :obj:`scalar`
+        The value to broadcast (only used by the root GPU, rank 0).
+
+    Returns
+    -------
+    None
+    """
+    if nccl_comm.rank_id() == 0:
+        local_array[index] = value
+    nccl_comm.bcast(
+        local_array[index].data.ptr,
+        local_array[index].size,
+        cupy_to_nccl_dtype[str(local_array[index].dtype)],
+        0,
+        cp.cuda.Stream.null.ptr,
+    )
+
+
+def nccl_asarray(nccl_comm, local_array, local_shapes, axis) -> cp.ndarray:
+    """Global view of the array
+
+    Gather all local GPU arrays into a single global array via NCCL all-gather.
+
+    Parameters
+    ----------
+    nccl_comm : :obj:`cupy.cuda.nccl.NcclCommunicator`
+        The NCCL communicator used for collective communication.
+    local_array : :obj:`cupy.ndarray`
+        The local array on the current GPU.
+    local_shapes : :obj:`list`
+        A list of shapes for each GPU local array (used to trim padding).
+    axis : :obj:`int`
+        The axis along which to concatenate the gathered arrays.
+
+    Returns
+    -------
+    final_array : :obj:`cupy.ndarray`
+        Global array gathered from all GPUs and concatenated along `axis`.
+
+    Notes
+    -----
+    NCCL's allGather requires the sending buffer to have the same size for every device.
+    Therefore, the padding is required when the array is not evenly partitioned across
+    all the ranks. The padding is applied such that the sending buffer has the size of
+    each dimension corresponding to the max possible size of that dimension.
+    """
+    sizes_each_dim = list(zip(*local_shapes))
+
+    send_shape = tuple(map(max, sizes_each_dim))
+    pad_size = [
+        (0, s_shape - l_shape) for s_shape, l_shape in zip(send_shape, local_array.shape)
+    ]
+
+    send_buf = cp.pad(
+        local_array, pad_size, mode="constant", constant_values=0
+    )
+
+    # NCCL recommends to use one MPI Process per GPU and so size of receiving buffer can be inferred
+    ndev = len(local_shapes)
+    recv_buf = cp.zeros(ndev * send_buf.size, dtype=send_buf.dtype)
+    nccl_allgather(nccl_comm, send_buf, recv_buf)
+
+    # extract an individual array from each device
+    chunk_size = np.prod(send_shape)
+    chunks = [
+        recv_buf[i * chunk_size:(i + 1) * chunk_size] for i in range(ndev)
+    ]
+
+    # Remove padding from each array: the padded value may appear somewhere
+    # in the middle of the flat array and thus the reshape and slicing for each dimension is required
+    for i in range(ndev):
+        slicing = tuple(slice(0, end) for end in local_shapes[i])
+        chunks[i] = chunks[i].reshape(send_shape)[slicing]
+    # combine back to single global array
+    return cp.concatenate(chunks, axis=axis)
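
Rounding off the new module, a rough usage sketch of the helpers defined above (assuming one MPI process per GPU, e.g. launched with mpirun -n <ngpus>, and a CUDA-enabled CuPy build; this example is not part of the commit):

import cupy as cp
from mpi4py import MPI
from pylops_mpi.utils._nccl import (
    initialize_nccl_comm,
    nccl_allreduce,
    nccl_asarray,
)

# one NCCL communicator per MPI process/GPU
nccl_comm = initialize_nccl_comm()
rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()

# sum a small buffer across all GPUs (the reduction defaults to MPI.SUM)
local = cp.arange(4, dtype=cp.float32) + rank
total = nccl_allreduce(nccl_comm, local)

# gather unevenly sized local arrays into one global array along axis 0;
# nccl_asarray pads each buffer to the largest local shape before the all-gather
local_array = cp.full((rank + 1, 3), rank, dtype=cp.float32)
local_shapes = [(r + 1, 3) for r in range(size)]
global_array = nccl_asarray(nccl_comm, local_array, local_shapes, axis=0)
print(rank, total, global_array.shape)  # global_array has size*(size+1)//2 rows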
