1 | 1 | __all__ = [ |
2 | 2 | "mpi_allgather", |
3 | 3 | "mpi_allreduce", |
4 | | - # "mpi_bcast", |
| 4 | + "mpi_bcast", |
5 | 5 | # "mpi_asarray", |
6 | 6 | "mpi_send", |
7 | 7 | "mpi_recv", |
8 | | - "_prepare_allgather_inputs", |
9 | | - "_unroll_allgather_recv" |
10 | 8 | ] |
11 | 9 | |
12 | 10 | from typing import Optional |
13 | 11 | |
14 | 12 | import numpy as np |
15 | 13 | from mpi4py import MPI |
16 | 14 | from pylops.utils.backend import get_module |
17 | 15 | from pylops_mpi.utils import deps |
| 16 | +from pylops_mpi.utils._common import _prepare_allgather_inputs, _unroll_allgather_recv |
18 | 17 | |
19 | 18 | |
20 | | -# TODO: return type annotation for both cupy and numpy |
21 | | -def _prepare_allgather_inputs(send_buf, send_buf_shapes, engine): |
22 | | - r""" Prepare send_buf and recv_buf for NCCL allgather (nccl_allgather) |
23 | | - |
24 | | - Buffered Allgather (MPI and NCCL) requires the sending buffer to have the same size for every device. |
25 | | - Therefore, padding is required when the array is not evenly partitioned across |
26 | | - all the ranks. The padding is applied such that the each dimension of the sending buffers |
27 | | - is equal to the max size of that dimension across all ranks. |
28 | | - |
29 | | - Similarly, each receiver buffer (recv_buf) is created with size equal to :math:n_rank \cdot send_buf.size |
30 | | - |
31 | | - Parameters |
32 | | - ---------- |
33 | | - send_buf : :obj: `numpy.ndarray` or `cupy.ndarray` or array-like |
34 | | - The data buffer from the local GPU to be sent for allgather. |
35 | | - send_buf_shapes: :obj:`list` |
36 | | - A list of shapes for each GPU send_buf (used to calculate padding size) |
37 | | - engine : :obj:`str` |
38 | | - Engine used to store array (``numpy`` or ``cupy``) |
39 | | - |
40 | | - Returns |
41 | | - ------- |
42 | | - send_buf: :obj:`cupy.ndarray` |
43 | | - A buffer containing the data and padded elements to be sent by this rank. |
44 | | - recv_buf : :obj:`cupy.ndarray` |
45 | | - An empty, padded buffer to gather data from all GPUs. |
46 | | - """ |
47 | | - ncp = get_module(engine) |
48 | | - sizes_each_dim = list(zip(*send_buf_shapes)) |
49 | | - send_shape = tuple(map(max, sizes_each_dim)) |
50 | | - pad_size = [ |
51 | | - (0, s_shape - l_shape) for s_shape, l_shape in zip(send_shape, send_buf.shape) |
52 | | - ] |
53 | | - |
54 | | - send_buf = ncp.pad( |
55 | | - send_buf, pad_size, mode="constant", constant_values=0 |
56 | | - ) |
57 | | - |
58 | | - ndev = len(send_buf_shapes) |
59 | | - recv_buf = ncp.zeros(ndev * send_buf.size, dtype=send_buf.dtype) |
60 | | - |
61 | | - return send_buf, recv_buf |
62 | | - |
63 | | - |
64 | | -def _unroll_allgather_recv(recv_buf, padded_send_buf_shape, send_buf_shapes) -> list: |
65 | | - r"""Unrolll recv_buf after Buffered Allgather (MPI and NCCL) |
66 | | - |
67 | | - Remove the padded elements in recv_buff, extract an individual array from each device and return them as a list of arrays |
68 | | - Each GPU may send array with a different shape, so the return type has to be a list of array |
69 | | - instead of the concatenated array. |
70 | | - |
71 | | - Parameters |
72 | | - ---------- |
73 | | - recv_buf: :obj:`cupy.ndarray` or array-like |
74 | | - The data buffer returned from nccl_allgather call |
75 | | - padded_send_buf_shape: :obj:`tuple`:int |
76 | | - The size of send_buf after padding used in nccl_allgather |
77 | | - send_buf_shapes: :obj:`list` |
78 | | - A list of original shapes for each GPU send_buf prior to padding |
79 | | - |
80 | | - Returns |
81 | | - ------- |
82 | | - chunks: :obj:`list` |
83 | | - A list of `cupy.ndarray` from each GPU with the padded element removed |
84 | | - """ |
85 | | - ndev = len(send_buf_shapes) |
86 | | - # extract an individual array from each device |
87 | | - chunk_size = np.prod(padded_send_buf_shape) |
88 | | - chunks = [ |
89 | | - recv_buf[i * chunk_size:(i + 1) * chunk_size] for i in range(ndev) |
90 | | - ] |
| 19 | +def mpi_allgather(base_comm: MPI.Comm, |
| 20 | + send_buf, recv_buf=None, |
| 21 | + engine: Optional[str] = "numpy") -> np.ndarray: |
91 | 22 | |
92 | | - # Remove padding from each array: the padded value may appear somewhere |
93 | | - # in the middle of the flat array and thus the reshape and slicing for each dimension is required |
94 | | - for i in range(ndev): |
95 | | - slicing = tuple(slice(0, end) for end in send_buf_shapes[i]) |
96 | | - chunks[i] = chunks[i].reshape(padded_send_buf_shape)[slicing] |
| 23 | + if deps.cuda_aware_mpi_enabled or engine == "numpy": |
| 24 | + send_shapes = base_comm.allgather(send_buf.shape) |
| 25 | + (padded_send, padded_recv) = _prepare_allgather_inputs(send_buf, send_shapes, engine=engine) |
| 26 | + recv_buffer_to_use = recv_buf if recv_buf is not None else padded_recv |
| 27 | + base_comm.Allgather(padded_send, recv_buffer_to_use) |
| 28 | + return _unroll_allgather_recv(recv_buffer_to_use, padded_send.shape, send_shapes) |
97 | 29 | |
98 | | - return chunks |
| 30 | + else: |
| 31 | + # CuPy with non-CUDA-aware MPI |
| 32 | + if recv_buf is None: |
| 33 | + return base_comm.allgather(send_buf) |
| 34 | + base_comm.Allgather(send_buf, recv_buf) |
| 35 | + return recv_buf |
99 | 36 | |
100 | 37 | |
101 | 38 | def mpi_allreduce(base_comm: MPI.Comm, |
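A minimal, NumPy-only sketch of the pad, Allgather, unroll round trip that `mpi_allgather` delegates to `_prepare_allgather_inputs` and `_unroll_allgather_recv` (now imported from `pylops_mpi.utils._common`). The two-rank chunk sizes are invented for illustration, and the collective itself is emulated with a plain concatenation so the snippet runs without MPI.

```python
import numpy as np

# Emulate two ranks holding unevenly sized chunks (shapes made up for the example).
local_chunks = [np.arange(3), np.arange(3, 8)]
shapes = [c.shape for c in local_chunks]   # what base_comm.allgather(send_buf.shape) would return

# _prepare_allgather_inputs: pad every send buffer up to the per-dimension maximum shape.
send_shape = tuple(map(max, zip(*shapes)))
padded = [np.pad(c, [(0, s - l) for s, l in zip(send_shape, c.shape)]) for c in local_chunks]

# A buffered Allgather hands every rank one flat array of ndev * prod(send_shape) elements.
recv_buf = np.concatenate([p.ravel() for p in padded])

# _unroll_allgather_recv: split per rank, reshape to the padded shape, slice the padding off.
chunk_size = int(np.prod(send_shape))
chunks = [recv_buf[i * chunk_size:(i + 1) * chunk_size].reshape(send_shape)[tuple(slice(0, n) for n in shapes[i])]
          for i in range(len(shapes))]

assert [c.tolist() for c in chunks] == [[0, 1, 2], [3, 4, 5, 6, 7]]
```

The `deps.cuda_aware_mpi_enabled or engine == "numpy"` guard keeps this buffered path for NumPy arrays and CUDA-aware MPI builds; CuPy arrays on a non-CUDA-aware MPI fall back to the pickle-based lowercase `allgather` instead.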
@@ -142,23 +79,16 @@ def mpi_allreduce(base_comm: MPI.Comm, |
142 | 79 | return recv_buf |
143 | 80 | |
144 | 81 | |
145 | | -def mpi_allgather(base_comm: MPI.Comm, |
146 | | - send_buf, recv_buf=None, |
147 | | - engine: Optional[str] = "numpy") -> np.ndarray: |
148 | | - |
| 82 | +def mpi_bcast(base_comm: MPI.Comm, |
| 83 | + rank, local_array, index, value, |
| 84 | + engine: Optional[str] = "numpy") -> np.ndarray: |
149 | 85 | if deps.cuda_aware_mpi_enabled or engine == "numpy": |
150 | | - send_shapes = base_comm.allgather(send_buf.shape) |
151 | | - (padded_send, padded_recv) = _prepare_allgather_inputs(send_buf, send_shapes, engine=engine) |
152 | | - recv_buffer_to_use = recv_buf if recv_buf else padded_recv |
153 | | - base_comm.Allgather(padded_send, recv_buffer_to_use) |
154 | | - return _unroll_allgather_recv(recv_buffer_to_use, padded_send.shape, send_shapes) |
155 | | - |
| 86 | + if rank == 0: |
| 87 | + local_array[index] = value |
| 88 | + base_comm.Bcast(local_array[index]) |
156 | 89 | else: |
157 | 90 | # CuPy with non-CUDA-aware MPI |
158 | | - if recv_buf is None: |
159 | | - return base_comm.allgather(send_buf) |
160 | | - base_comm.Allgather(send_buf, recv_buf) |
161 | | - return recv_buf |
| 91 | + local_array[index] = base_comm.bcast(value) |
162 | 92 | |
163 | 93 | |
164 | 94 | def mpi_send(base_comm: MPI.Comm, |
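Finally, a hedged sketch of how the newly exported `mpi_bcast` is meant to be driven: rank 0 writes `value` into `local_array[index]` and the buffered `Bcast` propagates that slice to every rank, while CuPy arrays on a non-CUDA-aware MPI go through the pickle-based `bcast` instead. The launch command, the module path, and the array contents below are assumptions for illustration rather than something this diff pins down; note that `index` should select a writable view (a basic slice) so the in-place receive lands in `local_array`.

```python
# Hypothetical driver, e.g. `mpirun -n 2 python demo_bcast.py`
import numpy as np
from mpi4py import MPI
from pylops_mpi.utils._mpi import mpi_bcast   # assumed location of the helper added in this diff

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Every rank holds a replicated copy; rank 0 decides the new values for indices 1:3.
local_array = np.zeros(4)
mpi_bcast(comm, rank, local_array, slice(1, 3), np.array([7.0, 9.0]), engine="numpy")

print(rank, local_array)   # every rank prints [0. 7. 9. 0.]
```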