| 1 | +r""" |
| 2 | +Distributed Matrix Multiplication - SUMMA |
| 3 | +========================================= |
| 4 | +This example shows how to use the :py:class:`pylops_mpi.basicoperators.MPIMatrixMult` |
| 5 | +operator with ``kind='summa'`` to perform matrix-matrix multiplication between |
| 6 | +a matrix :math:`\mathbf{A}` distributed in 2D blocks across a square process |
| 7 | +grid and matrices :math:`\mathbf{X}` and :math:`\mathbf{Y}` distributed in 2D |
| 8 | +blocks across the same grid. Similarly, the adjoint operation can be performed |
| 9 | +with a matrix :math:`\mathbf{Y}` distributed in the same fashion as matrix |
| 10 | +:math:`\mathbf{X}`. |
| 11 | +
|
| 12 | +Note that whilst the different blocks of matrix :math:`\mathbf{A}` are directly |
| 13 | +stored in the operator on different ranks, the matrices :math:`\mathbf{X}` and |
| 14 | +:math:`\mathbf{Y}` are effectively represented by 1-D :py:class:`pylops_mpi.DistributedArray` |
| 15 | +objects where the different blocks are flattened and stored on different ranks. |
| 16 | +Note that to optimize communications, the ranks are organized in a square grid and |
| 17 | +blocks of :math:`\mathbf{A}` and :math:`\mathbf{X}` are systematically broadcast |
| 18 | +across different ranks during computation - see below for details. |
| 19 | +""" |

import math
import numpy as np
from mpi4py import MPI
from matplotlib import pyplot as plt

import pylops_mpi
from pylops import Conj
from pylops_mpi.basicoperators.MatrixMult import \
    local_block_split, MPIMatrixMult, active_grid_comm

plt.close("all")

###############################################################################
# We set the seed such that all processes create input matrices filled with
# the same random numbers. In practical applications, such matrices will be
# filled with data appropriate to the use case.
np.random.seed(42)

###############################################################################
# We are now ready to create the input matrices for our distributed matrix
# multiplication example. We need to set up:
#
# - Matrix :math:`\mathbf{A}` of size :math:`N \times K` (the left operand)
# - Matrix :math:`\mathbf{X}` of size :math:`K \times M` (the right operand)
# - The result will be :math:`\mathbf{Y} = \mathbf{A} \mathbf{X}` of size
#   :math:`N \times M`
#
# Here we create global test matrices with sequential values for easy
# verification:
#
# - Matrix A: Each element :math:`A_{i,j} = i \cdot K + j` (row-major ordering)
# - Matrix X: Each element :math:`X_{i,j} = i \cdot M + j`

N, M, K = 6, 6, 6
A_shape, x_shape, y_shape = (N, K), (K, M), (N, M)

A_data = np.arange(int(A_shape[0] * A_shape[1])).reshape(A_shape)
x_data = np.arange(int(x_shape[0] * x_shape[1])).reshape(x_shape)

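###############################################################################
# Before distributing anything, it is worth recalling the block formulation
# that SUMMA parallelizes: the :math:`(i,j)` block of
# :math:`\mathbf{Y} = \mathbf{A}\mathbf{X}` is the sum over :math:`k` of the
# product of the :math:`(i,k)` block of :math:`\mathbf{A}` with the
# :math:`(k,j)` block of :math:`\mathbf{X}`. The following purely local NumPy
# sketch (not part of the distributed workflow; the block size ``nb`` is chosen
# only for illustration) verifies this identity on the global test matrices.
nb = 3  # illustrative block size, giving a 2 x 2 block partition of the 6 x 6 matrices
Y_blocked = np.zeros((N, M))
for i in range(0, N, nb):
    for j in range(0, M, nb):
        for k in range(0, K, nb):
            # accumulate the (i, j) block as a sum of block-block products
            Y_blocked[i:i + nb, j:j + nb] += \
                A_data[i:i + nb, k:k + nb] @ x_data[k:k + nb, j:j + nb]
assert np.allclose(Y_blocked, A_data @ x_data)
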
###############################################################################
# For distributed computation, we arrange the processes in a square grid of
# size :math:`P' \times P'` where :math:`P' = \lfloor \sqrt{P} \rfloor` and
# :math:`P` is the total number of MPI processes. Each process will own a block
# of each matrix according to this 2D grid layout.

base_comm = MPI.COMM_WORLD
comm, rank, row_id, col_id, is_active = active_grid_comm(base_comm, N, M)
print(f"Process {base_comm.Get_rank()} is {'active' if is_active else 'inactive'}")
if not is_active:
    # ranks that do not fit in the square grid take no further part
    exit(0)

p_prime = math.isqrt(comm.Get_size())
print(f"Process grid: {p_prime} x {p_prime} = {comm.Get_size()} processes")

if rank == 0:
    print(f"Global matrix A shape: {A_shape} (N={A_shape[0]}, K={A_shape[1]})")
    print(f"Global matrix X shape: {x_shape} (K={x_shape[0]}, M={x_shape[1]})")
    print(f"Expected global result Y shape: ({A_shape[0]}, {x_shape[1]}) = (N, M)")

###############################################################################
# Next we must determine which block of each matrix each process should own.
#
# The 2D block distribution requires:
#
# - Process at grid position :math:`(i,j)` gets block
#   :math:`\mathbf{A}[i_{start}:i_{end}, j_{start}:j_{end}]`
# - Block sizes are approximately :math:`\lceil N/P' \rceil \times \lceil K/P' \rceil`,
#   with edge processes handling the remainder
#
# .. raw:: html
#
#    <div style="text-align: left; font-family: monospace; white-space: pre;">
#    <b>Example: 2x2 Process Grid with 6x6 Matrices</b>
#
#    Matrix A (6x6):              Matrix X (6x6):
#    ┌───────────┬───────────┐    ┌───────────┬───────────┐
#    │  0  1  2  │  3  4  5  │    │  0  1  2  │  3  4  5  │
#    │  6  7  8  │  9 10 11  │    │  6  7  8  │  9 10 11  │
#    │ 12 13 14  │ 15 16 17  │    │ 12 13 14  │ 15 16 17  │
#    ├───────────┼───────────┤    ├───────────┼───────────┤
#    │ 18 19 20  │ 21 22 23  │    │ 18 19 20  │ 21 22 23  │
#    │ 24 25 26  │ 27 28 29  │    │ 24 25 26  │ 27 28 29  │
#    │ 30 31 32  │ 33 34 35  │    │ 30 31 32  │ 33 34 35  │
#    └───────────┴───────────┘    └───────────┴───────────┘
#
#    Process (0,0): A[0:3, 0:3], X[0:3, 0:3]
#    Process (0,1): A[0:3, 3:6], X[0:3, 3:6]
#    Process (1,0): A[3:6, 0:3], X[3:6, 0:3]
#    Process (1,1): A[3:6, 3:6], X[3:6, 3:6]
#    </div>
#

A_slice = local_block_split(A_shape, rank, comm)
x_slice = local_block_split(x_shape, rank, comm)

###############################################################################
# Extract the local portion of each matrix for this process
A_local = A_data[A_slice]
x_local = x_data[x_slice]

print(f"Process {rank}: A_local shape {A_local.shape}, X_local shape {x_local.shape}")
print(f"Process {rank}: A slice {A_slice}, X slice {x_slice}")

###############################################################################
# We are now ready to create the SUMMA :py:class:`pylops_mpi.basicoperators.MPIMatrixMult`
# operator and the input matrix :math:`\mathbf{X}`.

Aop = MPIMatrixMult(A_local, M, base_comm=comm, kind="summa", dtype=A_local.dtype)

x_dist = pylops_mpi.DistributedArray(
    global_shape=(K * M),
    local_shapes=comm.allgather(x_local.shape[0] * x_local.shape[1]),
    base_comm=comm,
    partition=pylops_mpi.Partition.SCATTER,
    dtype=x_local.dtype)
x_dist[:] = x_local.flatten()

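###############################################################################
# As noted in the introduction, the 1-D :py:class:`pylops_mpi.DistributedArray`
# simply stores the flattened local block of :math:`\mathbf{X}` on each rank.
# A quick, purely illustrative check of this correspondence:
assert np.allclose(x_dist.local_array, x_local.ravel())
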
###############################################################################
# We can now apply the forward pass :math:`\mathbf{y} = \mathbf{Ax}` (which
# effectively implements a distributed matrix-matrix multiplication
# :math:`\mathbf{Y} = \mathbf{AX}`). Note that :math:`\mathbf{Y}` is distributed
# in the same way as the input :math:`\mathbf{X}` in a block-block fashion.
y_dist = Aop @ x_dist
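
###############################################################################
# Since the global matrices are small and replicated on every rank, we can
# compare the distributed result with a dense NumPy reference. This check is
# only an illustrative sketch: it assumes that the output :math:`\mathbf{Y}`
# follows the same block layout returned by ``local_block_split`` for an
# :math:`N \times M` matrix, consistent with the block-block distribution
# described above.
y_ref = A_data @ x_data
y_slice = local_block_split(y_shape, rank, comm)
print(f"Process {rank}: forward block matches NumPy reference: "
      f"{np.allclose(y_dist.local_array, y_ref[y_slice].ravel())}")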

###############################################################################
# Next we apply the adjoint pass :math:`\mathbf{x}_{adj} = \mathbf{A}^H \mathbf{y}`
# (which effectively implements a distributed SUMMA matrix-matrix multiplication
# :math:`\mathbf{X}_{adj} = \mathbf{A}^H \mathbf{Y}`). Note that
# :math:`\mathbf{X}_{adj}` is again distributed in the same way as the input
# :math:`\mathbf{X}` in a block-block fashion.
xadj_dist = Aop.H @ y_dist
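
###############################################################################
# The adjoint result can be checked in the same illustrative way against
# :math:`\mathbf{A}^H (\mathbf{A}\mathbf{X})` computed locally with NumPy,
# reusing the block layout of :math:`\mathbf{X}` obtained earlier.
xadj_ref = A_data.conj().T @ (A_data @ x_data)
print(f"Process {rank}: adjoint block matches NumPy reference: "
      f"{np.allclose(xadj_dist.local_array, xadj_ref[x_slice].ravel())}")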

###############################################################################
# Finally, we show that the SUMMA :py:class:`pylops_mpi.basicoperators.MPIMatrixMult`
# operator can be combined with any other PyLops-MPI operator. Here we apply a
# complex conjugation operator to the output of the matrix multiplication.
Dop = Conj(dims=(A_local.shape[0], x_local.shape[1]))
DBop = pylops_mpi.MPIBlockDiag(ops=[Dop, ])
Op = DBop @ Aop
y1 = Op @ x_dist
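
###############################################################################
# Since :py:class:`pylops.Conj` acts element-wise on each rank's local block,
# the output of the chained operator should be the complex conjugate of the
# previous forward result (and, as the test data here is real, identical to
# it). A minimal sanity check on the local arrays:
print(f"Process {rank}: chained result equals conj of forward result: "
      f"{np.allclose(y1.local_array, np.conj(y_dist.local_array))}")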