|
1 | 1 | """ |
2 | 2 | Distributed Matrix Multiplication |
3 | 3 | ================================= |
4 | | -This example shows how to use the :py:class:`pylops_mpi.basicoperators.MatrixMult.MPIMatrixMult`. |
5 | | -This class provides a way to distribute arrays across multiple processes in |
6 | | -a parallel computing environment. |
| 4 | +This example shows how to use the :py:class:`pylops_mpi.basicoperators.MPIMatrixMult` |
| 5 | +operator to perform matrix-matrix multiplication between a matrix :math:`\mathbf{A}` |
| 6 | +blocked over rows (i.e., blocks of rows are stored over different ranks) and a |
| 7 | +matrix :math:`\mathbf{X}` blocked over columns (i.e., blocks of columns are |
| 8 | +stored over different ranks), with an equal number of row and column blocks. |
| 9 | +Similarly, the adjoint operation can be performed with a matrix :math:`\mathbf{Y}` |
| 10 | +blocked in the same fashion as matrix :math:`\mathbf{X}`. |
| 11 | +
|
| 12 | +Note that whilst the different blocks of the matrix :math:`\mathbf{A}` are directly |
| 13 | +stored in the operator on different ranks, the matrix :math:`\mathbf{X}` is |
| 14 | +effectively represented by a 1-D :py:class:`pylops_mpi.DistributedArray` where |
| 15 | +the different blocks are flattened and stored on different ranks. Moreover, to |
| 16 | +optimize communications, the ranks are organized in a 2D grid and some of the |
| 17 | +row blocks of :math:`\mathbf{A}` and column blocks of :math:`\mathbf{X}` are |
| 18 | +replicated across different ranks - see below for details. |
| 19 | +
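| | +To run this example one would typically use an MPI launcher with a square |
| | +number of processes, for instance ``mpiexec -n 4 python <this_script>.py`` |
| | +(the exact launcher and script name depend on your installation and setup). |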
|
7 | 20 | """ |
8 | 21 | from matplotlib import pyplot as plt |
9 | 22 | import math |
|
14 | 27 | from pylops_mpi.basicoperators.MatrixMult import MPIMatrixMult |
15 | 28 |
|
16 | 29 | plt.close("all") |
| 30 | + |
17 | 31 | ############################################################################### |
18 | | -# We set the seed such that all processes initially start out with the same initial matrix. |
19 | | -# Ideally this data would be loaded in a manner appropriate to the use-case. |
| 32 | +# We set the seed such that all processes create the input matrices filled |
| 33 | +# with the same random numbers. In practical applications, such matrices will |
| 34 | +# be filled with data that is appropriate to the use-case. |
20 | 35 | np.random.seed(42) |
21 | 36 |
|
22 | | -# MPI parameters |
| 37 | +############################################################################### |
| 38 | +# Next we obtain the MPI parameters for each rank and check that the number |
| 39 | +# of processes (``size``) is a square number. |
23 | 40 | comm = MPI.COMM_WORLD |
24 | 41 | rank = comm.Get_rank() # rank of current process |
25 | 42 | size = comm.Get_size() # number of processes |
26 | 43 |
|
27 | 44 | p_prime = int(math.ceil(math.sqrt(size))) |
28 | | -C = int(math.ceil(size / p_prime)) |
| 45 | +repl_factor = int(math.ceil(size / p_prime)) |
29 | 46 |
|
30 | | -if (p_prime * C) != size: |
31 | | - print("No. of procs has to be a square number") |
| 47 | +if (p_prime * repl_factor) != size: |
| 48 | + print(f"Number of processes must be a square number, provided {size} instead...") |
32 | 49 | exit(-1) |
33 | 50 |
|
34 | | -# matrix dims |
| 51 | +############################################################################### |
| 52 | +# We are now ready to create the input matrix :math:`\mathbf{A}` of size |
| 53 | +# :math:`M \times K` and the input matrix :math:`\mathbf{X}` of size |
| 54 | +# :math:`K \times N`. |
35 | 55 | M, K, N = 4, 4, 4 |
36 | 56 | A = np.random.rand(M * K).astype(dtype=np.float32).reshape(M, K) |
37 | 57 | X = np.random.rand(K * N).astype(dtype=np.float32).reshape(K, N) |
| 58 | + |
38 | 59 | ################################################################################ |
39 | | -#Process Grid Organization |
40 | | -#************************* |
41 | | -# |
42 | | -#The processes are arranged in a :math:`\sqrt{P} \times \sqrt{P}` grid, where :math:`P` is the total number of processes. |
| 60 | +# The processes are now arranged in a :math:`\sqrt{P} \times \sqrt{P}` grid, |
| 61 | +# where :math:`P` is the total number of processes. |
43 | 62 | # |
44 | | -#Define |
| 63 | +# We define |
45 | 64 | # |
46 | | -#.. math:: |
47 | | -# P' = \bigl \lceil \sqrt{P} \bigr \rceil |
| 65 | +# .. math:: |
| 66 | +# P' = \bigl \lceil \sqrt{P} \bigr \rceil |
48 | 67 | # |
49 | | -#and the replication factor |
| 68 | +# and the replication factor |
50 | 69 | # |
51 | | -#.. math:: |
52 | | -# C = \bigl\lceil \tfrac{P}{P'} \bigr\rceil. |
| 70 | +# .. math:: |
| 71 | +# R = \bigl\lceil \tfrac{P}{P'} \bigr\rceil. |
53 | 72 | # |
54 | | -#Each process is assigned a pair of coordinates :math:`(g, l)` within this grid: |
| 73 | +# Each process is therefore assigned a pair of coordinates |
| 74 | +# :math:`(g, l)` within this grid: |
55 | 75 | # |
56 | | -#.. math:: |
57 | | -# g = \mathrm{rank} \bmod P', |
58 | | -# \quad |
59 | | -# l = \left\lfloor \frac{\mathrm{rank}}{P'} \right\rfloor. |
| 76 | +# .. math:: |
| 77 | +# g = \mathrm{rank} \bmod P', |
| 78 | +# \quad |
| 79 | +# l = \left\lfloor \frac{\mathrm{rank}}{P'} \right\rfloor. |
60 | 80 | # |
61 | 81 | #For example, when :math:`P = 4` we have :math:`P' = 2`, giving a 2×2 layout: |
62 | 82 | # |
|
75 | 95 | my_group = rank % p_prime |
76 | 96 | my_layer = rank // p_prime |
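| | +# For example, with P = 4 processes (and thus P' = 2), the (group, layer) |
| | +# coordinates are: rank 0 -> (0, 0), rank 1 -> (1, 0), rank 2 -> (0, 1) |
| | +# and rank 3 -> (1, 1). |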
77 | 97 |
|
78 | | -# Create the sub‐communicators |
| 98 | +# Create sub‐communicators |
79 | 99 | layer_comm = comm.Split(color=my_layer, key=my_group) # all procs in same layer |
80 | 100 | group_comm = comm.Split(color=my_group, key=my_layer) # all procs in same group |
81 | 101 |
|
82 | | -blk_rows = int(math.ceil(M / p_prime)) |
83 | | -blk_cols = int(math.ceil(N / p_prime)) |
84 | | - |
85 | | -rs = my_group * blk_rows |
86 | | -re = min(M, rs + blk_rows) |
87 | | -my_own_rows = re - rs |
88 | | - |
89 | | -cs = my_layer * blk_cols |
90 | | -ce = min(N, cs + blk_cols) |
91 | | -my_own_cols = ce - cs |
92 | | - |
93 | 102 | ################################################################################ |
94 | | -#Each rank will end up with: |
95 | | -# - :math:`A_{p} \in \mathbb{R}^{\text{my_own_rows}\times K}` |
96 | | -# - :math:`X_{p} \in \mathbb{R}^{K\times \text{my_own_cols}}` |
97 | | -#as follows: |
98 | | -A_p, X_p = A[rs:re, :].copy(), X[:, cs:ce].copy() |
99 | | - |
100 | | -################################################################################ |
101 | | -#.. raw:: html |
| 103 | +# At this point we divide the rows and columns of :math:`\mathbf{A}` and |
| 104 | +# :math:`\mathbf{X}`, respectively, such that each rank ends up with: |
| 105 | +# |
| 106 | +# - :math:`A_{p} \in \mathbb{R}^{\text{my_own_rows}\times K}` |
| 107 | +# - :math:`X_{p} \in \mathbb{R}^{K\times \text{my_own_cols}}` |
| 108 | +# |
| 109 | +# .. raw:: html |
102 | 110 | # |
103 | 111 | # <div style="text-align: left; font-family: monospace; white-space: pre;"> |
104 | 112 | # <b>Matrix A (4 x 4):</b> |
|
111 | 119 | # └─────────────────┘ |
112 | 120 | # </div> |
113 | 121 | # |
114 | | -#.. raw:: html |
| 122 | +# .. raw:: html |
115 | 123 | # |
116 | 124 | # <div style="text-align: left; font-family: monospace; white-space: pre;"> |
117 | | -# <b>Matrix B (4 x 4):</b> |
| 125 | +# <b>Matrix X (4 x 4):</b> |
118 | 126 | # ┌─────────┬─────────┐ |
119 | 127 | # │ b11 b12 │ b13 b14 │ <- Cols 0–1 (Layer 0), Cols 2–3 (Layer 1) |
120 | 128 | # │ b21 b22 │ b23 b24 │ |
|
125 | 133 | # </div> |
126 | 134 | # |
127 | 135 |
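| | +# Compute the extent of the block of rows of A (determined by the group index) |
| | +# and of the block of columns of X (determined by the layer index) owned by |
| | +# this rank |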
|
| 136 | +blk_rows = int(math.ceil(M / p_prime)) |
| 137 | +blk_cols = int(math.ceil(N / p_prime)) |
| 138 | + |
| 139 | +rs = my_group * blk_rows |
| 140 | +re = min(M, rs + blk_rows) |
| 141 | +my_own_rows = re - rs |
| 142 | + |
| 143 | +cs = my_layer * blk_cols |
| 144 | +ce = min(N, cs + blk_cols) |
| 145 | +my_own_cols = ce - cs |
| 146 | + |
| 147 | +A_p, X_p = A[rs:re, :].copy(), X[:, cs:ce].copy() |
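| | +# Note that ranks in the same group hold the same row block of A (replicated |
| | +# across layers), whilst ranks in the same layer hold the same column block |
| | +# of X (replicated across groups). |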
| 148 | + |
128 | 149 | ################################################################################ |
129 | | -#Forward Operation |
130 | | -#***************** |
131 | | -#To perform our distributed matrix-matrix multiplication :math:`Y = \text{Aop} \times X` we need to create our distributed operator :math:`\text{Aop}` and distributed operand :math:`X` from :math:`A_p` and |
132 | | -#:math:`X_p` respectively |
| 150 | +# We are now ready to create the :py:class:`pylops_mpi.basicoperators.MPIMatrixMult` |
| 151 | +# operator and the input matrix :math:`\mathbf{X}`. |
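| | +# Each rank instantiates the operator with its own row block A_p and with the |
| | +# total number of columns N of X (and, therefore, of the output Y). |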
133 | 152 | Aop = MPIMatrixMult(A_p, N, dtype="float32") |
134 | | -################################################################################ |
135 | | -# While as well passing the appropriate values. |
| 153 | + |
136 | 154 | col_lens = comm.allgather(my_own_cols) |
137 | | -total_cols = np.sum(col_lens) |
138 | | -x = DistributedArray(global_shape=K * total_cols, |
| 155 | +x = DistributedArray(global_shape=K * N, |
139 | 156 | local_shapes=[K * col_len for col_len in col_lens], |
140 | 157 | partition=Partition.SCATTER, |
141 | | - mask=[i // p_prime for i in range(comm.Get_size())], |
| 158 | + mask=[i % p_prime for i in range(comm.Get_size())], |
142 | 159 | base_comm=comm, |
143 | 160 | dtype="float32") |
144 | 161 | x[:] = X_p.flatten() |
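| | +# Ranks sharing the same mask value (i.e., belonging to the same group) hold, |
| | +# taken together, one full copy of X; the mask informs the DistributedArray of |
| | +# this replication when performing global operations such as dot products and |
| | +# norms. |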
| 162 | + |
145 | 163 | ################################################################################ |
146 | | -#When we perform the matrix-matrix multiplication we shall then obtain a distributed :math:`Y` in the same way our :math:`X` was distributed. |
| 164 | +# We can now apply the forward pass :math:`\mathbf{y} = \mathbf{Ax}` (which effectively |
| 165 | +# implements a distributed matrix-matrix multiplication :math:`\mathbf{Y} = \mathbf{AX}`). |
| 166 | +# Note that :math:`\mathbf{Y}` is distributed in the same way as the input |
| 167 | +# :math:`\mathbf{X}`. |
147 | 168 | y = Aop @ x |
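| | +# On each rank, ``y.local_array`` contains the flattened block Y[:, cs:ce], |
| | +# i.e., the same block of columns that the rank holds of X. |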
| 169 | + |
148 | 170 | ############################################################################### |
149 | | -#Adjoint Operation |
150 | | -#***************** |
151 | | -# In a similar fashion we then perform the Adjoint :math:`Xadj = A^H * Y` |
| 171 | +# Next we apply the adjoint pass :math:`\mathbf{x}_{adj} = \mathbf{A}^H \mathbf{y}` |
| 172 | +# (which effectively implements a distributed matrix-matrix multiplication |
| 173 | +# :math:`\mathbf{X}_{adj} = \mathbf{A}^H \mathbf{Y}`). Note that |
| 174 | +# :math:`\mathbf{X}_{adj}` is again distributed in the same way as the input |
| 175 | +# :math:`\mathbf{X}`. |
152 | 176 | xadj = Aop.H @ y |
| 177 | + |
153 | 178 | ############################################################################### |
154 | | -#Here we verify the result against the equivalent serial version of the operation. Each rank checks that it has computed the correct values for it partition. |
| 179 | +# To conclude, we verify our result against the equivalent serial version of |
| 180 | +# the operation by gathering the resulting matrices on rank 0 and reorganizing |
| 181 | +# the returned 1D arrays into 2D arrays. |
| 182 | + |
| 183 | +# Serial reference results (computed on every rank for comparison) |
155 | 184 | y_loc = A @ X |
156 | 185 | xadj_loc = (A.T.dot(y_loc.conj())).conj() |
157 | 186 |
|
158 | | -expected_y_loc = y_loc[:, cs:ce].flatten().astype(np.float32) |
159 | | -expected_xadj_loc = xadj_loc[:, cs:ce].flatten().astype(np.float32) |
160 | | - |
161 | | -if not np.allclose(y.local_array, expected_y_loc, rtol=1e-6): |
162 | | - print(f"RANK {rank}: FORWARD VERIFICATION FAILED") |
163 | | - print(f'{rank} local: {y.local_array}, expected: {y_loc[:, cs:ce]}') |
164 | | -else: |
165 | | - print(f"RANK {rank}: FORWARD VERIFICATION PASSED") |
166 | | - |
167 | | -if not np.allclose(xadj.local_array, expected_xadj_loc, rtol=1e-6): |
168 | | - print(f"RANK {rank}: ADJOINT VERIFICATION FAILED") |
169 | | - print(f'{rank} local: {xadj.local_array}, expected: {xadj_loc[:, cs:ce]}') |
170 | | -else: |
171 | | - print(f"RANK {rank}: ADJOINT VERIFICATION PASSED") |
172 | | - |
| 187 | +y = y.asarray(masked=True) |
| 188 | +if N > 1: |
| 189 | + y = y.reshape(p_prime, M, blk_cols) |
| 190 | +y = np.hstack([yblock for yblock in y]) |
| 191 | +xadj = xadj.asarray(masked=True) |
| 192 | +if N > 1: |
| 193 | + xadj = xadj.reshape(p_prime, K, blk_cols) |
| 194 | +xadj = np.hstack([xadjblock for xadjblock in xadj]) |
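| | +# After gathering, ``y`` and ``xadj`` are (M, N) and (K, N) NumPy arrays that |
| | +# can be compared directly with the serial results. |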
| 195 | + |
| 196 | +if rank == 0: |
| 197 | +    y_loc = y_loc.squeeze() |
| 198 | +    xadj_loc = xadj_loc.squeeze() |
| 199 | + |
| 200 | +    if not np.allclose(y, y_loc, rtol=1e-6): |
| 201 | +        print("FORWARD VERIFICATION FAILED") |
| 202 | +        print(f"distributed: {y}") |
| 203 | +        print(f"expected: {y_loc}") |
| 204 | +    else: |
| 205 | +        print("FORWARD VERIFICATION PASSED") |
| 206 | + |
| 207 | +    if not np.allclose(xadj, xadj_loc, rtol=1e-6): |
| 208 | +        print("ADJOINT VERIFICATION FAILED") |
| 209 | +        print(f"distributed: {xadj}") |
| 210 | +        print(f"expected: {xadj_loc}") |
| 211 | +    else: |
| 212 | +        print("ADJOINT VERIFICATION PASSED") |