@@ -23,7 +23,7 @@ class MPIMatrixMult(MPILinearOperator):
     ----------
     A : :obj:`numpy.ndarray`
         Local block of the matrix of shape :math:`[N_{loc} \times K]`
-        where ``N_loc`` is the number of rows stored on this MPI rank and
+        where :math:`N_{loc}` is the number of rows stored on this MPI rank and
         ``K`` is the global number of columns.
     M : :obj:`int`
         Global leading dimension (i.e., number of columns) of the matrices
@@ -46,7 +46,7 @@ class MPIMatrixMult(MPILinearOperator):
     Raises
     ------
     Exception
-        If the operator is created without a square number of mpi ranks.
+        If the operator is created with a non-square number of MPI ranks.
     ValueError
         If input vector does not have the correct partition type.

@@ -64,15 +64,15 @@ class MPIMatrixMult(MPILinearOperator):
     :math:`\mathbf{A}^H` is the complex conjugate and transpose of :math:`\mathbf{A}`.

     This implementation is based on a 1D block distribution of the operator
-    matrix and reshaped model and data vectors replicated across math:`P`
+    matrix and reshaped model and data vectors replicated across :math:`P`
     processes by a factor equivalent to :math:`\sqrt{P}` across a square process
     grid (:math:`\sqrt{P}\times\sqrt{P}`). More specifically:

     - The matrix ``A`` is distributed across MPI processes in a block-row fashion
       and each process holds a local block of ``A`` with shape
       :math:`[N_{loc} \times K]`
     - The operand matrix ``X`` is distributed in a block-column fashion and
-      each process holds a local block of ``X`` with shape
+      each process holds a local block of ``X`` with shape
       :math:`[K \times M_{loc}]`
     - Communication is minimized by using a 2D process grid layout

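To make the layout above concrete, here is a minimal mpi4py sketch of the square process grid and the local block shapes. The row-major rank-to-grid mapping, the `comm.Split` row communicator, and the example sizes are assumptions for illustration only, not the operator's internals.

```python
import math

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# The operator requires a square number of ranks: P = sqrt(P) * sqrt(P)
p_prime = math.isqrt(size)
if p_prime * p_prime != size:
    raise Exception("MPIMatrixMult requires a square number of MPI ranks")

# Assumed row-major mapping of ranks onto the sqrt(P) x sqrt(P) grid
row_id, col_id = divmod(rank, p_prime)
row_comm = comm.Split(color=row_id, key=col_id)  # ranks in the same grid row

# Illustrative global sizes, chosen to divide evenly by sqrt(P)
N, K, M = 8, 6, 4
n_loc = N // p_prime  # rows of the local block of A
m_loc = M // p_prime  # columns of the local blocks of X and Y

A_local = np.random.rand(n_loc, K)  # block-row of A held by this rank
X_local = np.random.rand(K, m_loc)  # block-column of X held by this rank
```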
@@ -82,17 +82,13 @@ class MPIMatrixMult(MPILinearOperator):
        of shape ``(K, M)``) is reshaped to ``(K, M_local)`` where ``M_local``
        is the number of columns assigned to the current process.

-    2. **Data Broadcasting**: Within each row (processes with same ``row_id``),
-       the operand data is broadcast from the process whose ``col_id`` matches
-       the ``row_id`` (processes along the diagonal). This ensures all processes
-       in a row have access to the same operand columns.
-
-    3. **Local Computation**: Each process computes ``A_local @ X_local`` where:
+    2. **Local Computation**: Each process computes ``A_local @ X_local`` where:
        - ``A_local`` is the local block of matrix ``A`` (shape ``N_local x K``)
        - ``X_local`` is the broadcasted operand (shape ``K x M_local``)

-    4. **Row-wise Gather**: Results from all processes in each row are gathered
-       using ``allgather`` to reconstruct the full result matrix vertically.
+    3. **Row-wise Gather**: Results from all processes in each row are gathered
+       using ``allgather`` to ensure that each rank has a block-column of the
+       output matrix.

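Reusing the names from the setup sketch above, the forward steps just described (local product followed by a row-wise `allgather`) reduce to roughly the following pattern; this is only a sketch of the communication described in the docstring, not the library's actual code.

```python
# (continues the setup sketch above: row_comm, A_local, X_local, n_loc, m_loc)
import numpy as np

# Step 2 (Local Computation): block-row of A times block-column of X
Y_partial = A_local @ X_local  # shape (n_loc, m_loc)

# Step 3 (Row-wise Gather): ranks in the same grid row exchange their partial
# results so that each of them ends up with a full block-column of the output
Y_blocks = row_comm.allgather(Y_partial)  # list of sqrt(P) arrays
Y_local = np.vstack(Y_blocks)             # shape (sqrt(P) * n_loc, m_loc)
```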
     **Adjoint Operation step-by-step**

@@ -101,21 +97,20 @@ class MPIMatrixMult(MPILinearOperator):
     1. **Input Reshaping**: The input vector ``x`` is reshaped to ``(N, M_local)``
        representing the local columns of the input matrix.

-    2. **Local Adjoint Computation**:
-       Each process computes ``A_local.H @ X_tile``
-       where ``A_local.H`` is either:
-       - Pre-computed ``At`` (if ``saveAt=True``)
-       - Computed on-the-fly as ``A.T.conj()`` (if ``saveAt=False``)
-       Each process multiplies its transposed local ``A`` block ``A_local^H``
-       (shape ``K x N_block``)
-       with the extracted ``X_tile`` (shape ``N_block x M_local``),
-       producing a partial result of shape ``(K, M_local)``.
-       This computes the local contribution of columns of ``A^H`` to the final result.
+    2. **Local Adjoint Computation**: Each process computes
+       ``A_local.H @ X_tile`` where ``A_local.H`` is either i) pre-computed
+       and stored in ``At`` (if ``saveAt=True``), or ii) computed on-the-fly as
+       ``A.T.conj()`` (if ``saveAt=False``). Each process multiplies its
+       transposed local ``A`` block ``A_local^H`` (shape ``K x N_block``)
+       with the extracted ``X_tile`` (shape ``N_block x M_local``),
+       producing a partial result of shape ``(K, M_local)``.
+       This computes the local contribution of columns of ``A^H`` to the final
+       result.

     3. **Row-wise Reduction**: Since the full result ``Y = A^H \cdot X`` is the
-       sum of contributions from all column blocks of ``A^H``, processes in the
-       same rows perform an ``allreduce`` sum to combine their partial results.
-       This gives the complete ``(K, M_local)`` result for their assigned columns .
+       sum of the contributions from all column blocks of ``A^H``, processes in
+       the same row perform an ``allreduce`` sum to combine their partial results.
+       This gives the complete ``(K, M_local)`` result for their assigned block-column.

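The adjoint steps map onto a similarly small pattern, again reusing the setup sketch above; `X_tile` and the `saveAt` flag are stand-ins for the quantities named in the docstring, and random data replaces the real extracted slice.

```python
# (continues the sketches above: MPI, np, row_comm, A_local, n_loc, m_loc)
saveAt = True
At = A_local.T.conj() if saveAt else None  # optionally precompute A_local^H

# X_tile: the rows of the reshaped input that line up with this rank's block
# of A; random data stands in for the extracted slice here
X_tile = np.random.rand(n_loc, m_loc)

# Step 2 (Local Adjoint Computation): partial contribution of shape (K, m_loc)
A_localH = At if saveAt else A_local.T.conj()
Y_partial = A_localH @ X_tile

# Step 3 (Row-wise Reduction): sum the partial contributions across the grid
# row so that every rank holds the complete (K, m_loc) block-column of A^H X
Y_local = row_comm.allreduce(Y_partial, op=MPI.SUM)
```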
     """
     def __init__ (