@@ -25,7 +25,7 @@ class MPIMatrixMult(MPILinearOperator):
         Local block of the matrix of shape :math:`[M_{loc} \times K]`
         where ``M_loc`` is the number of rows stored on this MPI rank and
         ``K`` is the global number of columns.
-    N : :obj:`int`
+    M : :obj:`int`
         Global leading dimension (i.e., number of columns) of the matrices
         representing the input model and data vectors.
     saveAt : :obj:`bool`, optional
@@ -55,9 +55,9 @@ class MPIMatrixMult(MPILinearOperator):
    This operator performs a matrix-matrix multiplication, whose forward
    operation can be described as :math:`Y = A \cdot X` where:
 
-    - :math:`\mathbf{A}` is the distributed matrix operator of shape :math:`[M \times K]`
-    - :math:`\mathbf{X}` is the distributed operand matrix of shape :math:`[K \times N]`
-    - :math:`\mathbf{Y}` is the resulting distributed matrix of shape :math:`[M \times N]`
+    - :math:`\mathbf{A}` is the distributed matrix operator of shape :math:`[N \times K]`
+    - :math:`\mathbf{X}` is the distributed operand matrix of shape :math:`[K \times M]`
+    - :math:`\mathbf{Y}` is the resulting distributed matrix of shape :math:`[N \times M]`
 
    whilst the adjoint operation is represented by
    :math:`\mathbf{X}_{adj} = \mathbf{A}^H \cdot \mathbf{Y}` where
@@ -70,16 +70,16 @@ class MPIMatrixMult(MPILinearOperator):
 
    - The matrix ``A`` is distributed across MPI processes in a block-row fashion
      and each process holds a local block of ``A`` with shape
-      :math:`[M_{loc} \times K]`
+      :math:`[N_{loc} \times K]`
    - The operand matrix ``X`` is distributed in a block-column fashion and
-      and each process holds a local block of ``X`` with shape
-      :math:`[K \times N_{loc}]`
+      each process holds a local block of ``X`` with shape
+      :math:`[K \times M_{loc}]`
    - Communication is minimized by using a 2D process grid layout
 
    **Forward Operation step-by-step**
 
    1. **Input Preparation**: The input vector ``x`` (flattened from matrix ``X``
-       of shape ``(K, N)``) is reshaped to ``(K, N_local)`` where ``N_local``
+       of shape ``(K, M)``) is reshaped to ``(K, M_local)`` where ``M_local``
       is the number of columns assigned to the current process.
 
    2. **Data Broadcasting**: Within each layer (processes with same ``layer_id``),
@@ -88,8 +88,8 @@ class MPIMatrixMult(MPILinearOperator):
       the same operand columns.
 
    3. **Local Computation**: Each process computes ``A_local @ X_local`` where:
-       - ``A_local`` is the local block of matrix ``A`` (shape ``M_local x K``)
-       - ``X_local`` is the broadcasted operand (shape ``K x N_local``)
+       - ``A_local`` is the local block of matrix ``A`` (shape ``N_local x K``)
+       - ``X_local`` is the broadcasted operand (shape ``K x M_local``)
 
    4. **Layer Gather**: Results from all processes in each layer are gathered
       using ``allgather`` to reconstruct the full result matrix vertically.
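The local computation and layer gather in steps 3-4 can be mimicked serially; the sketch below stacks per-row-block products with ``np.vstack`` in place of the MPI ``allgather`` (toy sizes and a hypothetical grid side ``P_prime``, purely illustrative):

```python
import math
import numpy as np

# toy global sizes and grid side (assumed for illustration only)
N, K, M = 6, 4, 5
P_prime = 2

A = np.random.rand(N, K)                  # full operator, kept on one process here
X = np.random.rand(K, M)                  # full operand

blk_rows = int(math.ceil(N / P_prime))    # rows of A per process (block-row split)
block_cols = int(math.ceil(M / P_prime))  # columns of X per layer (block-column split)

# one layer works on one block of columns of X (the broadcast of step 2)
layer_id = 0
X_local = X[:, layer_id * block_cols:min(M, (layer_id + 1) * block_cols)]

# step 3: each process in the layer multiplies its row block of A with X_local
partials = []
for group_id in range(P_prime):
    rs = group_id * blk_rows
    re = min(N, rs + blk_rows)
    partials.append(A[rs:re] @ X_local)   # shape (N_local, M_local)

# step 4: the layer "allgather" stacks the partial results vertically
Y_layer = np.vstack(partials)             # shape (N, M_local)
assert np.allclose(Y_layer, A @ X_local)
```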
@@ -98,7 +98,7 @@ class MPIMatrixMult(MPILinearOperator):
 
    The adjoint operation performs the conjugate transpose multiplication:
 
-    1. **Input Reshaping**: The input vector ``x`` is reshaped to ``(M, N_local)``
+    1. **Input Reshaping**: The input vector ``x`` is reshaped to ``(N, M_local)``
       representing the local columns of the input matrix.
 
    2. **Local Adjoint Computation**:
@@ -107,21 +107,21 @@ class MPIMatrixMult(MPILinearOperator):
       - Pre-computed ``At`` (if ``saveAt=True``)
       - Computed on-the-fly as ``A.T.conj()`` (if ``saveAt=False``)
       Each process multiplies its transposed local ``A`` block ``A_local^H``
-       (shape ``K x M_block``)
-       with the extracted ``X_tile`` (shape ``M_block x N_local``),
-       producing a partial result of shape ``(K, N_local)``.
+       (shape ``K x N_block``)
+       with the extracted ``X_tile`` (shape ``N_block x M_local``),
+       producing a partial result of shape ``(K, M_local)``.
       This computes the local contribution of columns of ``A^H`` to the final result.
 
    3. **Layer Reduction**: Since the full result ``Y = A^H \cdot X`` is the
       sum of contributions from all column blocks of ``A^H``, processes in the
       same layer perform an ``allreduce`` sum to combine their partial results.
-       This gives the complete ``(K, N_local)`` result for their assigned columns.
+       This gives the complete ``(K, M_local)`` result for their assigned columns.
 
    """
    def __init__(
        self,
        A: NDArray,
-        N: int,
+        M: int,
        saveAt: bool = False,
        base_comm: MPI.Comm = MPI.COMM_WORLD,
        dtype: DTypeLike = "float64",
@@ -147,25 +147,25 @@ def __init__(
        self.A = A.astype(np.dtype(dtype))
        if saveAt: self.At = A.T.conj()
 
-        self.M = self._layer_comm.allreduce(self.A.shape[0], op=MPI.SUM)
+        self.N = self._layer_comm.allreduce(self.A.shape[0], op=MPI.SUM)
        self.K = A.shape[1]
-        self.N = N
+        self.M = M
 
-        block_cols = int(math.ceil(self.N / self._P_prime))
-        blk_rows = int(math.ceil(self.M / self._P_prime))
+        block_cols = int(math.ceil(self.M / self._P_prime))
+        blk_rows = int(math.ceil(self.N / self._P_prime))
 
        self._row_start = self._group_id * blk_rows
-        self._row_end = min(self.M, self._row_start + blk_rows)
+        self._row_end = min(self.N, self._row_start + blk_rows)
 
        self._col_start = self._layer_id * block_cols
-        self._col_end = min(self.N, self._col_start + block_cols)
+        self._col_end = min(self.M, self._col_start + block_cols)
 
        self._local_ncols = self._col_end - self._col_start
        self._rank_col_lens = self.base_comm.allgather(self._local_ncols)
        total_ncols = np.sum(self._rank_col_lens)
 
        self.dims = (self.K, total_ncols)
-        self.dimsd = (self.M, total_ncols)
+        self.dimsd = (self.N, total_ncols)
        shape = (int(np.prod(self.dimsd)), int(np.prod(self.dims)))
        super().__init__(shape=shape, dtype=np.dtype(dtype), base_comm=base_comm)
 
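The same ceil-based block sizes computed above determine which slice of ``A`` and of the operand each process owns; a standalone sketch that enumerates the ranges for every ``group_id``/``layer_id`` pair (toy sizes, and the grid side ``P_prime`` is assumed here rather than taken from the communicator):

```python
import math

# toy global sizes (illustrative only)
N, K, M = 10, 4, 7
P_prime = 3  # assumed side of the P_prime x P_prime process grid

blk_rows = int(math.ceil(N / P_prime))    # rows of A owned by each group
block_cols = int(math.ceil(M / P_prime))  # operand columns owned by each layer

for group_id in range(P_prime):
    row_start = group_id * blk_rows
    row_end = min(N, row_start + blk_rows)
    for layer_id in range(P_prime):
        col_start = layer_id * block_cols
        col_end = min(M, col_start + block_cols)
        print(f"group {group_id}, layer {layer_id}: "
              f"A rows [{row_start}:{row_end}), operand cols [{col_start}:{col_end})")
```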
@@ -174,8 +174,8 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
        if x.partition != Partition.SCATTER:
            raise ValueError(f"x should have partition={Partition.SCATTER} Got {x.partition} instead...")
 
-        y = DistributedArray(global_shape=(self.M * self.dimsd[1]),
-                             local_shapes=[(self.M * c) for c in self._rank_col_lens],
+        y = DistributedArray(global_shape=(self.N * self.dimsd[1]),
+                             local_shapes=[(self.N * c) for c in self._rank_col_lens],
                             mask=x.mask,
                             partition=Partition.SCATTER,
                             dtype=self.dtype)
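The output ``DistributedArray`` built here is the flattened ``(N, total_ncols)`` result, split column-block by column-block; a toy calculation of the ``global_shape``/``local_shapes`` arithmetic (the ``rank_col_lens`` values are made up for illustration):

```python
import numpy as np

N = 6
rank_col_lens = [3, 2, 2]              # per-rank column counts (assumed values)
total_ncols = int(np.sum(rank_col_lens))

global_shape = N * total_ncols         # flattened (N, total_ncols) output
local_shapes = [N * c for c in rank_col_lens]

assert sum(local_shapes) == global_shape
print(global_shape, local_shapes)      # 42 [18, 12, 12]
```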
@@ -204,7 +204,7 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
            dtype=self.dtype,
        )
 
-        x_arr = x.local_array.reshape((self.M, self._local_ncols)).astype(self.dtype)
+        x_arr = x.local_array.reshape((self.N, self._local_ncols)).astype(self.dtype)
        X_tile = x_arr[self._row_start:self._row_end, :]
        A_local = self.At if hasattr(self, "At") else self.A.T.conj()
        Y_local = ncp.matmul(A_local, X_tile)
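The partial product above, ``A_local^H @ X_tile``, only becomes the full ``(K, M_local)`` adjoint after the layer-wise ``allreduce`` described in the Notes; a serial check of that identity, summing the per-row-block contributions (toy complex-valued data, no MPI):

```python
import math
import numpy as np

# toy sizes (illustrative only): N rows of A, K columns, M_local operand columns
N, K, M_local = 6, 4, 3
P_prime = 2

rng = np.random.default_rng(0)
A = rng.standard_normal((N, K)) + 1j * rng.standard_normal((N, K))
X = rng.standard_normal((N, M_local)) + 1j * rng.standard_normal((N, M_local))

blk_rows = int(math.ceil(N / P_prime))
partial_sum = np.zeros((K, M_local), dtype=complex)
for group_id in range(P_prime):                  # one contribution per process in the layer
    rs = group_id * blk_rows
    re = min(N, rs + blk_rows)
    partial_sum += A[rs:re].conj().T @ X[rs:re]  # A_local^H @ X_tile, shape (K, M_local)

# the layer "allreduce" sum reproduces the full adjoint product
assert np.allclose(partial_sum, A.conj().T @ X)
```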