 from mpi4py import MPI

 from pylops_mpi import DistributedArray, Partition
-from pylops_mpi.basicoperators.MatrixMult import MPISUMMAMatrixMult
+from pylops_mpi.basicoperators.MatrixMult import MPIMatrixMult

 np.random.seed(42)
 P_prime = int(math.ceil(math.sqrt(n_procs)))
 C = int(math.ceil(n_procs / P_prime))

-if P_prime * C < n_procs:
+if (P_prime * C) != n_procs:
     print("No. of procs has to be a square number")
     exit(-1)
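The old `<` guard let mismatched rank counts through: with 7 ranks, `P_prime * C = 9` is not less than 7, so the check passed and the grid silently over-allocated. The exact `!=` test catches that, although a non-square count such as 6 (where `3 * 2 == 6`) still slips past the "square number" message; `P_prime * P_prime == n_procs` would be the literal check. A standalone sketch of how the guards differ (`n_procs` comes from `comm.Get_size()` in the elided setup lines):

```python
import math

# Compare the old (<) and new (!=) guards for a few rank counts.
for n_procs in (4, 6, 7, 9, 16):
    P_prime = int(math.ceil(math.sqrt(n_procs)))
    C = int(math.ceil(n_procs / P_prime))
    old_rejects = P_prime * C < n_procs        # old guard
    new_rejects = (P_prime * C) != n_procs     # new guard
    is_square = P_prime * P_prime == n_procs   # what the message demands
    print(f"n_procs={n_procs:2d} old_rejects={old_rejects} "
          f"new_rejects={new_rejects} square={is_square}")
# n_procs=7: the old guard accepts (9 is not < 7) but the new guard rejects;
# n_procs=6: both accept (3 * 2 == 6) even though 6 is not a square.
```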

 # matrix dims
 M = 32
-K = 32
-N = 35
+K = 35
+N = 37

-blk_rows = int(math.ceil(M / P_prime))
-blk_cols = int(math.ceil(N / P_prime))
+A = np.random.rand(M * K).astype(dtype=np.float32).reshape(M, K)
+B = np.random.rand(K * N).astype(dtype=np.float32).reshape(K, N)
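K and N change from 32 and 35 to 35 and 37, so all three dimensions are distinct (which helps catch shape and transposition mistakes) and not divisible by the grid size (which exercises the `min()` edge handling below). A and B are also now generated identically on every rank from the shared seed, rather than materialized only on rank 0. A sketch of the uneven blocks this produces, assuming a 2 x 2 grid:

```python
import math

M, N, P_prime = 32, 37, 2
blk_rows = int(math.ceil(M / P_prime))   # 16: rows split evenly
blk_cols = int(math.ceil(N / P_prime))   # 19: columns cannot split evenly
for b in range(P_prime):
    rs, re = b * blk_rows, min(M, (b + 1) * blk_rows)
    cs, ce = b * blk_cols, min(N, (b + 1) * blk_cols)
    print(f"block {b}: rows {rs}:{re} ({re - rs}), cols {cs}:{ce} ({ce - cs})")
# block 0: rows 0:16 (16), cols 0:19 (19)
# block 1: rows 16:32 (16), cols 19:37 (18)  <- min() trims the last block
```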

 my_group = rank % P_prime
 my_layer = rank // P_prime

 layer_comm = comm.Split(color=my_layer, key=my_group)  # all procs in same layer
 group_comm = comm.Split(color=my_group, key=my_layer)  # all procs in same group
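`comm.Split` partitions ranks by `color` and orders each subcommunicator by `key`, so `layer_comm` connects all ranks of one layer (one row of the grid) and `group_comm` all ranks of one group (one column). A minimal standalone sketch of the rank-to-grid mapping for four ranks:

```python
# Rank layout on the P' x P' grid for n_procs = 4 (no MPI needed to see it).
P_prime = 2
for rank in range(4):
    my_group = rank % P_prime    # position within a layer
    my_layer = rank // P_prime   # which layer the rank belongs to
    print(f"rank {rank} -> group {my_group}, layer {my_layer}")
# rank 0 -> (0, 0), rank 1 -> (1, 0), rank 2 -> (0, 1), rank 3 -> (1, 1)
```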

-# Each rank will end up with:
-#   A_p: shape (my_own_rows, K)
-#   B_p: shape (K, my_own_cols)
-# where
+
+# Each rank will end up with:
+#      - :math:`A_{p} \in \mathbb{R}^{\text{my\_own\_rows}\times K}`
+#      - :math:`B_{p} \in \mathbb{R}^{K\times \text{my\_own\_cols}}`
+#    where
+blk_rows = int(math.ceil(M / P_prime))
 row_start = my_group * blk_rows
 row_end = min(M, row_start + blk_rows)
 my_own_rows = row_end - row_start

-col_start = my_group * blk_cols  # note: same my_group index on cols
+blk_cols = int(math.ceil(N / P_prime))
+col_start = my_layer * blk_cols
 col_end = min(N, col_start + blk_cols)
 my_own_cols = col_end - col_start
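This hunk fixes the actual bug: B's columns were sliced by `my_group` (the deleted inline note even flagged it), while the old verification block further down sliced the expected result by `my_layer`, so on a 2 x 2 grid ranks 1 and 2 were checked against columns they never received. Column offsets are now tied to `my_layer` throughout, and `blk_rows`/`blk_cols` move next to their first use. The two mappings side by side:

```python
import math

N, P_prime = 37, 2
blk_cols = int(math.ceil(N / P_prime))
for rank in range(4):
    my_group, my_layer = rank % P_prime, rank // P_prime
    old_cs = my_group * blk_cols   # old: tied to the group (row) index
    new_cs = my_layer * blk_cols   # new: tied to the layer, as verified below
    print(f"rank {rank}: old col_start={old_cs}, new col_start={new_cs}")
# rank 1: old gives cols 19:, new gives cols 0:   (mismatch under old code)
# rank 2: old gives cols 0:,  new gives cols 19:  (mismatch under old code)
```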

-# ======================= BROADCASTING THE SLICES =======================
-if rank == 0:
-    A = np.arange(M * K, dtype=np.float32).reshape(M, K)
-    B = np.arange(K * N, dtype=np.float32).reshape(K, N)
-    for dest in range(n_procs):
-        pg = dest % P_prime
-        rs = pg * blk_rows;
-        re = min(M, rs + blk_rows)
-        cs = pg * blk_cols;
-        ce = min(N, cs + blk_cols)
-        a_block, b_block = A[rs:re, :], B[:, cs:ce]
-        if dest == 0:
-            A_p, B_p = a_block, b_block
-        else:
-            comm.Send(a_block, dest=dest, tag=100 + dest)
-            comm.Send(b_block, dest=dest, tag=200 + dest)
-else:
-    A_p = np.empty((my_own_rows, K), dtype=np.float32)
-    B_p = np.empty((K, my_own_cols), dtype=np.float32)
-    comm.Recv(A_p, source=0, tag=100 + rank)
-    comm.Recv(B_p, source=0, tag=200 + rank)

-comm.Barrier()
+rs = (rank % P_prime) * blk_rows
+re = min(M, rs + blk_rows)

-Aop = MPISUMMAMatrixMult(A_p, N)
+cs = (rank // P_prime) * blk_cols
+ce = min(N, cs + blk_cols)
+A_p, B_p = A[rs:re, :].copy(), B[:, cs:ce].copy()
+
+Aop = MPIMatrixMult(A_p, N, dtype="float32")
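With A and B now built identically everywhere, the root-scatter ceremony above (tagged `Send`/`Recv` pairs plus a barrier) collapses into local slicing: `rank % P_prime` is `my_group` and `rank // P_prime` is `my_layer`, so `rs:re` and `cs:ce` reproduce `row_start:row_end` and `col_start:col_end`. The `.copy()` keeps each block C-contiguous for buffer-based MPI calls inside the operator. Each rank's `A_p @ B_p` is one disjoint tile of `C = A @ B`; the operator then combines tiles (presumably gathering row blocks across `layer_comm`) so that `y` holds all M rows of each layer's column block, which is what the verification below slices out. A toy-size sketch of the tiling:

```python
import math
import numpy as np

# Toy-size check (hypothetical M, K, N) that the per-rank slices tile C = A @ B.
M, K, N, P_prime = 6, 5, 7, 2
A = np.arange(M * K, dtype=np.float32).reshape(M, K)
B = np.arange(K * N, dtype=np.float32).reshape(K, N)
blk_rows = int(math.ceil(M / P_prime))
blk_cols = int(math.ceil(N / P_prime))

C_ref = A @ B
C_tiled = np.zeros_like(C_ref)
for rank in range(P_prime * P_prime):
    rs = (rank % P_prime) * blk_rows
    re = min(M, rs + blk_rows)
    cs = (rank // P_prime) * blk_cols
    ce = min(N, cs + blk_cols)
    A_p, B_p = A[rs:re, :].copy(), B[:, cs:ce].copy()
    C_tiled[rs:re, cs:ce] = A_p @ B_p   # the tile this rank is responsible for
assert np.allclose(C_tiled, C_ref)      # the four tiles cover C exactly once
```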
 col_lens = comm.allgather(my_own_cols)
 total_cols = np.sum(col_lens)
 x = DistributedArray(global_shape=K * total_cols,
                      local_shapes=[K * col_len for col_len in col_lens],
                      partition=Partition.SCATTER,
                      mask=[i % P_prime for i in range(comm.Get_size())],
-                     dtype=np.float32)
+                     base_comm=comm,
+                     dtype="float32")
 x[:] = B_p.flatten()
 y = Aop @ x
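`x` stacks the per-rank column blocks of B as one distributed 1D vector: `local_shapes` records every rank's `K * my_own_cols` so the global layout stays consistent, and `base_comm` is now passed explicitly. The `mask` value `i % P_prime` equals `my_group`, and each mask subset (for example ranks 0 and 2 on four ranks) holds one complete copy of B's columns between them; I read it as letting pylops-mpi account for that replication in reductions, but the exact semantics are worth checking against the DistributedArray docs. Locally the packing is plain row-major flattening:

```python
import numpy as np

# Row-major packing round trip: what `x[:] = B_p.flatten()` stores locally
# and how a (K, my_own_cols) block is recovered from it (toy sizes).
K, my_own_cols = 5, 4
B_p = np.arange(K * my_own_cols, dtype=np.float32).reshape(K, my_own_cols)

x_local = B_p.flatten()                   # 1D buffer, length K * my_own_cols
assert np.array_equal(x_local.reshape(K, my_own_cols), B_p)
# y.local_array comes back the same way: a flattened (M, my_own_cols) block,
# which is why the verification below flattens y_loc[:, col_start:col_end].
```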

 # ======================= VERIFICATION =======================
-A = np.arange(M * K).reshape(M, K).astype(np.float32)
-B = np.arange(K * N).reshape(K, N).astype(np.float32)
-C_true = A @ B
-Z_true = (A.T.dot(C_true.conj())).conj()
+y_loc = A @ B
+xadj_loc = (A.T.dot(y_loc.conj())).conj()

-col_start = my_layer * blk_cols  # note: same my_group index on cols
-col_end = min(N, col_start + blk_cols)
-my_own_cols = col_end - col_start
-expected_y = C_true[:, col_start:col_end].flatten()

-xadj = Aop.H @ y
+expected_y_loc = y_loc[:, col_start:col_end].flatten().astype(np.float32)
+expected_xadj_loc = xadj_loc[:, col_start:col_end].flatten().astype(np.float32)

-if not np.allclose(y.local_array, expected_y, atol=1e-6, rtol=1e-14):
+xadj = Aop.H @ y
+if not np.allclose(y.local_array, expected_y_loc, rtol=1e-6):
     print(f"RANK {rank}: FORWARD VERIFICATION FAILED")
-    print(f'{rank} local: {y.local_array}, expected: {C_true[:, col_start:col_end]}')
+    print(f'{rank} local: {y.local_array}, expected: {y_loc[:, col_start:col_end]}')
 else:
     print(f"RANK {rank}: FORWARD VERIFICATION PASSED")
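The tolerance swap is the right call for float32: the old pair `atol=1e-6, rtol=1e-14` is effectively an absolute test (1e-14 sits seven orders of magnitude below single-precision eps, about 1.2e-7), so entries of even moderate magnitude fail on ordinary rounding noise, while `rtol=1e-6` scales with the data:

```python
import numpy as np

eps = np.finfo(np.float32).eps                   # ~1.19e-07
a = np.float32(1.0e4)
b = a * (np.float32(1) + 8 * eps)                # a few ulps of rounding error
print(np.allclose(a, b, atol=1e-6, rtol=1e-14))  # False: old, absolute-ish test
print(np.allclose(a, b, rtol=1e-6))              # True: new, scales with |b|
```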

-expected_z = Z_true[:, col_start:col_end].flatten()
-if not np.allclose(xadj.local_array, expected_z, atol=1e-6, rtol=1e-14):
+if not np.allclose(xadj.local_array, expected_xadj_loc, rtol=1e-6):
     print(f"RANK {rank}: ADJOINT VERIFICATION FAILED")
-    print(f'{rank} local: {xadj.local_array}, expected: {Z_true[:, col_start:col_end]}')
+    print(f'{rank} local: {xadj.local_array}, expected: {xadj_loc[:, col_start:col_end]}')
 else:
     print(f"RANK {rank}: ADJOINT VERIFICATION PASSED")
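Comparing against the dense `A @ B` on every rank works here because the operands are replicated, but the self-contained way to validate a forward/adjoint pair is the dot test, checking that <A u, v> equals <u, A^H v> for random u and v, the identity that `pylops.utils.dottest` automates for serial operators. A plain NumPy sketch of the identity being checked:

```python
import numpy as np

rng = np.random.default_rng(42)
A = rng.standard_normal((8, 5)).astype(np.float32)
u = rng.standard_normal(5).astype(np.float32)   # random model-space vector
v = rng.standard_normal(8).astype(np.float32)   # random data-space vector

lhs = np.dot(A @ u, v)            # <A u, v>
rhs = np.dot(u, A.conj().T @ v)   # <u, A^H v>
assert np.isclose(lhs, rhs, rtol=1e-5)
```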