
Commit 9aedd7c

MatrixMult works with a non-square number of processes by creating a square subcommunicator
1 parent 053e52d commit 9aedd7c
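
In brief: when the number of MPI ranks is not a perfect square, the operator now carves out the largest square process grid that fits and lets the remaining ranks sit idle. The snippet below is a minimal standalone sketch of that selection, mirroring the active_grid_comm helper added in this commit (the real method additionally caps the grid at min(N, M)); it is illustrative only, not the library code.

import math
from mpi4py import MPI

base_comm = MPI.COMM_WORLD
rank, size = base_comm.Get_rank(), base_comm.Get_size()

p_prime = math.isqrt(size)        # side of the largest square grid that fits
row, col = divmod(rank, p_prime)  # row-major position on that grid
is_active = row < p_prime and col < p_prime

if is_active:
    # Only the ranks inside the p_prime x p_prime grid build the sub-communicator;
    # MPI.Comm.Create_group is collective over the group members only.
    active_ranks = list(range(p_prime * p_prime))
    group = base_comm.Get_group().Incl(active_ranks)
    square_comm = base_comm.Create_group(group)
    print(f"rank {rank} -> grid ({row}, {col}) of {p_prime} x {p_prime}")
else:
    print(f"rank {rank} is idle (outside the square grid)")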

File tree: 3 files changed (+99, -62 lines)

3 files changed

+99
-62
lines changed

examples/plot_matrixmult.py

Lines changed: 23 additions & 30 deletions
@@ -35,20 +35,6 @@
 # filled with data that is appropriate for the use-case.
 np.random.seed(42)
 
-###############################################################################
-# Next we obtain the MPI parameters for each rank and check that the number
-# of processes (``size``) is a square number
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()  # rank of current process
-size = comm.Get_size()  # number of processes
-
-p_prime = math.isqrt(size)
-repl_factor = p_prime
-
-if (p_prime * repl_factor) != size:
-    print(f"Number of processes must be a square number, provided {size} instead...")
-    exit(-1)
-
 ###############################################################################
 # We are now ready to create the input matrices :math:`\mathbf{A}` of size
 # :math:`N \times K` and :math:`\mathbf{X}` of size :math:`K \times M`
@@ -93,12 +79,15 @@
 # └────────────┴────────────┘
 # </div>
 
-my_col = rank % p_prime
-my_row = rank // p_prime
+base_comm = MPI.COMM_WORLD
+comm, rank, row_id, col_id, is_active = MPIMatrixMult.active_grid_comm(base_comm, N, M)
+print(f"Process {base_comm.Get_rank()} is {'active' if is_active else 'inactive'}")
+if not is_active: exit(0)
+p_prime = math.isqrt(comm.Get_size())
 
 # Create sub-communicators
-row_comm = comm.Split(color=my_row, key=my_col)  # all procs in same row
-col_comm = comm.Split(color=my_col, key=my_row)  # all procs in same col
+row_comm = comm.Split(color=row_id, key=col_id)  # all procs in same row
+col_comm = comm.Split(color=col_id, key=row_id)  # all procs in same col
 
 ################################################################################
 # At this point we divide the rows and columns of :math:`\mathbf{A}` and
@@ -136,20 +125,20 @@
 blk_rows = int(math.ceil(N / p_prime))
 blk_cols = int(math.ceil(M / p_prime))
 
-rs = my_col * blk_rows
+rs = col_id * blk_rows
 re = min(N, rs + blk_rows)
-my_own_rows = re - rs
+my_own_rows = max(0, re - rs)
 
-cs = my_row * blk_cols
+cs = row_id * blk_cols
 ce = min(M, cs + blk_cols)
-my_own_cols = ce - cs
+my_own_cols = max(0, ce - cs)
 
 A_p, X_p = A[rs:re, :].copy(), X[:, cs:ce].copy()
 
 ################################################################################
 # We are now ready to create the :py:class:`pylops_mpi.basicoperators.MPIMatrixMult`
 # operator and the input matrix :math:`\mathbf{X}`
-Aop = MPIMatrixMult(A_p, M, dtype="float32")
+Aop = MPIMatrixMult(A_p, M, base_comm=comm, dtype="float32")
 
 col_lens = comm.allgather(my_own_cols)
 total_cols = np.sum(col_lens)
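
Why the new max(0, ...) guards (and the empty-block checks in the hunks below) are needed: when the square grid has more rows or columns than the matrix, some ranks own zero rows or columns, and the naive end-minus-start difference can even go negative. A small sketch with hypothetical sizes N=6, M=5 on a 4 x 4 grid, evaluating the same block formulas as above:

import math

N, M, p_prime = 6, 5, 4                  # hypothetical sizes, for illustration only

blk_rows = int(math.ceil(N / p_prime))   # 2
blk_cols = int(math.ceil(M / p_prime))   # 2

for row_id in range(p_prime):
    for col_id in range(p_prime):
        rs, cs = col_id * blk_rows, row_id * blk_cols
        re, ce = min(N, rs + blk_rows), min(M, cs + blk_cols)
        # Without max(0, ...), row_id == 3 would report -1 columns (cs=6 > M=5).
        my_own_rows = max(0, re - rs)
        my_own_cols = max(0, ce - cs)
        print(f"grid ({row_id}, {col_id}): rows {rs}:{re} -> {my_own_rows}, "
              f"cols {cs}:{ce} -> {my_own_cols}")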
@@ -188,9 +177,11 @@
 offset = 0
 for cnt in col_counts:
     block_size = N * cnt
-    y_blocks.append(
-        y[offset: offset + block_size].reshape(N, cnt)
-    )
+    y_block = y[offset: offset + block_size]
+    if len(y_block) != 0:
+        y_blocks.append(
+            y_block.reshape(N, cnt)
+        )
     offset += block_size
 y = np.hstack(y_blocks)
 
@@ -199,13 +190,15 @@
 offset = 0
 for cnt in col_counts:
     block_size = K * cnt
-    xadj_blocks.append(
-        xadj[offset: offset + block_size].reshape(K, cnt)
-    )
+    xadj_blk = xadj[offset: offset + block_size]
+    if len(xadj_blk) != 0:
+        xadj_blocks.append(
+            xadj_blk.reshape(K, cnt)
+        )
     offset += block_size
 xadj = np.hstack(xadj_blocks)
 
-if rank == 0:
+if comm.Get_rank() == 0:
     y_loc = (A @ X).squeeze()
     xadj_loc = (A.T.dot(y_loc.conj())).conj().squeeze()
 
pylops_mpi/basicoperators/MatrixMult.py

Lines changed: 57 additions & 6 deletions
@@ -154,7 +154,7 @@ def __init__(
         self._col_start = self._row_id * block_cols
         self._col_end = min(self.M, self._col_start + block_cols)
 
-        self._local_ncols = self._col_end - self._col_start
+        self._local_ncols = max(0, self._col_end - self._col_start)
         self._rank_col_lens = self.base_comm.allgather(self._local_ncols)
         total_ncols = np.sum(self._rank_col_lens)
 
@@ -168,11 +168,14 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
         if x.partition != Partition.SCATTER:
             raise ValueError(f"x should have partition={Partition.SCATTER} Got {x.partition} instead...")
 
-        y = DistributedArray(global_shape=(self.N * self.dimsd[1]),
-                             local_shapes=[(self.N * c) for c in self._rank_col_lens],
-                             mask=x.mask,
-                             partition=Partition.SCATTER,
-                             dtype=self.dtype)
+        y = DistributedArray(
+            global_shape=(self.N * self.dimsd[1]),
+            local_shapes=[(self.N * c) for c in self._rank_col_lens],
+            mask=x.mask,
+            partition=Partition.SCATTER,
+            dtype=self.dtype,
+            base_comm=self.base_comm
+        )
 
         my_own_cols = self._rank_col_lens[self.rank]
         x_arr = x.local_array.reshape((self.dims[0], my_own_cols))
@@ -185,6 +188,53 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
         y[:] = Y_local.flatten()
         return y
 
+    @staticmethod
+    def active_grid_comm(base_comm: MPI.Comm, N: int, M: int):
+        """
+        Configure a square process grid from a parent MPI communicator and select the subset of "active" processes.
+
+        Each process in base_comm is assigned to a logical 2D grid of size p_prime x p_prime,
+        where p_prime = floor(sqrt(total_ranks)). Only the first `active_dim x active_dim` processes
+        (by row-major order) are considered "active". Inactive ranks return immediately with no new communicator.
+
+        Parameters
+        ----------
+        base_comm : MPI.Comm
+            The parent communicator (e.g., MPI.COMM_WORLD).
+        N : int
+            Number of rows of the global data domain.
+        M : int
+            Number of columns of the global data domain.
+
+        Returns
+        -------
+        tuple:
+            comm (MPI.Comm or None) : Sub-communicator including only active ranks.
+            rank (int) : Rank within the new sub-communicator (or original rank if inactive).
+            row (int) : Grid row index of this process in the active grid (or original rank if inactive).
+            col (int) : Grid column index of this process in the active grid (or original rank if inactive).
+            is_active (bool) : Flag indicating whether this rank is in the active sub-grid.
+        """
+        rank = base_comm.Get_rank()
+        size = base_comm.Get_size()
+        p_prime = math.isqrt(size)
+        row, col = divmod(rank, p_prime)
+        active_dim = min(N, M, p_prime)
+        is_active = (row < active_dim and col < active_dim)
+
+        if not is_active:
+            return None, rank, row, col, False
+
+        active_ranks = [r for r in range(size)
+                        if (r // p_prime) < active_dim and (r % p_prime) < active_dim]
+        new_group = base_comm.Get_group().Incl(active_ranks)
+        new_comm = base_comm.Create_group(new_group)
+
+        p_prime_new = math.isqrt(len(active_ranks))
+        new_rank = new_comm.Get_rank()
+        new_row, new_col = divmod(new_rank, p_prime_new)
+        return new_comm, new_rank, new_row, new_col, True
+
     def _rmatvec(self, x: DistributedArray) -> DistributedArray:
         ncp = get_module(x.engine)
         if x.partition != Partition.SCATTER:
@@ -196,6 +246,7 @@ def _rmatvec(self, x: DistributedArray) -> DistributedArray:
             mask=x.mask,
             partition=Partition.SCATTER,
             dtype=self.dtype,
+            base_comm=self.base_comm
         )
 
         x_arr = x.local_array.reshape((self.N, self._local_ncols)).astype(self.dtype)
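
For context, the example and test updated in this commit drive the new static method roughly as follows. This is a trimmed sketch only: the global sizes N and M are placeholder values here, and the real example goes on to build MPIMatrixMult(A_p, M, base_comm=comm, ...) on the active ranks.

import math
import sys

from mpi4py import MPI
from pylops_mpi.basicoperators.MatrixMult import MPIMatrixMult

N, M = 8, 5                      # hypothetical global matrix dimensions
base_comm = MPI.COMM_WORLD

# Select the square sub-grid; inactive ranks simply stop here.
comm, rank, row_id, col_id, is_active = MPIMatrixMult.active_grid_comm(base_comm, N, M)
print(f"Process {base_comm.Get_rank()} is {'active' if is_active else 'inactive'}")
if not is_active:
    sys.exit(0)

p_prime = math.isqrt(comm.Get_size())
# Row/column communicators over the active square grid only.
row_comm = comm.Split(color=row_id, key=col_id)  # all procs in same row
col_comm = comm.Split(color=col_id, key=row_id)  # all procs in same col
print(f"rank {rank}: grid position ({row_id}, {col_id}) on a {p_prime} x {p_prime} grid")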

tests/test_matrixmult.py

Lines changed: 19 additions & 26 deletions
@@ -12,9 +12,7 @@
 from pylops_mpi.basicoperators.MatrixMult import MPIMatrixMult
 
 np.random.seed(42)
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()
-size = comm.Get_size()
+base_comm = MPI.COMM_WORLD
 
 # Define test cases: (N, K, M, dtype_str)
 # M, K, N are matrix dimensions A(N,K), B(K,M)
@@ -32,31 +30,25 @@
 @pytest.mark.mpi(min_size=1)
 @pytest.mark.parametrize("M, K, N, dtype_str", test_params)
 def test_SUMMAMatrixMult(N, K, M, dtype_str):
-    p_prime = math.isqrt(size)
-    C = p_prime
-    if p_prime * C != size:
-        pytest.skip("Number of processes must be a square number, "
-                    "provided {size} instead...")
-
     dtype = np.dtype(dtype_str)
 
     cmplx = 1j if np.issubdtype(dtype, np.complexfloating) else 0
     base_float_dtype = np.float32 if dtype == np.complex64 else np.float64
 
-    my_col = rank % p_prime
-    my_row = rank // p_prime
+    comm, rank, row_id, col_id, is_active = MPIMatrixMult.active_grid_comm(base_comm, N, M)
+    print(f"Process {base_comm.Get_rank()} is {'active' if is_active else 'inactive'}")
+    if not is_active: return
 
-    # Create sub-communicators
-    row_comm = comm.Split(color=my_row, key=my_col)
-    col_comm = comm.Split(color=my_col, key=my_row)
+    size = comm.Get_size()
+    p_prime = math.isqrt(size)
 
     # Calculate local matrix dimensions
     blk_rows_A = int(math.ceil(N / p_prime))
-    row_start_A = my_col * blk_rows_A
+    row_start_A = col_id * blk_rows_A
     row_end_A = min(N, row_start_A + blk_rows_A)
 
     blk_cols_X = int(math.ceil(M / p_prime))
-    col_start_X = my_row * blk_cols_X
+    col_start_X = row_id * blk_cols_X
     col_end_X = min(M, col_start_X + blk_cols_X)
     local_col_X_len = max(0, col_end_X - col_start_X)
 
@@ -102,9 +94,11 @@ def test_SUMMAMatrixMult(N, K, M, dtype_str):
     offset = 0
     for cnt in col_counts:
         block_size = N * cnt
-        y_blocks.append(
-            y[offset: offset + block_size].reshape(N, cnt)
-        )
+        y_block = y[offset: offset + block_size]
+        if len(y_block) != 0:
+            y_blocks.append(
+                y_block.reshape(N, cnt)
+            )
         offset += block_size
     y = np.hstack(y_blocks)
 
@@ -113,9 +107,11 @@ def test_SUMMAMatrixMult(N, K, M, dtype_str):
     offset = 0
     for cnt in col_counts:
         block_size = K * cnt
-        xadj_blocks.append(
-            xadj[offset: offset + block_size].reshape(K, cnt)
-        )
+        xadj_blk = xadj[offset: offset + block_size]
+        if len(xadj_blk) != 0:
+            xadj_blocks.append(
+                xadj_blk.reshape(K, cnt)
+            )
         offset += block_size
     xadj = np.hstack(xadj_blocks)
 
@@ -134,7 +130,4 @@ def test_SUMMAMatrixMult(N, K, M, dtype_str):
         xadj_loc.squeeze(),
         rtol=np.finfo(np.dtype(dtype)).resolution,
         err_msg=f"Rank {rank}: Adjoint verification failed."
-    )
-
-    col_comm.Free()
-    row_comm.Free()
+    )
