
Commit 66e3b16

ensure unique gpu device for each mpi rank in CuPy MPI tests
1 parent 33121a5 commit 66e3b16

11 files changed: +89 −9 lines
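For context, here is a minimal standalone sketch of the rank-to-GPU pinning pattern this commit adds to each test module. It is an illustration, not the repository's exact code: the tests use their own `backend`/`np` aliases, and `OMPI_COMM_WORLD_LOCAL_RANK` is only exported by Open MPI launchers, hence the fallback to rank modulo device count.

# Sketch: give every MPI rank its own GPU before any CuPy allocations happen.
# Assumes mpi4py and CuPy are installed; run with e.g. `mpirun -n 2 python script.py`.
import os

import cupy as cp
from mpi4py import MPI

rank = MPI.COMM_WORLD.Get_rank()
device_count = cp.cuda.runtime.getDeviceCount()
# Prefer the local rank exported by the MPI launcher; otherwise round-robin
# ranks over the visible devices so ranks do not all pile onto GPU 0.
device_id = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") or rank % device_count)
cp.cuda.Device(device_id).use()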

pylops_mpi/DistributedArray.py

Lines changed: 1 addition & 1 deletion

@@ -697,7 +697,7 @@ def _compute_vector_norm(self, local_array: NDArray,
     # TODO (tharitt): currently CuPy + MPI does not work well with buffered communication, particularly
     # with MAX, MIN operator. Here we copy the array back to CPU, transfer, and copy them back to GPUs
     send_buf = ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64)
-    if self.engine=="cupy" and self.base_comm_nccl is None:
+    if self.engine == "cupy" and self.base_comm_nccl is None:
         recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MAX)
         recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
     else:
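The TODO touched here describes staging the reduction through host memory, since buffered MPI communication of CuPy arrays misbehaves for MAX/MIN. A minimal sketch of that host-staged pattern follows (hedged illustration only, not the pylops-mpi API; assumes mpi4py and CuPy):

# Host-staged max-allreduce of CuPy data: copy to CPU, reduce with MPI,
# then move the result back to the GPU.
import cupy as cp
from mpi4py import MPI

comm = MPI.COMM_WORLD
local = cp.random.rand(1000)                   # per-rank data on the GPU
send = float(cp.abs(local).max())              # device -> host scalar
global_max = comm.allreduce(send, op=MPI.MAX)  # CPU-side reduction
result = cp.asarray(global_max)                # host -> device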

tests/test_blockdiag.py

Lines changed: 8 additions & 0 deletions

@@ -27,6 +27,14 @@
 par2j = {'ny': 301, 'nx': 101, 'dtype': np.complex128}

 np.random.seed(42)
+rank = MPI.COMM_WORLD.Get_rank()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()


 @pytest.mark.mpi(min_size=2)

tests/test_derivative.py

Lines changed: 8 additions & 0 deletions

@@ -25,6 +25,14 @@
 np.random.seed(42)
 rank = MPI.COMM_WORLD.Get_rank()
 size = MPI.COMM_WORLD.Get_size()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()
+

 par1 = {
     "nz": 600,

tests/test_distributedarray.py

Lines changed: 9 additions & 1 deletion

@@ -21,6 +21,14 @@
 from pylops_mpi.DistributedArray import local_split

 np.random.seed(42)
+rank = MPI.COMM_WORLD.Get_rank()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()

 par1 = {'global_shape': (500, 501),
         'partition': Partition.SCATTER, 'dtype': np.float64,

@@ -206,7 +214,7 @@ def test_distributed_norm(par):

     # TODO (tharitt): FAIL with CuPy + MPI for inf norm
     assert_allclose(arr.norm(ord=np.inf, axis=par['axis']),
-        np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14)
+                    np.linalg.norm(par['x'], ord=np.inf, axis=par['axis']), rtol=1e-14)
     assert_allclose(arr.norm(), np.linalg.norm(par['x'].flatten()), rtol=1e-13)
tests/test_fredholm.py

Lines changed: 7 additions & 0 deletions

@@ -29,6 +29,13 @@
 np.random.seed(42)
 rank = MPI.COMM_WORLD.Get_rank()
 size = MPI.COMM_WORLD.Get_size()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()

 par1 = {
     "nsl": 21,

tests/test_linearop.py

Lines changed: 9 additions & 2 deletions

@@ -31,6 +31,13 @@
 np.random.seed(42)
 rank = MPI.COMM_WORLD.Get_rank()
 size = MPI.COMM_WORLD.Get_size()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()

 par1 = {'ny': 101, 'nx': 101, 'dtype': np.float64}
 par1j = {'ny': 101, 'nx': 101, 'dtype': np.complex128}

@@ -142,7 +149,7 @@ def test_power(par):
     Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype']),
                            dtype=par['dtype'])
     BDiag_MPI = MPIBlockDiag(ops=[Op, ])
-
+
     # Power Operator
     Pop_MPI = BDiag_MPI ** 3

@@ -166,7 +173,7 @@
     ops = [pylops.MatrixMult((i + 1) * np.ones(shape=(par['ny'], par['nx'])).astype(par['dtype'])) for i in
            range(size)]
     BDiag = pylops.BlockDiag(ops=ops)
-    Pop = BDiag * BDiag * BDiag ## temporarely replaced BDiag ** 3 until bug in PyLops is fixed
+    Pop = BDiag * BDiag * BDiag  # temporarely replaced BDiag ** 3 until bug in PyLops is fixed
     assert_allclose(Pop_x_np, Pop @ x_global, rtol=1e-9)
     assert_allclose(Pop_y_np, Pop.H @ y_global, rtol=1e-9)

tests/test_matrixmult.py

Lines changed: 13 additions & 5 deletions

@@ -19,14 +19,21 @@
 from mpi4py import MPI
 import pytest

-from pylops.basicoperators import FirstDerivative, Identity
+from pylops.basicoperators import FirstDerivative
 from pylops_mpi import DistributedArray, Partition
 from pylops_mpi.basicoperators import MPIMatrixMult, MPIBlockDiag

 np.random.seed(42)
 base_comm = MPI.COMM_WORLD
 size = base_comm.Get_size()
-
+rank = MPI.COMM_WORLD.Get_rank()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()
 # Define test cases: (N, K, M, dtype_str)
 # M, K, N are matrix dimensions A(N,K), B(K,M)
 # P_prime will be ceil(sqrt(size)).

@@ -39,6 +46,7 @@
     pytest.param(2, 1, 3, "float32", id="f32_2_1_3",),
 ]

+
 def _reorganize_local_matrix(x_dist, N, M, blk_cols, p_prime):
     """Re-organize distributed array in local matrix
     """

@@ -66,9 +74,9 @@ def test_MPIMatrixMult(N, K, M, dtype_str):
     cmplx = 1j if np.issubdtype(dtype, np.complexfloating) else 0
     base_float_dtype = np.float32 if dtype == np.complex64 else np.float64

-    comm, rank, row_id, col_id, is_active = \
-        MPIMatrixMult.active_grid_comm(base_comm, N, M)
-    if not is_active: return
+    comm, rank, row_id, col_id, is_active = MPIMatrixMult.active_grid_comm(base_comm, N, M)
+    if not is_active:
+        return

     size = comm.Get_size()
     p_prime = math.isqrt(size)

tests/test_solver.py

Lines changed: 7 additions & 0 deletions

@@ -38,6 +38,13 @@

 size = MPI.COMM_WORLD.Get_size()
 rank = MPI.COMM_WORLD.Get_rank()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()

 par1 = {
     "ny": 11,

tests/test_stack.py

Lines changed: 9 additions & 0 deletions

@@ -21,6 +21,15 @@
 import pylops_mpi
 from pylops_mpi.utils.dottest import dottest

+rank = MPI.COMM_WORLD.Get_rank()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()
+
 par1 = {'ny': 101, 'nx': 101, 'imag': 0, 'dtype': np.float64}
 par1j = {'ny': 101, 'nx': 101, 'imag': 1j, 'dtype': np.complex128}
 par2 = {'ny': 301, 'nx': 101, 'imag': 0, 'dtype': np.float64}

tests/test_stackedarray.py

Lines changed: 10 additions & 0 deletions

@@ -17,9 +17,19 @@
 import numpy as npp
 import pytest

+from mpi4py import MPI
 from pylops_mpi import DistributedArray, Partition, StackedDistributedArray

 np.random.seed(42)
+rank = MPI.COMM_WORLD.Get_rank()
+if backend == "cupy":
+    device_count = np.cuda.runtime.getDeviceCount()
+    device_id = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
+        or rank % np.cuda.runtime.getDeviceCount()
+    )
+    np.cuda.Device(device_id).use()
+

 par1 = {'global_shape': (500, 501),
         'partition': Partition.SCATTER, 'dtype': np.float64,
