From aee9adde7c80a0f89e3f1ad424d9a2e766ad47d1 Mon Sep 17 00:00:00 2001 From: mrava87 Date: Tue, 1 Jul 2025 08:37:05 +0000 Subject: [PATCH 1/5] test: fix tests not working with high nranks and change GA to test that --- .github/workflows/build.yml | 2 +- Makefile | 2 +- tests/test_distributedarray.py | 45 +++++++++++++++++++++++++++------- tests/test_fredholm.py | 12 ++++----- tests/test_solver.py | 13 +++++++++- 5 files changed, 56 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4de6df86..55eecd5e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,7 @@ jobs: os: [ubuntu-latest, macos-latest] python-version: ['3.10', '3.11', '3.12', '3.13'] mpi: ['mpich', 'openmpi', 'intelmpi'] - rank: ['2', '3', '4'] + rank: ['2', '4', '9'] exclude: - os: macos-latest mpi: 'intelmpi' diff --git a/Makefile b/Makefile index 7d866764..d2715edd 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ PIP := $(shell command -v pip3 2> /dev/null || command which pip 2> /dev/null) PYTHON := $(shell command -v python3 2> /dev/null || command which python 2> /dev/null) -NUM_PROCESSES = 3 +NUM_PROCESSES = 4 .PHONY: install dev-install dev-install_nccl install_ \ conda install_conda_nccl dev-install_conda dev-install_conda_nccl \ diff --git a/tests/test_distributedarray.py b/tests/test_distributedarray.py index bc3d7a32..34944b06 100644 --- a/tests/test_distributedarray.py +++ b/tests/test_distributedarray.py @@ -199,12 +199,21 @@ def test_distributed_norm(par): def test_distributed_masked(par): """Test Asarray with masked array""" # Number of subcommunicators - if MPI.COMM_WORLD.Get_size() % 2 == 0: + size = MPI.COMM_WORLD.Get_size() + + # Exclude not handled cases + shape_axis = par['x'].shape[par['axis']] + print('shape_axis, size', shape_axis, size, shape_axis % size != 0) + if shape_axis % size != 0: + pytest.skip(f"Array dimension to distributed ({shape_axis}) is not " + f"divisible by the number of processes ({size})...") + if size % 2 == 0: nsub = 2 - elif MPI.COMM_WORLD.Get_size() % 3 == 0: + elif size % 3 == 0: nsub = 3 else: - pass + pytest.skip(f"Number of processes ({size}) is not divisible " + "by 2 or 3...") subsize = max(1, MPI.COMM_WORLD.Get_size() // nsub) mask = np.repeat(np.arange(nsub), subsize) @@ -236,12 +245,21 @@ def test_distributed_masked(par): def test_distributed_maskeddot(par1, par2): """Test Distributed Dot product with masked array""" # Number of subcommunicators - if MPI.COMM_WORLD.Get_size() % 2 == 0: + size = MPI.COMM_WORLD.Get_size() + + # Exclude not handled cases + shape_axis = par1['x'].shape[par1['axis']] + print('shape_axis, size', shape_axis, size, shape_axis % size != 0) + if shape_axis % size != 0: + pytest.skip(f"Array dimension to distributed ({shape_axis}) is not " + f"divisible by the number of processes ({size})...") + if size % 2 == 0: nsub = 2 - elif MPI.COMM_WORLD.Get_size() % 3 == 0: + elif size % 3 == 0: nsub = 3 else: - pass + pytest.skip(f"Number of processes ({size}) is not divisible " + "by 2 or 3...") subsize = max(1, MPI.COMM_WORLD.Get_size() // nsub) mask = np.repeat(np.arange(nsub), subsize) @@ -271,12 +289,21 @@ def test_distributed_maskeddot(par1, par2): def test_distributed_maskednorm(par): """Test Distributed numpy.linalg.norm method with masked array""" # Number of subcommunicators - if MPI.COMM_WORLD.Get_size() % 2 == 0: + size = MPI.COMM_WORLD.Get_size() + + # Exclude not handled cases + shape_axis = par['x'].shape[par['axis']] + print('shape_axis, size', shape_axis, size, shape_axis % size != 0) + if shape_axis % size != 0: + pytest.skip(f"Array dimension to distributed ({shape_axis}) is not " + f"divisible by the number of processes ({size})...") + if size % 2 == 0: nsub = 2 - elif MPI.COMM_WORLD.Get_size() % 3 == 0: + elif size % 3 == 0: nsub = 3 else: - pass + pytest.skip(f"Number of processes ({size}) is not divisible " + "by 2 or 3...") subsize = max(1, MPI.COMM_WORLD.Get_size() // nsub) mask = np.repeat(np.arange(nsub), subsize) # Replicate x as required in masked arrays diff --git a/tests/test_fredholm.py b/tests/test_fredholm.py index 3d45a4c6..95bd5468 100644 --- a/tests/test_fredholm.py +++ b/tests/test_fredholm.py @@ -20,7 +20,7 @@ size = MPI.COMM_WORLD.Get_size() par1 = { - "nsl": 12, + "nsl": 21, "ny": 6, "nx": 4, "nz": 5, @@ -30,7 +30,7 @@ "dtype": "float32", } # real, saved Gt par2 = { - "nsl": 12, + "nsl": 21, "ny": 6, "nx": 4, "nz": 5, @@ -40,7 +40,7 @@ "dtype": "float32", } # real, unsaved Gt par3 = { - "nsl": 12, + "nsl": 21, "ny": 6, "nx": 4, "nz": 5, @@ -50,7 +50,7 @@ "dtype": "complex64", } # complex, saved Gt par4 = { - "nsl": 12, + "nsl": 21, "ny": 6, "nx": 4, "nz": 5, @@ -60,7 +60,7 @@ "dtype": "complex64", } # complex, unsaved Gt par5 = { - "nsl": 12, + "nsl": 21, "ny": 6, "nx": 4, "nz": 1, @@ -70,7 +70,7 @@ "dtype": "float32", } # real, saved Gt, nz=1 par6 = { - "nsl": 12, + "nsl": 21, "ny": 6, "nx": 4, "nz": 1, diff --git a/tests/test_solver.py b/tests/test_solver.py index 46e9a139..f79e3e61 100644 --- a/tests/test_solver.py +++ b/tests/test_solver.py @@ -26,7 +26,6 @@ StackedDistributedArray ) -np.random.seed(42) size = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() @@ -94,6 +93,8 @@ ) def test_cg(par): """CG with MPIBlockDiag""" + np.random.seed(42) + A = np.ones((par["ny"], par["nx"])) + par[ "imag"] * np.ones((par["ny"], par["nx"])) Aop = MatrixMult(np.conj(A.T) @ A, dtype=par['dtype']) @@ -139,6 +140,8 @@ def test_cg(par): ) def test_cgls(par): """CGLS with MPIBlockDiag""" + np.random.seed(42) + A = np.ones((par["ny"], par["nx"])) + par[ "imag"] * np.ones((par["ny"], par["nx"])) Aop = MatrixMult(np.conj(A.T) @ A + 1e-5 * np.eye(par["nx"], dtype=par['dtype']), @@ -186,6 +189,8 @@ def test_cgls(par): ) def test_cgls_broadcastdata(par): """CGLS with broadcasted data vector""" + np.random.seed(42) + A = (rank + 1) * np.ones((par["ny"], par["nx"])) + (rank + 2) * par[ "imag" ] * np.ones((par["ny"], par["nx"])) @@ -232,6 +237,8 @@ def test_cgls_broadcastdata(par): ) def test_cgls_broadcastmodel(par): """CGLS with broadcasted model vector""" + np.random.seed(42) + A = np.ones((par["ny"], par["nx"])) + par[ "imag"] * np.ones((par["ny"], par["nx"])) Aop = MatrixMult(np.conj(A.T) @ A + 1e-5 * np.eye(par["nx"], dtype=par['dtype']), @@ -281,6 +288,8 @@ def test_cgls_broadcastmodel(par): ) def test_cg_stacked(par): """CG with MPIStackedBlockDiag""" + np.random.seed(42) + A = np.ones((par["ny"], par["nx"])) + par[ "imag"] * np.ones((par["ny"], par["nx"])) Aop = MatrixMult(np.conj(A.T) @ A + 1e-5 * np.eye(par["nx"], dtype=par['dtype']), @@ -344,6 +353,8 @@ def test_cg_stacked(par): ) def test_cgls_stacked(par): """CGLS with MPIStackedBlockDiag""" + np.random.seed(42) + A = np.ones((par["ny"], par["nx"])) + par[ "imag"] * np.ones((par["ny"], par["nx"])) Aop = MatrixMult(np.conj(A.T) @ A + 1e-5 * np.eye(par["nx"], dtype=par['dtype']), From 12d17082c16c3b30fecec17bae6584fe3878cdf4 Mon Sep 17 00:00:00 2001 From: mrava87 Date: Wed, 2 Jul 2025 22:05:56 +0000 Subject: [PATCH 2/5] ci: exclude rank=9 for openmpi as it does not run in CI (works locally) --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 55eecd5e..cb6b2ada 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,6 +20,8 @@ jobs: exclude: - os: macos-latest mpi: 'intelmpi' + - mpi: 'openmpi' + rank: '9' # works locally, not in CI runs-on: ${{ matrix.os }} steps: - name: Checkout From b5402c80869b72c57711cd563dc618e5b527c75d Mon Sep 17 00:00:00 2001 From: rohanbabbar04 Date: Sun, 13 Jul 2025 01:10:23 +0530 Subject: [PATCH 3/5] Add extra args for openmpi to prevent hangs --- .github/workflows/build.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cb6b2ada..46e056a8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -45,4 +45,9 @@ jobs: - name: Install pylops-mpi run: pip install . - name: Testing using pytest-mpi - run: mpiexec -n ${{ matrix.rank }} pytest tests/ --with-mpi + run: | + if [ "${{ matrix.mpi }}" = "openmpi" ]; then + mpiexec --oversubscribe --mca btl ^openib --bind-to none -n ${{ matrix.rank }} pytest tests/ --with-mpi + else + mpiexec -n ${{ matrix.rank }} pytest tests/ --with-mpi + fi From 46c232a675ab4b602afe1c61480f2a2d2cd2d5d4 Mon Sep 17 00:00:00 2001 From: rohanbabbar04 Date: Sun, 13 Jul 2025 01:11:32 +0530 Subject: [PATCH 4/5] Support openmpi for rank=9 --- .github/workflows/build.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 46e056a8..05d503fa 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,8 +20,6 @@ jobs: exclude: - os: macos-latest mpi: 'intelmpi' - - mpi: 'openmpi' - rank: '9' # works locally, not in CI runs-on: ${{ matrix.os }} steps: - name: Checkout From 7e6cbe0843d16ebf51b5358b7efe46c4bc9fc900 Mon Sep 17 00:00:00 2001 From: rohanbabbar04 Date: Mon, 14 Jul 2025 10:48:26 +0530 Subject: [PATCH 5/5] Only push ^openib to not work with Infiniband and minor linting --- .github/workflows/build.yml | 2 +- tests/test_distributedarray.py | 4 ++-- tests/test_solver.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 05d503fa..1f8e9c4f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -45,7 +45,7 @@ jobs: - name: Testing using pytest-mpi run: | if [ "${{ matrix.mpi }}" = "openmpi" ]; then - mpiexec --oversubscribe --mca btl ^openib --bind-to none -n ${{ matrix.rank }} pytest tests/ --with-mpi + mpiexec --mca btl ^openib -n ${{ matrix.rank }} pytest tests/ --with-mpi else mpiexec -n ${{ matrix.rank }} pytest tests/ --with-mpi fi diff --git a/tests/test_distributedarray.py b/tests/test_distributedarray.py index 34944b06..8354c48a 100644 --- a/tests/test_distributedarray.py +++ b/tests/test_distributedarray.py @@ -200,7 +200,7 @@ def test_distributed_masked(par): """Test Asarray with masked array""" # Number of subcommunicators size = MPI.COMM_WORLD.Get_size() - + # Exclude not handled cases shape_axis = par['x'].shape[par['axis']] print('shape_axis, size', shape_axis, size, shape_axis % size != 0) @@ -246,7 +246,7 @@ def test_distributed_maskeddot(par1, par2): """Test Distributed Dot product with masked array""" # Number of subcommunicators size = MPI.COMM_WORLD.Get_size() - + # Exclude not handled cases shape_axis = par1['x'].shape[par1['axis']] print('shape_axis, size', shape_axis, size, shape_axis % size != 0) diff --git a/tests/test_solver.py b/tests/test_solver.py index f79e3e61..baa63bbe 100644 --- a/tests/test_solver.py +++ b/tests/test_solver.py @@ -94,7 +94,7 @@ def test_cg(par): """CG with MPIBlockDiag""" np.random.seed(42) - + A = np.ones((par["ny"], par["nx"])) + par[ "imag"] * np.ones((par["ny"], par["nx"])) Aop = MatrixMult(np.conj(A.T) @ A, dtype=par['dtype'])