Commit 256b98e

Merge branch 'main' into actual-SUMMA
2 parents 25f30bc + 1cacf8b commit 256b98e

16 files changed: +393 −152 lines

CHANGELOG.md

Lines changed: 16 additions & 0 deletions
@@ -1,3 +1,19 @@
+
+# 0.3.0
+* Added `pylops_mpi.basicoperators.MPIMatrixMult` operator.
+* Added NCCL support to all operators in `pylops_mpi.basicoperators`
+  and `pylops_mpi.signalprocessing`.
+* Added `base_comm_nccl` to the constructor of `pylops_mpi.DistributedArray`
+  to enable the NCCL communication backend.
+* Added `pylops_mpi.utils.benchmark` subpackage providing methods
+  to decorate and mark functions / class methods to measure their execution
+  time.
+* Added `pylops_mpi.utils._nccl` subpackage implementing methods
+  for the NCCL communication backend.
+* Added `pylops_mpi.utils.deps` subpackage to safely import `nccl`.
+* Fixed partition in the creation of the output distributed array in
+  `pylops_mpi.signalprocessing.MPIFredholm1`.
+
 # 0.2.0
 - Added support for using CuPy arrays with PyLops-MPI.
 - Introduced the `pylops_mpi.signalprocessing.MPIFredholm1` and `pylops_mpi.waveeqprocessing.MPIMDC` operators.

Makefile

Lines changed: 5 additions & 1 deletion
@@ -29,7 +29,7 @@ dev-install:
 
 dev-install_nccl:
 	make pipcheck
-	$(PIP) install -r requirements-dev.txt && $(PIP) install cupy-cuda12x nvidia-nccl-cu12 $(PIP) install -e .
+	$(PIP) install -r requirements-dev.txt && $(PIP) install cupy-cuda12x nvidia-nccl-cu12 && $(PIP) install -e .
 
 install_conda:
 	conda env create -f environment.yml && conda activate pylops_mpi && pip install .
@@ -49,6 +49,10 @@ lint:
 tests:
 	mpiexec -n $(NUM_PROCESSES) pytest tests/ --with-mpi
 
+# assuming NUM_PROCESSES <= number of gpus available
+tests_gpu:
+	export TEST_CUPY_PYLOPS=1 && mpiexec -n $(NUM_PROCESSES) pytest tests/ --with-mpi
+
 # assuming NUM_PROCESSES <= number of gpus available
 tests_nccl:
 	mpiexec -n $(NUM_PROCESSES) pytest tests_nccl/ --with-mpi
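
The new tests_gpu target reuses the CPU test suite as-is: it exports TEST_CUPY_PYLOPS=1 on the same shell line as mpiexec, and each test module inspects that variable at import time to choose between NumPy and CuPy (the tests/test_blockdiag.py diff below shows the full pattern). A minimal sketch of the gate, including the alias trick that keeps the test bodies backend-agnostic:

    import os

    # Any non-zero value of TEST_CUPY_PYLOPS selects the CuPy backend.
    if int(os.environ.get("TEST_CUPY_PYLOPS", 0)):
        import cupy as np  # alias CuPy as np so the test bodies stay unchanged
        backend = "cupy"
    else:
        import numpy as np
        backend = "numpy"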

docs/source/changelog.rst

Lines changed: 22 additions & 0 deletions
@@ -3,6 +3,27 @@
 Changelog
 =========
 
+
+Version 0.3.0
+-------------
+
+*Released on: 05/08/2025*
+
+* Added :class:`pylops_mpi.basicoperators.MPIMatrixMult` operator.
+* Added NCCL support to all operators in :mod:`pylops_mpi.basicoperators`
+  and :mod:`pylops_mpi.signalprocessing`.
+* Added ``base_comm_nccl`` to the constructor of :class:`pylops_mpi.DistributedArray`
+  to enable the NCCL communication backend.
+* Added :mod:`pylops_mpi.utils.benchmark` subpackage providing methods
+  to decorate and mark functions / class methods to measure their execution
+  time.
+* Added :mod:`pylops_mpi.utils._nccl` subpackage implementing methods
+  for the NCCL communication backend.
+* Added :mod:`pylops_mpi.utils.deps` subpackage to safely import ``nccl``.
+* Fixed partition in the creation of the output distributed array in
+  :class:`pylops_mpi.signalprocessing.MPIFredholm1`.
+
+
 Version 0.2.0
 -------------
 
@@ -14,6 +35,7 @@ Version 0.2.0
 * Added a dottest function to perform dot tests on PyLops-MPI operators.
 * Created a tutorial for Multi-Dimensional Deconvolution (MDD).
 
+
 Version 0.1.0
 -------------
 
docs/source/contributing.rst

Lines changed: 17 additions & 2 deletions
@@ -69,6 +69,18 @@ that both the old and new tests pass successfully:
 
    >> make tests
 
+If you run PyLops-MPI with GPUs, you may also do:
+
+.. code-block:: bash
+
+   >> make tests_gpu
+
+Additionally, if you have an NCCL-enabled environment, you may also check:
+
+.. code-block:: bash
+
+   >> make tests_nccl
+
 4. Make sure the ``examples`` python scripts are executed using 3 processes without any errors:
 
 .. code-block:: bash
@@ -123,8 +135,11 @@ Project structure
 This repository is organized as follows:
 
 * **pylops_mpi**: Python library containing various mpi linear operators.
-* **tests**: Set of tests using pytest-mpi.
+* **tests**: Set of tests using pytest-mpi for both CPU and GPU.
+* **tests_nccl**: Set of tests for NCCL-enabled environments using pytest-mpi.
 * **testdata**: Sample datasets used in tests and documentation.
 * **docs**: Sphinx documentation.
 * **examples**: Set of python script examples for each mpi linear operator to be embedded in documentation using sphinx-gallery.
-* **tutorials**: Set of python script tutorials to be embedded in documentation using sphinx-gallery.
+* **tutorials**: Set of python script tutorials (NumPy & MPI) to be embedded in documentation using sphinx-gallery.
+* **tutorials_cupy**: Same set of scripts as above but with CuPy & MPI.
+* **tutorials_nccl**: Same set of scripts as above but with CuPy & NCCL.

pylops_mpi/DistributedArray.py

Lines changed: 17 additions & 6 deletions
@@ -694,14 +694,25 @@ def _compute_vector_norm(self, local_array: NDArray,
             recv_buf = self._allreduce_subcomm(ncp.count_nonzero(local_array, axis=axis).astype(ncp.float64))
         elif ord == ncp.inf:
             # Calculate max followed by max reduction
-            recv_buf = self._allreduce_subcomm(ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64),
-                                               recv_buf, op=MPI.MAX)
-            recv_buf = ncp.squeeze(recv_buf, axis=axis)
+            # TODO (tharitt): currently CuPy + MPI does not work well with buffered communication, particularly
+            # with the MAX and MIN operators; here we copy the array to the CPU, transfer it, and copy the result back to the GPU
+            send_buf = ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64)
+            if self.engine == "cupy" and self.base_comm_nccl is None:
+                recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MAX)
+                recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
+            else:
+                recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MAX)
+                recv_buf = ncp.squeeze(recv_buf, axis=axis)
         elif ord == -ncp.inf:
             # Calculate min followed by min reduction
-            recv_buf = self._allreduce_subcomm(ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64),
-                                               recv_buf, op=MPI.MIN)
-            recv_buf = ncp.squeeze(recv_buf, axis=axis)
+            # TODO (tharitt): see the comment above in the infinity norm
+            send_buf = ncp.min(ncp.abs(local_array), axis=axis).astype(ncp.float64)
+            if self.engine == "cupy" and self.base_comm_nccl is None:
+                recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MIN)
+                recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
+            else:
+                recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MIN)
+                recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
 
         else:
             recv_buf = self._allreduce_subcomm(ncp.sum(ncp.abs(ncp.float_power(local_array, ord)), axis=axis))
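
The change above works around buffered MPI reductions on device memory: for a CuPy array with no NCCL communicator, the partial max/min is copied to the host with .get(), reduced there with MPI.MAX or MPI.MIN, and the result is moved back to the GPU via ncp.asarray. A standalone sketch of the same host roundtrip, assuming mpi4py and CuPy are installed (function and variable names here are illustrative, not the library's API):

    import numpy
    from mpi4py import MPI

    def allreduce_max_host(comm, send_buf):
        # All-reduce with MPI.MAX, detouring through host memory when the
        # input lives on the GPU (CuPy arrays expose .get()).
        on_gpu = hasattr(send_buf, "get")
        host_send = send_buf.get() if on_gpu else send_buf
        host_recv = numpy.empty_like(host_send)
        comm.Allreduce(host_send, host_recv, op=MPI.MAX)
        if on_gpu:
            import cupy
            return cupy.asarray(host_recv)  # copy the reduced result back to the GPU
        return host_recv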

pylops_mpi/utils/benchmark.py

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ def wrapper(*args, **kwargs):
         header_index = len(_markers) - 1
 
         def local_mark(label):
+            _sync()
             _markers.append((label, time.perf_counter(), level))
 
         _mark_func_stack.append(local_mark)
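
The added _sync() call matters for timing on GPUs: CuPy kernels are launched asynchronously, so a bare time.perf_counter() records when work was queued, not when it finished. A self-contained illustration of the difference, assuming CuPy is available (the _sync helper itself belongs to the benchmark subpackage and is not reproduced here; a device synchronization is one thing such a helper would plausibly do):

    import time

    import cupy as cp

    x = cp.random.rand(4000, 4000)
    t0 = time.perf_counter()
    y = x @ x                       # returns immediately: the kernel is only queued
    t1 = time.perf_counter()        # without a sync this underestimates the work
    cp.cuda.Device().synchronize()  # wait for the device to finish
    t2 = time.perf_counter()        # now the matmul has actually completed
    print(f"launch only: {t1 - t0:.6f} s, after sync: {t2 - t0:.6f} s")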

tests/test_blockdiag.py

Lines changed: 22 additions & 8 deletions
@@ -2,9 +2,19 @@
 Designed to run with n processes
 $ mpiexec -n 10 pytest test_blockdiag.py --with-mpi
 """
+import os
+
+if int(os.environ.get("TEST_CUPY_PYLOPS", 0)):
+    import cupy as np
+    from cupy.testing import assert_allclose
+
+    backend = "cupy"
+else:
+    import numpy as np
+    from numpy.testing import assert_allclose
+
+    backend = "numpy"
 from mpi4py import MPI
-import numpy as np
-from numpy.testing import assert_allclose
 import pytest
 
 import pylops
@@ -17,6 +27,10 @@
 par2j = {'ny': 301, 'nx': 101, 'dtype': np.complex128}
 
 np.random.seed(42)
+rank = MPI.COMM_WORLD.Get_rank()
+if backend == "cupy":
+    device_id = rank % np.cuda.runtime.getDeviceCount()
+    np.cuda.Device(device_id).use()
 
 
 @pytest.mark.mpi(min_size=2)
@@ -27,11 +41,11 @@ def test_blockdiag(par):
     Op = pylops.MatrixMult(A=((rank + 1) * np.ones(shape=(par['ny'], par['nx']))).astype(par['dtype']))
     BDiag_MPI = pylops_mpi.MPIBlockDiag(ops=[Op, ])
 
-    x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'])
+    x = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend)
     x[:] = np.ones(shape=par['nx'], dtype=par['dtype'])
     x_global = x.asarray()
 
-    y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'])
+    y = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend)
     y[:] = np.ones(shape=par['ny'], dtype=par['dtype'])
     y_global = y.asarray()
@@ -68,16 +82,16 @@ def test_stacked_blockdiag(par):
     FirstDeriv_MPI = pylops_mpi.MPIFirstDerivative(dims=(par['ny'], par['nx']), dtype=par['dtype'])
     StackedBDiag_MPI = pylops_mpi.MPIStackedBlockDiag(ops=[BDiag_MPI, FirstDeriv_MPI])
 
-    dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'])
+    dist1 = pylops_mpi.DistributedArray(global_shape=size * par['nx'], dtype=par['dtype'], engine=backend)
     dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype'])
-    dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'])
+    dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend)
     dist2[:] = np.ones(dist2.local_shape, dtype=par['dtype'])
     x = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2])
     x_global = x.asarray()
 
-    dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'])
+    dist1 = pylops_mpi.DistributedArray(global_shape=size * par['ny'], dtype=par['dtype'], engine=backend)
     dist1[:] = np.ones(dist1.local_shape, dtype=par['dtype'])
-    dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'])
+    dist2 = pylops_mpi.DistributedArray(global_shape=par['nx'] * par['ny'], dtype=par['dtype'], engine=backend)
     dist2[:] = np.ones(dist2.local_shape, dtype=par['dtype'])
     y = pylops_mpi.StackedDistributedArray(distarrays=[dist1, dist2])
     y_global = y.asarray()
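
At import time the test module pins each MPI rank to a GPU with a round-robin rule (rank modulo the number of visible devices); the modulo keeps device ids valid even when more ranks than GPUs are launched, though the Makefile targets assume at most one process per GPU. A hypothetical standalone check of that mapping, not part of the repository:

    # check_devices.py -- run with: mpiexec -n 4 python check_devices.py
    from mpi4py import MPI

    import cupy as cp

    rank = MPI.COMM_WORLD.Get_rank()
    ngpus = cp.cuda.runtime.getDeviceCount()
    device_id = rank % ngpus  # same round-robin rule as in the tests
    cp.cuda.Device(device_id).use()
    print(f"rank {rank} -> GPU {device_id} of {ngpus}")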
