diff --git a/Makefile b/Makefile
index 6623e5be..7d866764 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,10 @@
 PIP := $(shell command -v pip3 2> /dev/null || command which pip 2> /dev/null)
 PYTHON := $(shell command -v python3 2> /dev/null || command which python 2> /dev/null)
 NUM_PROCESSES = 3
-.PHONY: install dev-install dev-install_nccl install_conda install_conda_nccl dev-install_conda dev-install_conda_nccl tests tests_nccl doc docupdate run_examples run_tutorials
+.PHONY: install dev-install dev-install_nccl install_conda \
+	install_conda_nccl dev-install_conda dev-install_conda_nccl \
+	tests tests_nccl doc doc_cupy doc_nccl docupdate \
+	run_examples run_tutorials run_tutorials_cupy run_tutorials_nccl
 
 pipcheck:
 ifndef PIP
@@ -55,12 +58,19 @@ doc:
 	rm -rf source/tutorials && rm -rf build &&\
 	cd .. && sphinx-build -b html docs/source docs/build
 
+doc_cupy:
+	cp tutorials_cupy/* tutorials/
+	cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
+	rm -rf source/tutorials && rm -rf source/tutorials && rm -rf build &&\
+	cd .. && sphinx-build -b html docs/source docs/build
+	rm tutorials/*_cupy.py
+
 doc_nccl:
-	cp tutorials_nccl/* tutorials/
+	cp tutorials_cupy/* tutorials_nccl/* tutorials/
 	cd docs && rm -rf source/api/generated && rm -rf source/gallery &&\
 	rm -rf source/tutorials && rm -rf source/tutorials && rm -rf build &&\
 	cd .. && sphinx-build -b html docs/source docs/build
-	rm tutorials/*_nccl.py
+	rm tutorials/*_cupy.py tutorials/*_nccl.py
 
 docupdate:
 	cd docs && make html && cd ..
@@ -76,6 +86,10 @@ run_examples:
 run_tutorials:
 	sh mpi_examples.sh tutorials $(NUM_PROCESSES)
 
+# Run tutorials using mpi with cupy arrays
+run_tutorials_cupy:
+	sh mpi_examples.sh tutorials_cupy $(NUM_PROCESSES)
+
 # Run tutorials using nccl
 run_tutorials_nccl:
 	sh mpi_examples.sh tutorials_nccl $(NUM_PROCESSES)
diff --git a/tutorials/lsm.py b/tutorials/lsm.py
index 9db25795..bee504bd 100644
--- a/tutorials/lsm.py
+++ b/tutorials/lsm.py
@@ -152,8 +152,8 @@
 d = d_dist.asarray().reshape((nstot, nr, nt))
 
 ###############################################################################
-
-# Adjoint
+# We now compute the adjoint and model the data using the adjoint
+# reflectivity as input.
 madj_dist = VStack.H @ d_dist
 madj = madj_dist.asarray().reshape((nx, nz))
 d_adj_dist = VStack @ madj_dist
diff --git a/tutorials/mdd.py b/tutorials/mdd.py
index 5946fc05..7913f3f5 100644
--- a/tutorials/mdd.py
+++ b/tutorials/mdd.py
@@ -37,16 +37,16 @@
 # Input parameters
 par = {
     "ox": -300,
-    "dx": 10,
-    "nx": 61,
+    "dx": 5,
+    "nx": 121,
     "oy": -500,
-    "dy": 10,
-    "ny": 101,
+    "dy": 5,
+    "ny": 201,
     "ot": 0,
-    "dt": 0.004,
-    "nt": 400,
+    "dt": 0.002,
+    "nt": 800,
     "f0": 20,
-    "nfmax": 200,
+    "nfmax": 400,
 }
 
 t0_m = 0.2
diff --git a/tutorials_cupy/lsm_cupy.py b/tutorials_cupy/lsm_cupy.py
new file mode 100644
index 00000000..ba1267c7
--- /dev/null
+++ b/tutorials_cupy/lsm_cupy.py
@@ -0,0 +1,200 @@
+r"""
+Least-squares Migration with CUDA-Aware MPI
+===========================================
+This tutorial is an extension of the :ref:`sphx_glr_tutorials_lsm.py`
+tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs
+communicating via MPI.
+""" + +import warnings +warnings.filterwarnings('ignore') + +import numpy as np +import cupy as cp +from matplotlib import pyplot as plt +from mpi4py import MPI + +from pylops.utils.wavelets import ricker +from pylops.waveeqprocessing.lsm import LSM + +import pylops_mpi + +############################################################################### +# The standard MPI communicator is used in this example, so there is no need +# for any initalization. However, we need to assign our GPU resources to the +# different ranks. Here we decide to assign a unique GPU to each process if +# the number of ranks is equal or smaller than that of the GPUs. Otherwise we +# start assigning more than one GPU to the available ranks. Note that this +# approach will work equally well if we have a multi-node multi-GPU setup, where +# each node has one or more GPUs. + +np.random.seed(42) +plt.close("all") +rank = MPI.COMM_WORLD.Get_rank() +size = MPI.COMM_WORLD.Get_size() +device_count = cp.cuda.runtime.getDeviceCount() +cp.cuda.Device(rank % device_count).use(); + +############################################################################### +# Let's start by defining all the parameters required by the +# :py:class:`pylops.waveeqprocessing.LSM` operator. +# Note that this section is exactly the same as the one in the MPI example +# as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) + +# Velocity Model +nx, nz = 81, 60 +dx, dz = 4, 4 +x, z = np.arange(nx) * dx, np.arange(nz) * dz +v0 = 1000 # initial velocity +kv = 0.0 # gradient +vel = np.outer(np.ones(nx), v0 + kv * z) + +# Reflectivity Model +refl = np.zeros((nx, nz), dtype=np.float32) +refl[:, 30] = -1 +refl[:, 50] = 0.5 + +# Receivers +nr = 11 +rx = np.linspace(10 * dx, (nx - 10) * dx, nr) +rz = 20 * np.ones(nr) +recs = np.vstack((rx, rz)) + +# Sources +ns = 10 +# Total number of sources at all ranks +nstot = MPI.COMM_WORLD.allreduce(ns, op=MPI.SUM) +sxtot = np.linspace(dx * 10, (nx - 10) * dx, nstot) +sx = sxtot[rank * ns: (rank + 1) * ns] +sztot = 10 * np.ones(nstot) +sz = 10 * np.ones(ns) +sources = np.vstack((sx, sz)) +sources_tot = np.vstack((sxtot, sztot)) + +if rank == 0: + plt.figure(figsize=(10, 5)) + im = plt.imshow(vel.T, cmap="summer", extent=(x[0], x[-1], z[-1], z[0])) + plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k") + plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", + edgecolors="k") + cb = plt.colorbar(im) + cb.set_label("[m/s]") + plt.axis("tight") + plt.xlabel("x [m]"), plt.ylabel("z [m]") + plt.title("Velocity") + plt.xlim(x[0], x[-1]) + plt.tight_layout() + + plt.figure(figsize=(10, 5)) + im = plt.imshow(refl.T, cmap="gray", extent=(x[0], x[-1], z[-1], z[0])) + plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k") + plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", + edgecolors="k") + plt.colorbar(im) + plt.axis("tight") + plt.xlabel("x [m]"), plt.ylabel("z [m]") + plt.title("Reflectivity") + plt.xlim(x[0], x[-1]) + plt.tight_layout() + +############################################################################### +# We are now ready to create the :py:class:`pylops.waveeqprocessing.LSM` +# operator and initialize the :py:class:`pylops_mpi.DistributedArray` +# reflecitivity object. Compared to the MPI tutorial, we need to make sure that +# we set ``cupy`` as the engine and fill the distributed arrays with CuPy arrays. 
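+###############################################################################
+# Optional sanity check (a sketch, not part of the original tutorial): before
+# building the operator it can be useful to verify that the MPI installation
+# is actually CUDA-aware. A buffer-based collective on small CuPy arrays only
+# succeeds when GPU buffers can be passed directly to MPI; otherwise this
+# typically fails with an MPI error (or a crash).
+
+send_check = cp.ones(8, dtype=np.float32)
+recv_check = cp.empty_like(send_check)
+cp.cuda.get_current_stream().synchronize()  # make sure the buffer is ready
+MPI.COMM_WORLD.Allreduce(send_check, recv_check)
+if rank == 0:
+    print("CUDA-aware Allreduce ok:", bool((recv_check == size).all()))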
+
+# Wavelet
+nt = 651
+dt = 0.004
+t = np.arange(nt) * dt
+wav, wavt, wavc = ricker(t[:41], f0=20)
+
+lsm = LSM(
+    z,
+    x,
+    t,
+    sources,
+    recs,
+    v0,
+    cp.asarray(wav.astype(np.float32)),
+    wavc,
+    mode="analytic",
+    engine="cuda",
+    dtype=np.float32
+)
+lsm.Demop.trav_srcs = cp.asarray(lsm.Demop.trav_srcs.astype(np.float32))
+lsm.Demop.trav_recs = cp.asarray(lsm.Demop.trav_recs.astype(np.float32))
+
+VStack = pylops_mpi.MPIVStack(ops=[lsm.Demop, ])
+refl_dist = pylops_mpi.DistributedArray(global_shape=nx * nz,
+                                        partition=pylops_mpi.Partition.BROADCAST,
+                                        engine="cupy")
+refl_dist[:] = cp.asarray(refl.flatten())
+d_dist = VStack @ refl_dist
+d = d_dist.asarray().reshape((nstot, nr, nt))
+
+###############################################################################
+# We now compute the adjoint and the inverse using the
+# :py:func:`pylops_mpi.optimization.basic.cgls` solver. No code change
+# is required to run with CUDA-aware MPI (this is handled by the MPI
+# operator and DistributedArray). In this particular case, the local
+# computation will be done on the GPUs, whilst collective communication
+# calls will be carried out via GPU-to-GPU MPI.
+
+# Adjoint
+madj_dist = VStack.H @ d_dist
+madj = madj_dist.asarray().reshape((nx, nz))
+d_adj_dist = VStack @ madj_dist
+d_adj = d_adj_dist.asarray().reshape((nstot, nr, nt))
+
+# Inverse
+# Initializing x0 to zeroes
+x0 = pylops_mpi.DistributedArray(VStack.shape[1],
+                                 partition=pylops_mpi.Partition.BROADCAST,
+                                 engine="cupy")
+x0[:] = 0
+minv_dist = pylops_mpi.cgls(VStack, d_dist, x0=x0, niter=100, show=True)[0]
+minv = minv_dist.asarray().reshape((nx, nz))
+d_inv_dist = VStack @ minv_dist
+d_inv = d_inv_dist.asarray().reshape(nstot, nr, nt)
+
+###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
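+###############################################################################
+# As a small aside (a sketch, not part of the original tutorial): either
+# :code:`get()` or :func:`cupy.asnumpy` can be used to move a result to host
+# memory; the latter is convenient in code that should also accept NumPy
+# arrays, as it returns them unchanged.
+
+madj_cpu = cp.asnumpy(madj)   # equivalent to madj.get() for CuPy arrays
+minv_cpu = cp.asnumpy(minv)
+assert isinstance(madj_cpu, np.ndarray) and isinstance(minv_cpu, np.ndarray)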
+ +if rank == 0: + # Visualize + fig1, axs = plt.subplots(1, 3, figsize=(10, 3)) + axs[0].imshow(refl.T, cmap="gray", vmin=-1, vmax=1) + axs[0].axis("tight") + axs[0].set_title(r"$m$") + axs[1].imshow(madj.T.get(), cmap="gray", vmin=-madj.max(), vmax=madj.max()) + axs[1].set_title(r"$m_{adj}$") + axs[1].axis("tight") + axs[2].imshow(minv.T.get(), cmap="gray", vmin=-1, vmax=1) + axs[2].axis("tight") + axs[2].set_title(r"$m_{inv}$") + plt.tight_layout() + + fig2, axs = plt.subplots(1, 3, figsize=(10, 3)) + axs[0].imshow(d[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max()) + axs[0].set_title(r"$d$") + axs[0].axis("tight") + axs[1].imshow(d_adj[0, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max()) + axs[1].set_title(r"$d_{adj}$") + axs[1].axis("tight") + axs[2].imshow(d_inv[0, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max()) + axs[2].set_title(r"$d_{inv}$") + axs[2].axis("tight") + + fig3, axs = plt.subplots(1, 3, figsize=(10, 3)) + axs[0].imshow(d[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max()) + axs[0].set_title(r"$d$") + axs[0].axis("tight") + axs[1].imshow(d_adj[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d_adj.max(), vmax=d_adj.max()) + axs[1].set_title(r"$d_{adj}$") + axs[1].axis("tight") + axs[2].imshow(d_inv[nstot // 2, :, :300].T.get(), cmap="gray", vmin=-d.max(), vmax=d.max()) + axs[2].set_title(r"$d_{inv}$") + axs[2].axis("tight") + plt.tight_layout() diff --git a/tutorials_cupy/mdd_cupy.py b/tutorials_cupy/mdd_cupy.py new file mode 100644 index 00000000..400742c4 --- /dev/null +++ b/tutorials_cupy/mdd_cupy.py @@ -0,0 +1,254 @@ +""" +Multi-Dimensional Deconvolution with CUDA-Aware MPI +=================================================== +This tutorial is an extension of the :ref:`sphx_glr_tutorials_mdd.py` +tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating +via MPI. +""" + +import numpy as np +import cupy as cp +from matplotlib import pyplot as plt +from mpi4py import MPI + +from pylops.utils.seismicevents import hyperbolic2d, makeaxis +from pylops.utils.tapers import taper3d +from pylops.utils.wavelets import ricker + +import pylops_mpi +from pylops_mpi.DistributedArray import local_split, Partition + +############################################################################### +# The standard MPI communicator is used in this example, so there is no need +# for any initalization. However, we need to assign our GPU resources to the +# different ranks. Here we decide to assign a unique GPU to each process if +# the number of ranks is equal or smaller than that of the GPUs. Otherwise we +# start assigning more than one GPU to the available ranks. Note that this +# approach will work equally well if we have a multi-node multi-GPU setup, where +# each node has one or more GPUs. + +plt.close("all") +rank = MPI.COMM_WORLD.Get_rank() +size = MPI.COMM_WORLD.Get_size() +dtype = np.float32 +cdtype = np.complex64 +device_count = cp.cuda.runtime.getDeviceCount() +cp.cuda.Device(rank % device_count).use(); + +############################################################################### +# Let's start by defining all the parameters required by the +# :py:func:`pylops.waveeqprocessing.MPIMDC` operator. +# Note that this section is exactly the same as the one in the MPI example as +# we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) 
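+###############################################################################
+# Optional sanity check (a sketch, not part of the original tutorial): each
+# rank can report which GPU it has been assigned; the mapping simply follows
+# the ``rank % device_count`` rule used above.
+
+assigned = MPI.COMM_WORLD.gather((rank, cp.cuda.Device().id), root=0)
+if rank == 0:
+    for r, dev in assigned:
+        print(f"rank {r} -> GPU {dev}")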
+ +# Input parameters +par = { + "ox": -300, + "dx": 5, + "nx": 121, + "oy": -500, + "dy": 5, + "ny": 201, + "ot": 0, + "dt": 0.002, + "nt": 800, + "f0": 20, + "nfmax": 400, +} + +t0_m = 0.2 +vrms_m = 1100.0 +amp_m = 1.0 + +t0_G = (0.2, 0.5, 0.7) +vrms_G = (1200.0, 1500.0, 2000.0) +amp_G = (1.0, 0.6, 0.5) + +# Taper +tap = taper3d(par["nt"], (par["ny"], par["nx"]), (5, 5), tapertype="hanning") + +# Create axis +t, t2, x, y = makeaxis(par) + +# Create wavelet +wav = ricker(t[:41], f0=par["f0"])[0] + +# Generate model +mrefl, mwav = hyperbolic2d(x, t, t0_m, vrms_m, amp_m, wav) + +# Generate operator +G, Gwav = np.zeros((par["ny"], par["nx"], par["nt"])), np.zeros( + (par["ny"], par["nx"], par["nt"]) +) +for iy, y0 in enumerate(y): + G[iy], Gwav[iy] = hyperbolic2d(x - y0, t, t0_G, vrms_G, amp_G, wav) +G, Gwav = G * tap, Gwav * tap + +# Add negative part to data and model +mrefl = np.concatenate((np.zeros((par["nx"], par["nt"] - 1)), mrefl), axis=-1) +mwav = np.concatenate((np.zeros((par["nx"], par["nt"] - 1)), mwav), axis=-1) +Gwav2 = np.concatenate((np.zeros((par["ny"], par["nx"], par["nt"] - 1)), Gwav), axis=-1) + +# Move to frequency +Gwav_fft = np.fft.rfft(Gwav2, 2 * par["nt"] - 1, axis=-1) +Gwav_fft = (Gwav_fft[..., : par["nfmax"]]) + +# Move frequency/time to first axis +mrefl, mwav = mrefl.T, mwav.T +Gwav_fft = Gwav_fft.transpose(2, 0, 1) + +# Choose how to split frequencies to ranks +nf = par["nfmax"] +nf_rank = local_split((nf,), MPI.COMM_WORLD, Partition.SCATTER, 0) +nf_ranks = np.concatenate(MPI.COMM_WORLD.allgather(nf_rank)) +ifin_rank = np.insert(np.cumsum(nf_ranks)[:-1], 0, 0)[rank] +ifend_rank = np.cumsum(nf_ranks)[rank] + +# Extract batch of frequency slices (in practice, this will be directly +# read from input file) +G = Gwav_fft[ifin_rank:ifend_rank].astype(cdtype) + +############################################################################### +# For MPIMDCOperator, there is no change needed to have it run with +# MPI. This PyLops operator has GPU-support +# (https://pylops.readthedocs.io/en/stable/gpu.html) so it can operate on a +# distributed arrays with engine set to CuPy. 
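+###############################################################################
+# Optional check (a sketch, not part of the original tutorial, using only
+# quantities defined above): before wrapping the kernel into the MPI operator,
+# verify that the per-rank frequency slices tile the full frequency axis.
+
+nf_local = ifend_rank - ifin_rank
+assert G.shape[0] == nf_local
+assert int(np.sum(nf_ranks)) == par["nfmax"]
+if rank == 0:
+    print("frequencies per rank:", nf_ranks)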
+
+# Move operator kernel to GPU
+G = cp.asarray(G)
+
+# Define operator
+MDCop = pylops_mpi.waveeqprocessing.MPIMDC((1.0 * par["dt"] * np.sqrt(par["nt"])) * G,
+                                           nt=2 * par["nt"] - 1, nv=1, nfreq=nf,
+                                           dt=par["dt"], dr=1.0, twosided=True,
+                                           fftengine="numpy", prescaled=True)
+
+# Create model
+m = pylops_mpi.DistributedArray(global_shape=(2 * par["nt"] - 1) * par["nx"] * 1,
+                                partition=Partition.BROADCAST, engine="cupy",
+                                dtype=dtype)
+m[:] = cp.asarray(mrefl.astype(dtype).ravel())
+
+# Create data
+d = MDCop @ m
+dloc = d.asarray().real.reshape(2 * par["nt"] - 1, par["ny"])
+
+###############################################################################
+# Let's display what we have so far: operator, input model, and data
+
+if rank == 0:
+    fig, axs = plt.subplots(1, 2, figsize=(8, 6))
+    axs[0].imshow(
+        Gwav2[int(par["ny"] / 2)].T,
+        aspect="auto",
+        interpolation="nearest",
+        cmap="gray",
+        vmin=-np.abs(Gwav2.max()),
+        vmax=np.abs(Gwav2.max()),
+        extent=(x.min(), x.max(), t2.max(), t2.min()),
+    )
+    axs[0].set_title("G - inline view", fontsize=15)
+    axs[0].set_xlabel(r"$x_R$")
+    axs[0].set_ylabel(r"$t$")
+    axs[1].imshow(
+        Gwav2[:, int(par["nx"] / 2)].T,
+        aspect="auto",
+        interpolation="nearest",
+        cmap="gray",
+        vmin=-np.abs(Gwav2.max()),
+        vmax=np.abs(Gwav2.max()),
+        extent=(y.min(), y.max(), t2.max(), t2.min()),
+    )
+    axs[1].set_title("G - crossline view", fontsize=15)
+    axs[1].set_xlabel(r"$x_S$")
+    axs[1].set_ylabel(r"$t$")
+    fig.tight_layout()
+
+    fig, axs = plt.subplots(1, 2, figsize=(8, 6))
+    axs[0].imshow(
+        mwav,
+        aspect="auto",
+        interpolation="nearest",
+        cmap="gray",
+        vmin=-np.abs(mwav.max()),
+        vmax=np.abs(mwav.max()),
+        extent=(x.min(), x.max(), t2.max(), t2.min()),
+    )
+    axs[0].set_title(r"$m$", fontsize=15)
+    axs[0].set_xlabel(r"$x_R$")
+    axs[0].set_ylabel(r"$t$")
+    axs[1].imshow(
+        dloc.get(),
+        aspect="auto",
+        interpolation="nearest",
+        cmap="gray",
+        vmin=-np.abs(dloc.max()),
+        vmax=np.abs(dloc.max()),
+        extent=(x.min(), x.max(), t2.max(), t2.min()),
+    )
+    axs[1].set_title(r"$d$", fontsize=15)
+    axs[1].set_xlabel(r"$x_S$")
+    axs[1].set_ylabel(r"$t$")
+    fig.tight_layout()
+
+###############################################################################
+# We are now ready to compute the adjoint (i.e., cross-correlation) and invert
+# back for our input model. This computation will be done on the GPUs. The
+# call to :code:`asarray()` triggers CUDA-aware MPI communication (gathering
+# the result from each GPU). Note that the arrays :code:`madjloc` and
+# :code:`minvloc` still live in GPU memory.
+
+
+# Adjoint
+madj = MDCop.H @ d
+madjloc = madj.asarray().real.reshape(2 * par["nt"] - 1, par["nx"])
+
+# Inverse
+m0 = pylops_mpi.DistributedArray(global_shape=(2 * par["nt"] - 1) * par["nx"] * 1,
+                                 partition=Partition.BROADCAST, engine="cupy",
+                                 dtype=cdtype)
+m0[:] = 0
+minv = pylops_mpi.cgls(MDCop, d, x0=m0, niter=50, show=True if rank == 0 else False)[0]
+minvloc = minv.asarray().real.reshape(2 * par["nt"] - 1, par["nx"])
+
+###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
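+###############################################################################
+# Optional aside before plotting (a sketch, not part of the original
+# tutorial): since :code:`madjloc` and :code:`minvloc` still live on the GPU,
+# one can inspect how much device memory CuPy's pool is holding and, once the
+# results have been copied to the host, release unused blocks.
+
+pool = cp.get_default_memory_pool()
+if rank == 0:
+    print(f"GPU memory held by CuPy's pool: {pool.used_bytes() / 1e6:.1f} MB")
+# pool.free_all_blocks()  # uncomment to return unused memory to the device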
+ +if rank == 0: + fig = plt.figure(figsize=(8, 6)) + ax1 = plt.subplot2grid((1, 5), (0, 0), colspan=2) + ax2 = plt.subplot2grid((1, 5), (0, 2), colspan=2) + ax3 = plt.subplot2grid((1, 5), (0, 4)) + ax1.imshow( + madjloc.get(), + aspect="auto", + interpolation="nearest", + cmap="gray", + vmin=-np.abs(madjloc.max()), + vmax=np.abs(madjloc.max()), + extent=(x.min(), x.max(), t2.max(), t2.min()), + ) + ax1.set_title("Adjoint m", fontsize=15) + ax1.set_xlabel(r"$x_V$") + ax1.set_ylabel(r"$t$") + ax2.imshow( + minvloc.get(), + aspect="auto", + interpolation="nearest", + cmap="gray", + vmin=-np.abs(minvloc.max()), + vmax=np.abs(minvloc.max()), + extent=(x.min(), x.max(), t2.max(), t2.min()), + ) + ax2.set_title("Inverted m", fontsize=15) + ax2.set_xlabel(r"$x_V$") + ax2.set_ylabel(r"$t$") + ax3.plot( + madjloc[:, int(par["nx"] / 2)].get() / np.abs(madjloc[:, int(par["nx"] / 2)].get()).max(), t2, "r", lw=5 + ) + ax3.plot( + minvloc[:, int(par["nx"] / 2)].get() / np.abs(minvloc[:, int(par["nx"] / 2)].get()).max(), t2, "k", lw=3 + ) + ax3.set_ylim([t2[-1], t2[0]]) + fig.tight_layout() diff --git a/tutorials_cupy/poststack_cupy.py b/tutorials_cupy/poststack_cupy.py new file mode 100644 index 00000000..3b35cb76 --- /dev/null +++ b/tutorials_cupy/poststack_cupy.py @@ -0,0 +1,222 @@ +r""" +Post Stack Inversion - 3D with CUDA-Aware MPI +============================================= +This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py` +tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating +via MPI. +""" + +import numpy as np +import cupy as cp +from scipy.signal import filtfilt +from matplotlib import pyplot as plt +from mpi4py import MPI + +from pylops.utils.wavelets import ricker +from pylops.basicoperators import Transpose +from pylops.avo.poststack import PoststackLinearModelling + +import pylops_mpi + +############################################################################### +# The standard MPI communicator is used in this example, so there is no need +# for any initalization. However, we need to assign our GPU resources to the +# different ranks. Here we decide to assign a unique GPU to each process if +# the number of ranks is equal or smaller than that of the GPUs. Otherwise we +# start assigning more than one GPU to the available ranks. Note that this +# approach will work equally well if we have a multi-node multi-GPU setup, where +# each node has one or more GPUs. + +plt.close("all") +rank = MPI.COMM_WORLD.Get_rank() +device_count = cp.cuda.runtime.getDeviceCount() +cp.cuda.Device(rank % device_count).use(); + +############################################################################### +# Let's start by defining all the parameters required by the +# :py:func:`pylops.avo.poststack.PoststackLinearModelling` operator. +# Note that this section is exactly the same as the one in the MPI example as +# we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) 
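+###############################################################################
+# A short aside (a sketch, not part of the original tutorial): metadata such
+# as sizes and shapes are plain Python objects exchanged with the pickle-based
+# lowercase mpi4py API on the host, whereas the large model and data arrays
+# handled by PyLops-MPI stay on the GPU and only move through buffer-based
+# collectives.
+
+ny_example = 20                                   # per-rank size (metadata)
+ny_total = MPI.COMM_WORLD.allreduce(ny_example)   # pickled, host-side exchange
+payload_example = cp.zeros(ny_example, dtype=np.float32)  # device-side payload
+if rank == 0:
+    print("total size gathered from metadata:", ny_total)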
+
+# Model
+model = np.load("../testdata/avo/poststack_model.npz")
+x, z, m = model['x'][::3], model['z'], np.log(model['model'])[:, ::3]
+
+# Making m a 3D model
+ny_i = 20  # size of model in y direction for rank i
+y = np.arange(ny_i)
+m3d_i = np.tile(m[:, :, np.newaxis], (1, 1, ny_i)).transpose((2, 1, 0))
+ny_i, nx, nz = m3d_i.shape
+
+# Size of y at all ranks
+ny = MPI.COMM_WORLD.allreduce(ny_i)
+
+# Smooth model
+nsmoothy, nsmoothx, nsmoothz = 5, 30, 20
+mback3d_i = filtfilt(np.ones(nsmoothy) / float(nsmoothy), 1, m3d_i, axis=0)
+mback3d_i = filtfilt(np.ones(nsmoothx) / float(nsmoothx), 1, mback3d_i, axis=1)
+mback3d_i = filtfilt(np.ones(nsmoothz) / float(nsmoothz), 1, mback3d_i, axis=2)
+
+# Wavelet
+dt = 0.004
+t0 = np.arange(nz) * dt
+ntwav = 41
+wav = ricker(t0[:ntwav // 2 + 1], 15)[0]
+
+# Collecting all the m3d and mback3d at all ranks
+m3d = np.concatenate(MPI.COMM_WORLD.allgather(m3d_i))
+mback3d = np.concatenate(MPI.COMM_WORLD.allgather(mback3d_i))
+
+###############################################################################
+# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray`
+# objects. Compared to the MPI tutorial, we need to make sure that we set ``cupy``
+# as the engine and fill the distributed arrays with CuPy arrays.
+
+m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                       engine="cupy")
+m3d_dist[:] = cp.asarray(m3d_i.flatten())
+
+# Do the same thing for smooth model
+mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                           engine="cupy")
+mback3d_dist[:] = cp.asarray(mback3d_i.flatten())
+
+###############################################################################
+# For PostStackLinearModelling, there is no change needed to have it run with
+# CUDA-aware MPI. This PyLops operator has GPU support
+# (https://pylops.readthedocs.io/en/stable/gpu.html), so it can operate on
+# distributed arrays whose engine is set to CuPy.
+
+PPop = PoststackLinearModelling(cp.asarray(wav.astype(np.float32)), nt0=nz,
+                                spatdims=(ny_i, nx))
+Top = Transpose((ny_i, nx, nz), (2, 0, 1))
+BDiag = pylops_mpi.basicoperators.MPIBlockDiag(ops=[Top.H @ PPop @ Top, ])
+
+###############################################################################
+# This computation will be done on the GPU(s). The call :code:`asarray()`
+# triggers the MPI communication (gather results from each GPU).
+# Note that the arrays :code:`d` and :code:`d_0` still live in GPU memory.
+
+d_dist = BDiag @ m3d_dist
+d_local = d_dist.local_array.reshape((ny_i, nx, nz))
+d = d_dist.asarray().reshape((ny, nx, nz))
+d_0_dist = BDiag @ mback3d_dist
+d_0 = d_0_dist.asarray().reshape((ny, nx, nz))
+
+###############################################################################
+# Inversion using CGLS solver - no code change is required to run the solver
+# with CUDA-aware MPI (this is handled by the MPI operator and DistributedArray).
+# In this particular case, the local computation will be done on the GPU.
+# Collective communication calls will be carried through MPI GPU-to-GPU.
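+###############################################################################
+# Optional aside (a sketch, not part of the original tutorial): GPU kernel
+# launches are asynchronous, so when timing the runs below the device should
+# be synchronized before reading the clock.
+
+import time
+
+cp.cuda.Device().synchronize()
+t_start = time.perf_counter()
+_ = BDiag @ m3d_dist
+cp.cuda.Device().synchronize()
+if rank == 0:
+    print(f"one forward modelling pass: {time.perf_counter() - t_start:.3f} s")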
+
+# Inversion using CGLS solver
+minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist,
+                                                      x0=mback3d_dist,
+                                                      niter=100, show=True)[0]
+minv3d_iter = minv3d_iter_dist.asarray().reshape((ny, nx, nz))
+
+###############################################################################
+
+# Regularized inversion with normal equations
+epsR = 1e2
+LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2),
+                                weights=(1, 1, 1),
+                                sampling=(1, 1, 1),
+                                dtype=BDiag.dtype)
+NormEqOp = BDiag.H @ BDiag + epsR * LapOp.H @ LapOp
+dnorm_dist = BDiag.H @ d_dist
+minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist,
+                                                  x0=mback3d_dist,
+                                                  niter=100, show=True)[0]
+minv3d_ne = minv3d_ne_dist.asarray().reshape((ny, nx, nz))
+
+###############################################################################
+
+# Regularized inversion with regularized equations
+StackOp = pylops_mpi.MPIStackedVStack([BDiag, np.sqrt(epsR) * LapOp])
+d0_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy")
+d0_dist[:] = 0.
+dstack_dist = pylops_mpi.StackedDistributedArray([d_dist, d0_dist])
+
+dnorm_dist = BDiag.H @ d_dist
+minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist,
+                                                     x0=mback3d_dist,
+                                                     niter=100, show=True)[0]
+minv3d_reg = minv3d_reg_dist.asarray().reshape((ny, nx, nz))
+
+###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
+
+if rank == 0:
+    # Check the distributed implementation gives the same result
+    # as the one running only on rank0
+    PPop0 = PoststackLinearModelling(wav, nt0=nz, spatdims=(ny, nx))
+    d0 = (PPop0 @ m3d.transpose(2, 0, 1)).transpose(1, 2, 0)
+    d0_0 = (PPop0 @ mback3d.transpose(2, 0, 1)).transpose(1, 2, 0)
+
+    # Check that the distributed and local implementations give the same modelling results
+    print('Distr == Local', np.allclose(cp.asnumpy(d), d0, atol=1e-6))
+    print('Smooth Distr == Local', np.allclose(cp.asnumpy(d_0), d0_0, atol=1e-6))
+
+    # Visualize
+    fig, axs = plt.subplots(nrows=6, ncols=3, figsize=(9, 14), constrained_layout=True)
+    axs[0][0].imshow(m3d[5, :, :].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[0][0].set_title("Model x-z")
+    axs[0][0].axis("tight")
+    axs[0][1].imshow(m3d[:, 200, :].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[0][1].set_title("Model y-z")
+    axs[0][1].axis("tight")
+    axs[0][2].imshow(m3d[:, :, 220].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[0][2].set_title("Model x-y")
+    axs[0][2].axis("tight")
+
+    axs[1][0].imshow(mback3d[5, :, :].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[1][0].set_title("Smooth Model x-z")
+    axs[1][0].axis("tight")
+    axs[1][1].imshow(mback3d[:, 200, :].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[1][1].set_title("Smooth Model y-z")
+    axs[1][1].axis("tight")
+    axs[1][2].imshow(mback3d[:, :, 220].T, cmap="gist_rainbow", vmin=m.min(), vmax=m.max())
+    axs[1][2].set_title("Smooth Model x-y")
+    axs[1][2].axis("tight")
+
+    axs[2][0].imshow(d[5, :, :].T.get(), cmap="gray", vmin=-1, vmax=1)
+    axs[2][0].set_title("Data x-z")
+    axs[2][0].axis("tight")
+    axs[2][1].imshow(d[:, 200, :].T.get(), cmap='gray', vmin=-1, vmax=1)
+    axs[2][1].set_title('Data y-z')
+    axs[2][1].axis('tight')
+    axs[2][2].imshow(d[:, :, 220].T.get(), cmap='gray', vmin=-1, vmax=1)
+    axs[2][2].set_title('Data x-y')
+    axs[2][2].axis('tight')
+
+    axs[3][0].imshow(minv3d_iter[5, :, 
:].T.get(), cmap="gist_rainbow", vmin=m.min(), vmax=m.max()) + axs[3][0].set_title("Inverted Model iter x-z") + axs[3][0].axis("tight") + axs[3][1].imshow(minv3d_iter[:, 200, :].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[3][1].set_title('Inverted Model iter y-z') + axs[3][1].axis('tight') + axs[3][2].imshow(minv3d_iter[:, :, 220].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[3][2].set_title('Inverted Model iter x-y') + axs[3][2].axis('tight') + + axs[4][0].imshow(minv3d_ne[5, :, :].T.get(), cmap="gist_rainbow", vmin=m.min(), vmax=m.max()) + axs[4][0].set_title("Normal Equations Inverted Model iter x-z") + axs[4][0].axis("tight") + axs[4][1].imshow(minv3d_ne[:, 200, :].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[4][1].set_title('Normal Equations Inverted Model iter y-z') + axs[4][1].axis('tight') + axs[4][2].imshow(minv3d_ne[:, :, 220].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[4][2].set_title('Normal Equations Inverted Model iter x-y') + axs[4][2].axis('tight') + + axs[5][0].imshow(minv3d_reg[5, :, :].T.get(), cmap="gist_rainbow", vmin=m.min(), vmax=m.max()) + axs[5][0].set_title("Regularized Inverted Model iter x-z") + axs[5][0].axis("tight") + axs[5][1].imshow(minv3d_reg[:, 200, :].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[5][1].set_title('Regularized Inverted Model iter y-z') + axs[5][1].axis('tight') + axs[5][2].imshow(minv3d_reg[:, :, 220].T.get(), cmap='gist_rainbow', vmin=m.min(), vmax=m.max()) + axs[5][2].set_title('Regularized Inverted Model iter x-y') + axs[5][2].axis('tight') diff --git a/tutorials_nccl/poststack_nccl.py b/tutorials_nccl/poststack_nccl.py index e6d94374..e81d9323 100644 --- a/tutorials_nccl/poststack_nccl.py +++ b/tutorials_nccl/poststack_nccl.py @@ -1,7 +1,9 @@ r""" Post Stack Inversion - 3D with NCCL -============================================ -This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py` tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via NCCL. +=================================== +This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py` +tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating +via NCCL. """ import numpy as np @@ -28,7 +30,8 @@ ############################################################################### # Let's start by defining all the parameters required by the # :py:func:`pylops.avo.poststack.PoststackLinearModelling` operator. -# Note that this section is exactly the same as the one in the MPI example as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) +# Note that this section is exactly the same as the one in the MPI example as +# we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.) # Model model = np.load("../testdata/avo/poststack_model.npz") @@ -60,28 +63,37 @@ mback3d = np.concatenate(MPI.COMM_WORLD.allgather(mback3d_i)) ############################################################################### -# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` objects. -# Compared to the MPI tutorial, we need to make sure that we pass :code:`base_comm_nccl = nccl_comm` and set CuPy as the engine. - -m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, base_comm_nccl=nccl_comm, engine="cupy") +# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` +# objects. 
Compared to the MPI tutorial, we need to make sure that we pass
+# :code:`base_comm_nccl = nccl_comm`, set ``cupy`` as the engine, and fill the
+# distributed arrays with CuPy arrays.
+
+m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                       base_comm_nccl=nccl_comm,
+                                       engine="cupy")
 m3d_dist[:] = cp.asarray(m3d_i.flatten())
 
 # Do the same thing for smooth model
-mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, base_comm_nccl=nccl_comm, engine="cupy")
+mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                           base_comm_nccl=nccl_comm,
+                                           engine="cupy")
 mback3d_dist[:] = cp.asarray(mback3d_i.flatten())
 
 ###############################################################################
-# For PostStackLinearModelling, there is no change needed to have it run with NCCL.
-# This PyLops operator has GPU-support (https://pylops.readthedocs.io/en/stable/gpu.html)
-# so it can run with DistributedArray whose engine is Cupy
+# For PostStackLinearModelling, there is no change needed to have it run
+# with NCCL. This PyLops operator has GPU support
+# (https://pylops.readthedocs.io/en/stable/gpu.html), so it can operate on
+# distributed arrays whose engine is set to CuPy.
 
-PPop = PoststackLinearModelling(wav=cp.asarray(wav), nt0=nz, spatdims=(ny_i, nx))
+PPop = PoststackLinearModelling(wav=cp.asarray(wav), nt0=nz,
+                                spatdims=(ny_i, nx))
 Top = Transpose((ny_i, nx, nz), (2, 0, 1))
 BDiag = pylops_mpi.basicoperators.MPIBlockDiag(ops=[Top.H @ PPop @ Top, ])
 
 ###############################################################################
-# This computation will be done in GPU. The call :code:`asarray()` triggers the NCCL communication (gather result from each GPU).
-# But array :code:`d` and :code:`d_0` still live in GPU memory
+# This computation will be done on the GPU(s). The call :code:`asarray()`
+# triggers the NCCL communication (gather results from each GPU).
+# Note that the arrays :code:`d` and :code:`d_0` still live in GPU memory.
 
 d_dist = BDiag @ m3d_dist
 d_local = d_dist.local_array.reshape((ny_i, nx, nz))
@@ -90,39 +102,50 @@
 d_0 = d_dist.asarray().reshape((ny, nx, nz))
 
 ###############################################################################
-# Inversion using CGLS solver - There is no code change to have run on NCCL (it handles though MPI operator and DistributedArray)
-# In this particular case, the local computation will be done in GPU. Collective communication calls
-# will be carried through NCCL GPU-to-GPU.
+# Inversion using CGLS solver - no code change is required to run the solver
+# with NCCL (this is handled by the MPI operator and DistributedArray).
+# In this particular case, the local computation will be done on the GPU.
+# Collective communication calls will be carried through NCCL GPU-to-GPU.
 
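+###############################################################################
+# As a side-by-side reminder (a sketch, not part of the original change): the
+# only user-facing difference from the CUDA-aware MPI version of this tutorial
+# is the extra ``base_comm_nccl`` argument of the DistributedArray; operators
+# and solvers are used in exactly the same way.
+
+# CUDA-aware MPI flavour (as in tutorials_cupy/poststack_cupy.py)
+x_mpi = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy")
+# NCCL flavour (this tutorial); ``nccl_comm`` is the communicator created
+# earlier in this script
+x_nccl = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                     base_comm_nccl=nccl_comm,
+                                     engine="cupy")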
# Inversion using CGLS solver -minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, x0=mback3d_dist, niter=100, show=True)[0] +minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, + x0=mback3d_dist, + niter=100, show=True)[0] minv3d_iter = minv3d_iter_dist.asarray().reshape((ny, nx, nz)) ############################################################################### # Regularized inversion with normal equations epsR = 1e2 -LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2), weights=(1, 1, 1), - sampling=(1, 1, 1), dtype=BDiag.dtype) +LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2), + weights=(1, 1, 1), + sampling=(1, 1, 1), + dtype=BDiag.dtype) NormEqOp = BDiag.H @ BDiag + epsR * LapOp.H @ LapOp dnorm_dist = BDiag.H @ d_dist -minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, x0=mback3d_dist, niter=100, show=True)[0] +minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, + x0=mback3d_dist, + niter=100, show=True)[0] minv3d_ne = minv3d_ne_dist.asarray().reshape((ny, nx, nz)) ############################################################################### # Regularized inversion with regularized equations StackOp = pylops_mpi.MPIStackedVStack([BDiag, np.sqrt(epsR) * LapOp]) -d0_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, base_comm_nccl=nccl_comm, engine="cupy") +d0_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, + base_comm_nccl=nccl_comm, engine="cupy") d0_dist[:] = 0. dstack_dist = pylops_mpi.StackedDistributedArray([d_dist, d0_dist]) dnorm_dist = BDiag.H @ d_dist -minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, x0=mback3d_dist, niter=100, show=True)[0] +minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, + x0=mback3d_dist, + niter=100, show=True)[0] minv3d_reg = minv3d_reg_dist.asarray().reshape((ny, nx, nz)) ############################################################################### -# To plot the inversion results, the array must be copied back to cpu via :code:`get()` +# Finally we visualize the results. Note that the array must be copied back +# to the CPU by calling the :code:`get()` method on the CuPy arrays. if rank == 0: # Check the distributed implementation gives the same result