
Commit 6f68c0b

minor: finalized cupy tutorials
1 parent 8c680b5 commit 6f68c0b

File tree

4 files changed: +140 -64 lines changed


tutorials_cupy/lsm_cupy.py

Lines changed: 23 additions & 9 deletions
@@ -2,8 +2,8 @@
 Least-squares Migration with CUDA-Aware MPI
 ===========================================
 This tutorial is an extension of the :ref:`sphx_glr_tutorials_lsm.py`
-tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via
-CUDA-Aware MPI.
+tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs
+communicating via MPI.
 """

 import warnings
@@ -19,16 +19,25 @@

 import pylops_mpi

+###############################################################################
+# The standard MPI communicator is used in this example, so there is no need
+# for any initialization. However, we need to assign our GPU resources to the
+# different ranks. Here we decide to assign a unique GPU to each process if
+# the number of ranks is equal or smaller than that of the GPUs. Otherwise we
+# start assigning more than one GPU to the available ranks. Note that this
+# approach will work equally well if we have a multi-node multi-GPU setup, where
+# each node has one or more GPUs.
+
 np.random.seed(42)
 plt.close("all")
 rank = MPI.COMM_WORLD.Get_rank()
 size = MPI.COMM_WORLD.Get_size()
 device_count = cp.cuda.runtime.getDeviceCount()
-cp.cuda.Device(rank % device_count).use()
+cp.cuda.Device(rank % device_count).use();

 ###############################################################################
-# Let's start with a simple model with two interfaces, where sources are
-# distributed across different ranks.
+# Let's start by defining all the parameters required by the
+# :py:class:`pylops.waveeqprocessing.LSM` operator.
 # Note that this section is exactly the same as the one in the MPI example
 # as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.)

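All three tutorials touched by this commit rely on the same round-robin device selection introduced in the hunk above. A minimal standalone sketch of the pattern, assuming mpi4py and CuPy are installed and at least one GPU is visible:

    from mpi4py import MPI
    import cupy as cp

    rank = MPI.COMM_WORLD.Get_rank()
    device_count = cp.cuda.runtime.getDeviceCount()
    # Round-robin assignment: rank r uses GPU (r % device_count), so GPUs are
    # shared whenever there are more ranks than devices.
    cp.cuda.Device(rank % device_count).use()
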
@@ -66,7 +75,8 @@
 plt.figure(figsize=(10, 5))
 im = plt.imshow(vel.T, cmap="summer", extent=(x[0], x[-1], z[-1], z[0]))
 plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
-plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", edgecolors="k")
+plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r",
+            edgecolors="k")
 cb = plt.colorbar(im)
 cb.set_label("[m/s]")
 plt.axis("tight")
@@ -78,7 +88,8 @@
 plt.figure(figsize=(10, 5))
 im = plt.imshow(refl.T, cmap="gray", extent=(x[0], x[-1], z[-1], z[0]))
 plt.scatter(recs[0], recs[1], marker="v", s=150, c="b", edgecolors="k")
-plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r", edgecolors="k")
+plt.scatter(sources_tot[0], sources_tot[1], marker="*", s=150, c="r",
+            edgecolors="k")
 plt.colorbar(im)
 plt.axis("tight")
 plt.xlabel("x [m]"), plt.ylabel("z [m]")
@@ -90,7 +101,7 @@
 # We are now ready to create the :py:class:`pylops.waveeqprocessing.LSM`
 # operator and initialize the :py:class:`pylops_mpi.DistributedArray`
 # reflecitivity object. Compared to the MPI tutorial, we need to make sure that
-# we set CuPy as the engine and use CuPy arrays
+# we set ``cupy`` as the engine and fill the distributed arrays with CuPy arrays.

 # Wavelet
 nt = 651
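The comment above refers to the pattern, used throughout these tutorials, of creating a pylops_mpi.DistributedArray with the ``cupy`` engine and filling its local portion with a CuPy array. A minimal sketch of that pattern (the global size and fill values are illustrative, not taken from the tutorial):

    import cupy as cp
    import pylops_mpi

    n = 100
    x_dist = pylops_mpi.DistributedArray(global_shape=n, engine="cupy")
    # each rank fills the portion it owns with a CuPy array of matching shape
    x_dist[:] = cp.ones(x_dist.local_array.shape)
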
@@ -126,7 +137,7 @@
 # We calculate now the adjoint and the inverse using the
 # :py:func:`pylops_mpi.optimization.basic.cgls` solver. No code change
 # is required to run on CUDA-aware
-# MPI (this is handled through MPI operator and DistributedArray)
+# MPI (this is handled by the MPI operator and DistributedArray)
 # In this particular case, the local computation will be done in GPU.
 # Collective communication calls will be carried through MPI GPU-to-GPU.

@@ -148,6 +159,9 @@
 d_inv = d_inv_dist.asarray().reshape(nstot, nr, nt)

 ###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
+
 if rank == 0:
     # Visualize
     fig1, axs = plt.subplots(1, 3, figsize=(10, 3))
tutorials_cupy/mdd_cupy.py

Lines changed: 29 additions & 10 deletions
@@ -2,8 +2,8 @@
 Multi-Dimensional Deconvolution with CUDA-Aware MPI
 ===================================================
 This tutorial is an extension of the :ref:`sphx_glr_tutorials_mdd.py`
-tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via
-CUDA-Aware MPI.
+tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs communicating
+via MPI.
 """

 import numpy as np
@@ -18,17 +18,28 @@
 import pylops_mpi
 from pylops_mpi.DistributedArray import local_split, Partition

+###############################################################################
+# The standard MPI communicator is used in this example, so there is no need
+# for any initialization. However, we need to assign our GPU resources to the
+# different ranks. Here we decide to assign a unique GPU to each process if
+# the number of ranks is equal or smaller than that of the GPUs. Otherwise we
+# start assigning more than one GPU to the available ranks. Note that this
+# approach will work equally well if we have a multi-node multi-GPU setup, where
+# each node has one or more GPUs.
+
 plt.close("all")
 rank = MPI.COMM_WORLD.Get_rank()
 size = MPI.COMM_WORLD.Get_size()
 dtype = np.float32
 cdtype = np.complex64
 device_count = cp.cuda.runtime.getDeviceCount()
-cp.cuda.Device(rank % device_count).use()
+cp.cuda.Device(rank % device_count).use();

 ###############################################################################
-# Let's start by creating a set of hyperbolic events to be used as
-# our MDC kernel as well as the model
+# Let's start by defining all the parameters required by the
+# :py:func:`pylops.waveeqprocessing.MPIMDC` operator.
+# Note that this section is exactly the same as the one in the MPI example as
+# we will keep using MPI for transferring metadata (i.e., shapes, dims, etc.)

 # Input parameters
 par = {
@@ -93,13 +104,15 @@
 ifin_rank = np.insert(np.cumsum(nf_ranks)[:-1], 0, 0)[rank]
 ifend_rank = np.cumsum(nf_ranks)[rank]

-# Extract batch of frequency slices (in practice, this will be directly read from input file)
+# Extract batch of frequency slices (in practice, this will be directly
+# read from input file)
 G = Gwav_fft[ifin_rank:ifend_rank].astype(cdtype)

 ###############################################################################
-# Let's now define the distributed operator and model as well as compute the
-# data. Compared to the MPI tutorial, we need to make sure that we set CuPy
-# as the engine and use CuPy arrays
+# For MPIMDCOperator, there is no change needed to have it run with
+# MPI. This PyLops operator has GPU-support
+# (https://pylops.readthedocs.io/en/stable/gpu.html) so it can operate on
+# distributed arrays with engine set to CuPy.

 # Move operator kernel to GPU
 G = cp.asarray(G)
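The hunk above keeps the frequency-axis splitting of the kernel from the CPU tutorial and simply moves each rank's local batch to its GPU with cp.asarray. A rough standalone sketch of that idea (np.array_split stands in for the tutorial's own splitting logic, and the kernel shape and values are made up for illustration):

    import numpy as np
    import cupy as cp
    from mpi4py import MPI

    rank = MPI.COMM_WORLD.Get_rank()
    size = MPI.COMM_WORLD.Get_size()

    nfmax = 128                                  # illustrative number of frequencies
    nf_rank = np.array_split(np.arange(nfmax), size)[rank]
    G_local = np.random.randn(len(nf_rank), 4, 4).astype(np.complex64)
    G_local = cp.asarray(G_local)                # move this rank's kernel batch to its GPU
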
@@ -182,7 +195,9 @@
 # We are now ready to compute the adjoint (i.e., cross-correlation) and invert
 # back for our input model. This computation will be done in GPU. The call
 # :code:`asarray()` triggers CUDA-aware MPI communication (gather result
-# from each GPU). But array :code:`madjloc` still live in GPU memory
+# from each GPU). Note that the arrays :code:`madjloc` and :code:`minvloc`
+# still live in GPU memory.
+

 # Adjoint
 madj = MDCop.H @ d
@@ -196,6 +211,10 @@
 minv = pylops_mpi.cgls(MDCop, d, x0=m0, niter=50, show=True if rank == 0 else False)[0]
 minvloc = minv.asarray().real.reshape(2 * par["nt"] - 1, par["nx"])

+###############################################################################
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.
+
 if rank == 0:
     fig = plt.figure(figsize=(8, 6))
     ax1 = plt.subplot2grid((1, 5), (0, 0), colspan=2)

tutorials_cupy/poststack_cupy.py

Lines changed: 45 additions & 23 deletions
@@ -2,8 +2,8 @@
 Post Stack Inversion - 3D with CUDA-Aware MPI
 =============================================
 This tutorial is an extension of the :ref:`sphx_glr_tutorials_poststack.py`
-tutorial where PyLops-MPI is run in multi-GPU setting with GPUs communicating via
-CUDA-Aware MPI.
+tutorial where PyLops-MPI is run in a multi-GPU setting with GPUs communicating
+via MPI.
 """

 import numpy as np
@@ -18,15 +18,25 @@

 import pylops_mpi

+###############################################################################
+# The standard MPI communicator is used in this example, so there is no need
+# for any initialization. However, we need to assign our GPU resources to the
+# different ranks. Here we decide to assign a unique GPU to each process if
+# the number of ranks is equal or smaller than that of the GPUs. Otherwise we
+# start assigning more than one GPU to the available ranks. Note that this
+# approach will work equally well if we have a multi-node multi-GPU setup, where
+# each node has one or more GPUs.
+
 plt.close("all")
 rank = MPI.COMM_WORLD.Get_rank()
 device_count = cp.cuda.runtime.getDeviceCount()
-cp.cuda.Device(rank % device_count).use()
+cp.cuda.Device(rank % device_count).use();

 ###############################################################################
 # Let's start by defining all the parameters required by the
 # :py:func:`pylops.avo.poststack.PoststackLinearModelling` operator.
-# Note that this section is exactly the same as the one in the MPI example as we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.)
+# Note that this section is exactly the same as the one in the MPI example as
+# we will keep using MPI for transferring metadata (i.e., shapes, dims, etc.)

 # Model
 model = np.load("../testdata/avo/poststack_model.npz")
@@ -58,31 +68,34 @@
 mback3d = np.concatenate(MPI.COMM_WORLD.allgather(mback3d_i))

 ###############################################################################
-# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray` objects.
-# Compared to the MPI tutorial, we need to make sure that we set CuPy as the engine and
-# use CuPy arrays
+# We are now ready to initialize various :py:class:`pylops_mpi.DistributedArray`
+# objects. Compared to the MPI tutorial, we need to make sure that we set ``cupy``
+# as the engine and fill the distributed arrays with CuPy arrays.

-m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy")
+m3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                       engine="cupy")
 m3d_dist[:] = cp.asarray(m3d_i.flatten())

 # Do the same thing for smooth model
-mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz, engine="cupy")
+mback3d_dist = pylops_mpi.DistributedArray(global_shape=ny * nx * nz,
+                                           engine="cupy")
 mback3d_dist[:] = cp.asarray(mback3d_i.flatten())

 ###############################################################################
-# For PostStackLinearModelling, there is no change needed to have it run with CUDA-Aware MPI.
-# This PyLops operator has GPU-support (https://pylops.readthedocs.io/en/stable/gpu.html)
-# so it can run with DistributedArray whose engine is Cupy
+# For PostStackLinearModelling, there is no change needed to have it run with
+# MPI. This PyLops operator has GPU-support
+# (https://pylops.readthedocs.io/en/stable/gpu.html) so it can operate on
+# distributed arrays with engine set to CuPy.

 PPop = PoststackLinearModelling(cp.asarray(wav.astype(np.float32)), nt0=nz,
                                 spatdims=(ny_i, nx))
 Top = Transpose((ny_i, nx, nz), (2, 0, 1))
 BDiag = pylops_mpi.basicoperators.MPIBlockDiag(ops=[Top.H @ PPop @ Top, ])

 ###############################################################################
-# This computation will be done in GPU. The call :code:`asarray()` triggers the CUDA-aware
-# MPI communication (gather result from each GPU).
-# But array :code:`d` and :code:`d_0` still live in GPU memory
+# This computation will be done on the GPU(s). The call :code:`asarray()`
+# triggers the MPI communication (gather results from each GPU).
+# Note that the arrays :code:`d` and :code:`d_0` still live in GPU memory.

 d_dist = BDiag @ m3d_dist
 d_local = d_dist.local_array.reshape((ny_i, nx, nz))
@@ -91,24 +104,30 @@
 d_0 = d_dist.asarray().reshape((ny, nx, nz))

 ###############################################################################
-# Inversion using CGLS solver - There is no code change to run on CUDA-aware
-# MPI (this is handled through MPI operator and DistributedArray)
+# Inversion using CGLS solver - no code change is required to run the solver
+# with CUDA-aware MPI (this is handled by the MPI operator and DistributedArray)
 # In this particular case, the local computation will be done in GPU.
 # Collective communication calls will be carried through MPI GPU-to-GPU.

 # Inversion using CGLS solver
-minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist, x0=mback3d_dist, niter=100, show=True)[0]
+minv3d_iter_dist = pylops_mpi.optimization.basic.cgls(BDiag, d_dist,
+                                                      x0=mback3d_dist,
+                                                      niter=100, show=True)[0]
 minv3d_iter = minv3d_iter_dist.asarray().reshape((ny, nx, nz))

 ###############################################################################

 # Regularized inversion with normal equations
 epsR = 1e2
-LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2), weights=(1, 1, 1),
-                                sampling=(1, 1, 1), dtype=BDiag.dtype)
+LapOp = pylops_mpi.MPILaplacian(dims=(ny, nx, nz), axes=(0, 1, 2),
+                                weights=(1, 1, 1),
+                                sampling=(1, 1, 1),
+                                dtype=BDiag.dtype)
 NormEqOp = BDiag.H @ BDiag + epsR * LapOp.H @ LapOp
 dnorm_dist = BDiag.H @ d_dist
-minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist, x0=mback3d_dist, niter=100, show=True)[0]
+minv3d_ne_dist = pylops_mpi.optimization.basic.cg(NormEqOp, dnorm_dist,
+                                                  x0=mback3d_dist,
+                                                  niter=100, show=True)[0]
 minv3d_ne = minv3d_ne_dist.asarray().reshape((ny, nx, nz))

 ###############################################################################
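For reference, the cg call in the hunk above solves the regularized normal equations assembled just before it; in the tutorial's own variable names this amounts to solving

    (BDiag.H @ BDiag + epsR * LapOp.H @ LapOp) x = BDiag.H @ d_dist

i.e. a least-squares inversion with Laplacian smoothing of weight epsR applied to the model.
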
@@ -120,11 +139,14 @@
 dstack_dist = pylops_mpi.StackedDistributedArray([d_dist, d0_dist])

 dnorm_dist = BDiag.H @ d_dist
-minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist, x0=mback3d_dist, niter=100, show=True)[0]
+minv3d_reg_dist = pylops_mpi.optimization.basic.cgls(StackOp, dstack_dist,
+                                                     x0=mback3d_dist,
+                                                     niter=100, show=True)[0]
 minv3d_reg = minv3d_reg_dist.asarray().reshape((ny, nx, nz))

 ###############################################################################
-# To plot the inversion results, the array must be copied back to cpu via :code:`get()`
+# Finally we visualize the results. Note that the arrays must be copied back
+# to the CPU by calling the :code:`get()` method on the CuPy arrays.

 if rank == 0:
     # Check the distributed implementation gives the same result
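As with their CPU counterparts, the three scripts above are meant to be launched with an MPI launcher (for example something like mpiexec -n 2 python poststack_cupy.py; the exact launcher and rank count depend on the local MPI installation and on how many GPUs are available), with the additional requirement that the MPI build is CUDA-aware so that the GPU-to-GPU communication described in the comments can actually take place.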
