
Commit 51bc70c

Fix write-up as suggested by PR
1 parent 006ed1d commit 51bc70c

2 files changed: +8 -7 lines changed

2 files changed

+8
-7
lines changed

tutorials_nccl/lsm_nccl.py

Lines changed: 6 additions & 5 deletions
@@ -93,10 +93,11 @@
 # We create a :py:class:`pylops.waveeqprocessing.LSM` at each rank and then push them
 # into a :py:class:`pylops_mpi.basicoperators.MPIVStack` to perform a matrix-vector
 # product with the broadcasted reflectivity at every location on the subsurface.
-# Also, we must pass `nccl_comm` to `refl` in order to use NCCL for communications.
-# Noted that we allocate some arrays (wav, lsm.Demop.trav_srcs, and lsm.Demop.trav.recs)
-# to GPU upfront. Because we want a fair performace comparison, we avoid having
-# LSM internally copying arrays.
+# Note that we must use :code:`engine="cuda"` and move the wavelet :code:`wav` to the GPU prior to creating the operator.
+# Moreover, we allocate the traveltime tables (:code:`lsm.Demop.trav_srcs` and :code:`lsm.Demop.trav_recs`)
+# to the GPU prior to applying the operator to avoid incurring the penalty of performing
+# host-to-device memory copies every time the operator is applied. Finally, we must pass :code:`nccl_comm`
+# to the DistributedArray constructor used to create :code:`refl_dist` in order to use NCCL for communications.
 
 # Wavelet
 nt = 651
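
For context, the workflow these comments describe looks roughly like the
sketch below. This is a minimal illustration, not the tutorial's exact code:
the geometry variables (z, x, t, sources, recs, v0, wav, wavc, refl) and the
base_comm_nccl keyword used to hand nccl_comm to DistributedArray are
assumptions inferred from the comments above.

import cupy as cp
import pylops
import pylops_mpi

# engine="cuda" expects the wavelet to already live on the GPU
wav = cp.asarray(wav)
lsm = pylops.waveeqprocessing.LSM(z, x, t, sources, recs, v0, wav, wavc,
                                  mode="analytic", engine="cuda")

# Copy the traveltime tables to the GPU once, so that applying the operator
# does not incur host-to-device copies on every matrix-vector product
lsm.Demop.trav_srcs = cp.asarray(lsm.Demop.trav_srcs)
lsm.Demop.trav_recs = cp.asarray(lsm.Demop.trav_recs)

VStack = pylops_mpi.basicoperators.MPIVStack(ops=[lsm.Demop])

# Broadcast the reflectivity to all ranks; passing the NCCL communicator
# (keyword name assumed) makes DistributedArray use NCCL for collectives
refl_dist = pylops_mpi.DistributedArray(global_shape=refl.size,
                                        partition=pylops_mpi.Partition.BROADCAST,
                                        base_comm_nccl=nccl_comm,
                                        engine="cupy")
refl_dist[:] = cp.asarray(refl.ravel())
d_dist = VStack @ refl_dist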
@@ -139,7 +140,7 @@
 
 ###############################################################################
 # We calculate the inverse using the :py:func:`pylops_mpi.optimization.basic.cgls`
-# solver. Here, we pass the `nccl_comm` to `x0` to use NCCL as a communicator.
+# solver. Here, we pass the :code:`nccl_comm` to :code:`x0` to use NCCL as a communicator.
 # In this particular case, the local computation will be done in GPU.
 # Collective communication calls will be carried through NCCL GPU-to-GPU.
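
Sketched in code (same caveats as above; the cgls keyword arguments beyond
x0 are assumptions):

# x0 carries the NCCL communicator and the CuPy engine; the solver inherits
# both from it, so its collective calls run GPU-to-GPU through NCCL
x0 = pylops_mpi.DistributedArray(global_shape=refl.size,
                                 partition=pylops_mpi.Partition.BROADCAST,
                                 base_comm_nccl=nccl_comm,
                                 engine="cupy")
x0[:] = 0
minv_dist = pylops_mpi.optimization.basic.cgls(VStack, d_dist, x0=x0,
                                               niter=100, show=True)[0]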

tutorials_nccl/mdd_nccl.py

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@
 
 ###############################################################################
 # Let's start by defining all the parameters required by the
-# :py:func:`pylops.waveeqprocessing.MPIMDC` operator.
+# :py:class:`pylops.waveeqprocessing.MPIMDC` operator.
 # Note that this section is exactly the same as the one in the MPI example as
 # we will keep using MPI for transfering metadata (i.e., shapes, dims, etc.)

@@ -106,7 +106,7 @@
 # And now, we define the distributed operator MPIMDC and model as well as compute the data.
 # Both the model and data have to live in GPU. We also define the DistributedArray `m`
 # with `nccl_comm`` and engine="cupy" to use NCCL for communications (the data `d` will be set the same).
-# Noted that fftengine must be set to "numpy" in MDCop operator when running with CuPy
+# Note that fftengine must be set to "numpy" in MDCop operator when running with CuPy
 
 # Move operator kernel to GPU
 G = cp.asarray(G)
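
A rough sketch of the step these comments describe (the import path and
MPIMDC argument list are assumed to mirror pylops' MDC, and nt, dt, nr,
nccl_comm and the model values mwav are stand-ins for the tutorial's actual
variables):

import cupy as cp
import pylops_mpi
from pylops_mpi.waveeqprocessing import MPIMDC

# Move operator kernel to GPU
G = cp.asarray(G)

# fftengine must be "numpy" when the kernel G is a CuPy array
MDCop = MPIMDC(G, nt=2 * nt - 1, nv=1, dt=dt, dr=1.0,
               twosided=True, fftengine="numpy")

# Model m lives on the GPU; the NCCL communicator (keyword name assumed)
# routes collective communications through NCCL. The data d, computed from
# m, inherits the same engine and communicator.
m = pylops_mpi.DistributedArray(global_shape=(2 * nt - 1) * nr,  # illustrative shape
                                base_comm_nccl=nccl_comm,
                                engine="cupy")
m[:] = cp.asarray(mwav.ravel())
d = MDCop @ m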
