Fix flake8 warnings, add docstrings to _nccl functions, and update gpu.rst

tharittk · tharittk · commit 95e32bfa2898 · 2025-06-29T04:23:33.000-05:00
diff --git a/docs/source/gpu.rst b/docs/source/gpu.rst
@@ -153,8 +153,8 @@ In the following, we provide a list of modules (i.e., operators and solvers) whe
    * - :class:`pylops_mpi.optimization.basic.cgls`
      - ✅ 
    * - :class:`pylops_mpi.signalprocessing.Fredholm1`
-     - Planned ⏳
-   * - ISTA Solver
-     - Planned ⏳
+     - ✅ 
    * - Complex Numeric Data Type for NCCL 
-     - Planned ⏳ 
+     - ✅ 
+   * - ISTA Solver
+     - Planned ⏳
diff --git a/pylops_mpi/signalprocessing/Fredholm1.py b/pylops_mpi/signalprocessing/Fredholm1.py
@@ -111,7 +111,7 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
         if x.partition not in [Partition.BROADCAST, Partition.UNSAFE_BROADCAST]:
             raise ValueError(f"x should have partition={Partition.BROADCAST},{Partition.UNSAFE_BROADCAST}"
                              f"Got  {x.partition} instead...")
-        y = DistributedArray(global_shape=self.shape[0], 
+        y = DistributedArray(global_shape=self.shape[0],
                              base_comm=x.base_comm,
                              base_comm_nccl=x.base_comm_nccl,
                              partition=x.partition,
@@ -128,11 +128,6 @@ def _matvec(self, x: DistributedArray) -> DistributedArray:
             for isl in range(self.nsls[self.rank]):
                 y1[isl] = ncp.dot(self.G[isl], x[isl])
         # gather results
-        # TODO: _allgather is supposed to be private to DistributedArray
-        # but so far, we do not take base_comm_nccl as an argument to Op.
-        # For consistency, y._allgather has to be called here.
-        # Alternatively, we can also do if-else checking x.base_comm_nccl, but that means
-        # we have to call function from _nccl.py
         y[:] = ncp.vstack(y._allgather(y1)).ravel()
         return y
 
@@ -141,7 +136,7 @@ def _rmatvec(self, x: NDArray) -> NDArray:
         if x.partition not in [Partition.BROADCAST, Partition.UNSAFE_BROADCAST]:
             raise ValueError(f"x should have partition={Partition.BROADCAST},{Partition.UNSAFE_BROADCAST}"
                              f"Got  {x.partition} instead...")
-        y = DistributedArray(global_shape=self.shape[1], 
+        y = DistributedArray(global_shape=self.shape[1],
                              base_comm=x.base_comm,
                              base_comm_nccl=x.base_comm_nccl,
                              partition=x.partition,
@@ -176,8 +171,8 @@ def _rmatvec(self, x: NDArray) -> NDArray:
         if self.usematmul and isinstance(recv, ncp.ndarray) :
             # unrolling
             chunk_size = self.ny * self.nz
-            num_partition = (len(recv)+chunk_size-1)//chunk_size
-            recv = ncp.vstack([recv[i*chunk_size: (i+1)*chunk_size].reshape(self.nz, self.ny).T for i in range(num_partition)])
+            num_partition = (len(recv) + chunk_size - 1) // chunk_size
+            recv = ncp.vstack([recv[i * chunk_size: (i + 1) * chunk_size].reshape(self.nz, self.ny).T for i in range(num_partition)])
         else:
             recv = ncp.vstack(recv)
         y[:] = recv.ravel()
diff --git a/pylops_mpi/utils/_nccl.py b/pylops_mpi/utils/_nccl.py
@@ -39,6 +39,20 @@ class NcclOp(IntEnum):
 
 
 def _nccl_buf_size(buf, count=None):
+    """ Get an appropriate buffer size according to the dtype of buf
+
+    Parameters
+    ----------
+    buf : :obj:`cupy.ndarray` or array-like
+        The data buffer from the local GPU to be sent.
+
+    count : :obj:`int`, optional
+        Number of elements to send from `buf`, if not sending every element in `buf`.
+    Returns
+    -------
+    :obj:`int`
+        An appropriate number of elements to send from `buf` for NCCL communication.
+    """
     if buf.dtype in ['complex64', 'complex128']:
         return 2 * count if count else 2 * buf.size
     else: