
Commit a08924b

fix flake8
1 parent dbe1f30 commit a08924b

4 files changed, +30 -34 lines changed


pylops_mpi/Distributed.py

Lines changed: 6 additions & 10 deletions
@@ -1,5 +1,3 @@
-from typing import Any, NewType, Tuple
-
 from mpi4py import MPI
 from pylops.utils import deps as pylops_deps  # avoid namespace crashes with pylops_mpi.utils
 from pylops_mpi.utils._mpi import mpi_allreduce, mpi_allgather, mpi_send, mpi_recv, _prepare_allgather_inputs, _unroll_allgather_recv
@@ -10,10 +8,10 @@
 
 if nccl_message is None and cupy_message is None:
     from pylops_mpi.utils._nccl import (
-        nccl_allgather, nccl_allreduce,
-        nccl_asarray, nccl_bcast, nccl_split, nccl_send, nccl_recv
+        nccl_allgather, nccl_allreduce, nccl_send, nccl_recv
     )
 
+
 class DistributedMixIn:
     r"""Distributed Mixin class
 
@@ -30,7 +28,7 @@ def _allreduce(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
         if deps.nccl_enabled and getattr(self, "base_comm_nccl"):
             return nccl_allreduce(self.base_comm_nccl, send_buf, recv_buf, op)
         else:
-            return mpi_allreduce(self.base_comm, send_buf,
+            return mpi_allreduce(self.base_comm, send_buf,
                                  recv_buf, self.engine, op)
 
     def _allreduce_subcomm(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
@@ -39,7 +37,7 @@ def _allreduce_subcomm(self, send_buf, recv_buf=None, op: MPI.Op = MPI.SUM):
         if deps.nccl_enabled and getattr(self, "base_comm_nccl"):
             return nccl_allreduce(self.sub_comm, send_buf, recv_buf, op)
         else:
-            return mpi_allreduce(self.sub_comm, send_buf,
+            return mpi_allreduce(self.sub_comm, send_buf,
                                  recv_buf, self.engine, op)
 
     def _allgather(self, send_buf, recv_buf=None):
@@ -96,7 +94,5 @@ def _recv(self, recv_buf=None, source=0, count=None, tag=0):
             return recv_buf
         else:
            return mpi_recv(self.base_comm,
-                           recv_buf, source, count, tag=tag,
-                           engine=self.engine)
-
-
+                           recv_buf, source, count, tag=tag,
+                           engine=self.engine)
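For context, the `_allreduce`, `_allreduce_subcomm` and `_recv` methods touched above all follow the same pattern: use the NCCL communicator when one is attached to the object, otherwise fall back to the MPI helpers in `pylops_mpi.utils._mpi`. A minimal standalone sketch of that branching (hypothetical helper name, simplified guard, no sub-communicator handling), not the library's actual implementation:

    # Sketch only: mirrors the NCCL-vs-MPI branching of DistributedMixIn._allreduce
    from mpi4py import MPI


    def allreduce_dispatch(base_comm, base_comm_nccl, send_buf,
                           recv_buf=None, engine="numpy", op=MPI.SUM):
        if base_comm_nccl is not None:
            # GPU-to-GPU collective; import deferred so the MPI-only path
            # works without NCCL/CuPy installed
            from pylops_mpi.utils._nccl import nccl_allreduce
            return nccl_allreduce(base_comm_nccl, send_buf, recv_buf, op)
        # CPU (or CUDA-aware MPI) path, same call as in the diff above
        from pylops_mpi.utils._mpi import mpi_allreduce
        return mpi_allreduce(base_comm, send_buf, recv_buf, engine, op)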

pylops_mpi/DistributedArray.py

Lines changed: 3 additions & 3 deletions
@@ -15,7 +15,7 @@
 nccl_message = deps.nccl_import("the DistributedArray module")
 
 if nccl_message is None and cupy_message is None:
-    from pylops_mpi.utils._nccl import nccl_asarray, nccl_bcast, nccl_split
+    from pylops_mpi.utils._nccl import nccl_asarray, nccl_bcast, nccl_split
     from cupy.cuda.nccl import NcclCommunicator
 else:
     NcclCommunicator = Any
@@ -613,14 +613,14 @@ def _compute_vector_norm(self, local_array: NDArray,
             # with MAX, MIN operator. Here we copy the array back to CPU, transfer, and copy them back to GPUs
             send_buf = ncp.max(ncp.abs(local_array), axis=axis).astype(ncp.float64)
             if self.engine == "cupy" and self.base_comm_nccl is None and not deps.cuda_aware_mpi_enabled:
-                # CuPy + non-CUDA-aware MPI: This will call non-buffered communication
+                # CuPy + non-CUDA-aware MPI: This will call non-buffered communication
                 # which return a list of object - must be copied back to a GPU memory.
                 recv_buf = self._allreduce_subcomm(send_buf.get(), recv_buf.get(), op=MPI.MAX)
                 recv_buf = ncp.asarray(ncp.squeeze(recv_buf, axis=axis))
             else:
                 recv_buf = self._allreduce_subcomm(send_buf, recv_buf, op=MPI.MAX)
                 # TODO (tharitt): In current implementation, there seems to be a semantic difference between Buffered MPI and NCCL
-                # the (1, size) is collapsed to (size, ) with buffered MPI while NCCL retains it.
+                # the (1, size) is collapsed to (size, ) with buffered MPI while NCCL retains it.
                 # There may be a way to unify it - may be something to do with how we allocate the recv_buf.
             if self.base_comm_nccl:
                 recv_buf = ncp.squeeze(recv_buf, axis=axis)
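The second hunk sits in the infinity-norm branch of `_compute_vector_norm`: each rank reduces its local array with max(|x|) and the partial results are then combined with an `MPI.MAX` allreduce (copying to host first when CuPy is used without CUDA-aware MPI). A NumPy-only sketch of that reduction pattern, assuming `mpi4py` and one local 1-D array per rank (the function name is illustrative, not part of the library):

    # Sketch of the inf-norm reduction pattern (NumPy engine only)
    import numpy as np
    from mpi4py import MPI


    def distributed_inf_norm(local_array, comm=MPI.COMM_WORLD):
        # local partial result: max of |x| on this rank
        send_buf = np.max(np.abs(local_array))
        # global inf-norm: MAX-allreduce over all ranks
        return comm.allreduce(send_buf, op=MPI.MAX)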

pylops_mpi/utils/_mpi.py

Lines changed: 20 additions & 21 deletions
@@ -9,13 +9,14 @@
     "_unroll_allgather_recv"
 ]
 
-from typing import Optional, Tuple
+from typing import Optional
 
 import numpy as np
 from mpi4py import MPI
 from pylops.utils.backend import get_module
 from pylops_mpi.utils import deps
 
+
 # TODO: return type annotation for both cupy and numpy
 def _prepare_allgather_inputs(send_buf, send_buf_shapes, engine):
     r""" Prepare send_buf and recv_buf for NCCL allgather (nccl_allgather)
@@ -33,7 +34,7 @@ def _prepare_allgather_inputs(send_buf, send_buf_shapes, engine):
         The data buffer from the local GPU to be sent for allgather.
     send_buf_shapes: :obj:`list`
         A list of shapes for each GPU send_buf (used to calculate padding size)
-    engine : :obj:`str`
+    engine : :obj:`str`
        Engine used to store array (``numpy`` or ``cupy``)
 
     Returns
@@ -96,20 +97,21 @@ def _unroll_allgather_recv(recv_buf, padded_send_buf_shape, send_buf_shapes) ->
 
     return chunks
 
+
 def mpi_allreduce(base_comm: MPI.Comm,
-                  send_buf, recv_buf=None,
+                  send_buf, recv_buf=None,
                   engine: Optional[str] = "numpy",
                   op: MPI.Op = MPI.SUM) -> np.ndarray:
-    """MPI_Allreduce/allreduce
-
-    Dispatch allreduce routine based on type of input and availability of
+    """MPI_Allreduce/allreduce
+
+    Dispatch allreduce routine based on type of input and availability of
     CUDA-Aware MPI
 
     Parameters
     ----------
     base_comm : :obj:`MPI.Comm`
         Base MPI Communicator.
-    send_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
+    send_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
         The data buffer from the local GPU to be reduced.
     recv_buf : :obj:`cupy.ndarray`, optional
         The buffer to store the result of the reduction. If None,
@@ -121,10 +123,10 @@ def mpi_allreduce(base_comm: MPI.Comm,
 
     Returns
     -------
-    recv_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
+    recv_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
         A buffer containing the result of the reduction, broadcasted
         to all GPUs.
-
+
     """
     if deps.cuda_aware_mpi_enabled or engine == "numpy":
         ncp = get_module(engine)
@@ -141,9 +143,8 @@ def mpi_allreduce(base_comm: MPI.Comm,
 
 
 def mpi_allgather(base_comm: MPI.Comm,
-                  send_buf, recv_buf=None,
-                  engine: Optional[str] = "numpy",
-                  ) -> np.ndarray:
+                  send_buf, recv_buf=None,
+                  engine: Optional[str] = "numpy") -> np.ndarray:
 
     if deps.cuda_aware_mpi_enabled or engine == "numpy":
         send_shapes = base_comm.allgather(send_buf.shape)
@@ -165,15 +166,15 @@ def mpi_send(base_comm: MPI.Comm,
              engine: Optional[str] = "numpy",
              ) -> None:
     """MPI_Send/send
-
-    Dispatch send routine based on type of input and availability of
+
+    Dispatch send routine based on type of input and availability of
     CUDA-Aware MPI
 
     Parameters
     ----------
     base_comm : :obj:`MPI.Comm`
         Base MPI Communicator.
-    send_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
+    send_buf : :obj:`numpy.ndarray` or :obj:`cupy.ndarray`
         The array containing data to send.
     dest: :obj:`int`
         The rank of the destination CPU/GPU device.
@@ -183,7 +184,6 @@ def mpi_send(base_comm: MPI.Comm,
         Tag of the message to be sent.
     engine : :obj:`str`, optional
         Engine used to store array (``numpy`` or ``cupy``)
-
     """
     if deps.cuda_aware_mpi_enabled or engine == "numpy":
         # Determine MPI type based on array dtype
@@ -195,11 +195,12 @@ def mpi_send(base_comm: MPI.Comm,
         # Uses CuPy without CUDA-aware MPI
         base_comm.send(send_buf, dest, tag)
 
+
 def mpi_recv(base_comm: MPI.Comm,
-             recv_buf=None, source=0, count=None, tag=0,
-             engine: Optional[str] = "numpy") -> np.ndarray:
+             recv_buf=None, source=0, count=None, tag=0,
+             engine: Optional[str] = "numpy") -> np.ndarray:
     """ MPI_Recv/recv
-    Dispatch receive routine based on type of input and availability of
+    Dispatch receive routine based on type of input and availability of
     CUDA-Aware MPI
 
     Parameters
@@ -216,7 +217,6 @@ def mpi_recv(base_comm: MPI.Comm,
         Tag of the message to be sent.
     engine : :obj:`str`, optional
         Engine used to store array (``numpy`` or ``cupy``)
-
     """
     if deps.cuda_aware_mpi_enabled or engine == "numpy":
         ncp = get_module(engine)
@@ -233,4 +233,3 @@ def mpi_recv(base_comm: MPI.Comm,
         # Uses CuPy without CUDA-aware MPI
         recv_buf = base_comm.recv(source=source, tag=tag)
     return recv_buf
-
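The docstrings re-wrapped above describe the dispatch rule used throughout `_mpi.py`: take the buffered (uppercase) mpi4py route when CUDA-aware MPI is available or the arrays are NumPy, and fall back to the pickle-based (lowercase) calls for CuPy without CUDA-aware MPI. A condensed sketch of that rule for allreduce, with `cuda_aware_mpi_enabled` standing in for `pylops_mpi.utils.deps.cuda_aware_mpi_enabled` and the function name purely illustrative:

    import numpy as np
    from mpi4py import MPI

    cuda_aware_mpi_enabled = False  # assumption for this sketch


    def allreduce_sketch(comm, send_buf, recv_buf=None, engine="numpy", op=MPI.SUM):
        if cuda_aware_mpi_enabled or engine == "numpy":
            # buffered path: operates directly on array memory
            if recv_buf is None:
                recv_buf = np.empty_like(send_buf)
            comm.Allreduce(send_buf, recv_buf, op)
            return recv_buf
        # CuPy without CUDA-aware MPI: object-based (pickled) communication
        return comm.allreduce(send_buf, op)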

pylops_mpi/utils/_nccl.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ def _nccl_sync():
         return
     cp.cuda.runtime.deviceSynchronize()
 
+
 def mpi_op_to_nccl(mpi_op) -> NcclOp:
     """ Map MPI reduction operation to NCCL equivalent
 