ModelTC
diff --git a/‎lightllm/distributed/device_communicators/pynccl.py‎
Lines changed: 46 additions & 29 deletions b/‎lightllm/distributed/device_communicators/pynccl.py‎
Lines changed: 46 additions & 29 deletions
diff --git a/‎lightllm/distributed/device_communicators/pynccl_wrapper.py‎
Lines changed: 54 additions & 49 deletions b/‎lightllm/distributed/device_communicators/pynccl_wrapper.py‎
Lines changed: 54 additions & 49 deletions
@@ -1,4 +1,4 @@
-# Adapted from 
+# Adapted from
 # https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/distributed/device_communicators/pynccl.py
 # of the vllm-project/vllm GitHub repository.
 #
@@ -26,15 +26,20 @@
 from torch.distributed import ProcessGroup, ReduceOp
 
 from lightllm.distributed.device_communicators.pynccl_wrapper import (
-    NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum,
-    ncclRedOpTypeEnum, ncclUniqueId)
+    NCCLLibrary,
+    buffer_type,
+    cudaStream_t,
+    ncclComm_t,
+    ncclDataTypeEnum,
+    ncclRedOpTypeEnum,
+    ncclUniqueId,
+)
 from lightllm.utils.log_utils import init_logger
 
 logger = init_logger(__name__)
 
 
 class PyNcclCommunicator:
-
     def __init__(
         self,
         group: ProcessGroup,
@@ -53,8 +58,9 @@ def __init__(
         is bind to a unique device.
         """
         assert dist.is_initialized()
-        assert dist.get_backend(group) != dist.Backend.NCCL, (
-            "PyNcclCommunicator should be attached to a non-NCCL group.")
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "PyNcclCommunicator should be attached to a non-NCCL group."
         self.group = group
         # note: this rank is the rank in the group
         self.rank = dist.get_rank(group)
@@ -105,8 +111,7 @@ def __init__(
         # `torch.cuda.device` is a context manager that changes the
         # current cuda device to the specified one
         with torch.cuda.device(device):
-            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
-                self.world_size, self.unique_id, self.rank)
+            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(self.world_size, self.unique_id, self.rank)
             self.stream = torch.cuda.Stream()
 
             # A small all_reduce for warmup.
@@ -120,54 +125,66 @@ def __init__(
         # when we are using CUDA graph.
         self.disabled = True
 
-    def all_reduce(self,
-                   tensor: torch.Tensor,
-                   op: ReduceOp = ReduceOp.SUM,
-                   stream=None):
+    def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None):
         if self.disabled:
             return
         # nccl communicator created on a specific device
         # will only work on tensors on the same device
         # otherwise it will cause "illegal memory access"
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = self.stream
-        self.nccl.ncclAllReduce(buffer_type(tensor.data_ptr()),
-                                buffer_type(tensor.data_ptr()), tensor.numel(),
-                                ncclDataTypeEnum.from_torch(tensor.dtype),
-                                ncclRedOpTypeEnum.from_torch(op), self.comm,
-                                cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclAllReduce(
+            buffer_type(tensor.data_ptr()),
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def send(self, tensor: torch.Tensor, dst: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = self.stream
-        self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
-                           ncclDataTypeEnum.from_torch(tensor.dtype), dst,
-                           self.comm, cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclSend(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            dst,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def recv(self, tensor: torch.Tensor, src: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = self.stream
-        self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
-                           ncclDataTypeEnum.from_torch(tensor.dtype), src,
-                           self.comm, cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclRecv(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     @contextmanager
-    def change_state(self,
-                     enable: Optional[bool] = None,
-                     stream: Optional[torch.cuda.Stream] = None):
+    def change_state(self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None):
         """
         A context manager to change the state of the communicator.
         """
 
@@ -1,4 +1,4 @@
-# Adapted from 
+# Adapted from
 # https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/distributed/device_communicators/pynccl_wrapper.py
 # of the vllm-project/vllm GitHub repository.
 #
@@ -146,46 +146,43 @@ class NCCLLibrary:
         # const char* ncclGetErrorString(ncclResult_t result)
         Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
         # ncclResult_t  ncclGetVersion(int *version);
-        Function("ncclGetVersion", ncclResult_t,
-                 [ctypes.POINTER(ctypes.c_int)]),
+        Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]),
         # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
-        Function("ncclGetUniqueId", ncclResult_t,
-                 [ctypes.POINTER(ncclUniqueId)]),
+        Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]),
         # ncclResult_t  ncclCommInitRank(
         #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
         # note that ncclComm_t is a pointer type, so the first argument
         # is a pointer to a pointer
-        Function("ncclCommInitRank", ncclResult_t, [
-            ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId,
-            ctypes.c_int
-        ]),
+        Function(
+            "ncclCommInitRank", ncclResult_t, [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int]
+        ),
         # ncclResult_t  ncclAllReduce(
         #   const void* sendbuff, void* recvbuff, size_t count,
         #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
         #   cudaStream_t stream);
         # note that cudaStream_t is a pointer type, so the last argument
         # is a pointer
-        Function("ncclAllReduce", ncclResult_t, [
-            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
-            ncclRedOp_t, ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclAllReduce",
+            ncclResult_t,
+            [buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t, ncclRedOp_t, ncclComm_t, cudaStream_t],
+        ),
         # ncclResult_t  ncclSend(
         #   const void* sendbuff, size_t count, ncclDataType_t datatype,
         #   int dest, ncclComm_t comm, cudaStream_t stream);
-        Function("ncclSend", ncclResult_t, [
-            buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
-            ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclSend",
+            ncclResult_t,
+            [buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int, ncclComm_t, cudaStream_t],
+        ),
         # ncclResult_t  ncclRecv(
         #   void* recvbuff, size_t count, ncclDataType_t datatype,
         #   int src, ncclComm_t comm, cudaStream_t stream);
-        Function("ncclRecv", ncclResult_t, [
-            buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int,
-            ncclComm_t, cudaStream_t
-        ]),
-
+        Function(
+            "ncclRecv",
+            ncclResult_t,
+            [buffer_type, ctypes.c_size_t, ncclDataType_t, ctypes.c_int, ncclComm_t, cudaStream_t],
+        ),
         # be cautious! this is a collective call, it will block until all
         # processes in the communicator have called this function.
         # because Python object destruction can happen in random order,
@@ -219,8 +216,10 @@ def __init__(self, so_file: Optional[str] = None):
                 "or it does not support the current platform %s."
                 "If you already have the library, please set the "
                 "environment variable VLLM_NCCL_SO_PATH"
-                " to point to the correct nccl library path.", so_file,
-                platform.platform())
+                " to point to the correct nccl library path.",
+                so_file,
+                platform.platform(),
+            )
             raise e
 
         if so_file not in NCCLLibrary.path_to_dict_mapping:
@@ -253,45 +252,51 @@ def ncclGetVersion(self) -> str:
 
     def ncclGetUniqueId(self) -> ncclUniqueId:
         unique_id = ncclUniqueId()
-        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](
-            ctypes.byref(unique_id)))
+        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
         return unique_id
 
-    def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId,
-                         rank: int) -> ncclComm_t:
+    def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId, rank: int) -> ncclComm_t:
         comm = ncclComm_t()
-        self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm),
-                                                        world_size, unique_id,
-                                                        rank))
+        self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm), world_size, unique_id, rank))
         return comm
 
-    def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
-                      count: int, datatype: int, op: int, comm: ncclComm_t,
-                      stream: cudaStream_t) -> None:
+    def ncclAllReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
         # `datatype` actually should be `ncclDataType_t`
         # and `op` should be `ncclRedOp_t`
         # both are aliases of `ctypes.c_int`
         # when we pass int to a function, it will be converted to `ctypes.c_int`
         # by ctypes automatically
-        self.NCCL_CHECK(self._funcs["ncclAllReduce"](sendbuff, recvbuff, count,
-                                                     datatype, op, comm,
-                                                     stream))
+        self.NCCL_CHECK(self._funcs["ncclAllReduce"](sendbuff, recvbuff, count, datatype, op, comm, stream))
 
-    def ncclSend(self, sendbuff: buffer_type, count: int, datatype: int,
-                 dest: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
-        self.NCCL_CHECK(self._funcs["ncclSend"](sendbuff, count, datatype,
-                                                dest, comm, stream))
+    def ncclSend(
+        self, sendbuff: buffer_type, count: int, datatype: int, dest: int, comm: ncclComm_t, stream: cudaStream_t
+    ) -> None:
+        self.NCCL_CHECK(self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream))
 
-    def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int,
-                 src: int, comm: ncclComm_t, stream: cudaStream_t) -> None:
-        self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src,
-                                                comm, stream))
+    def ncclRecv(
+        self, recvbuff: buffer_type, count: int, datatype: int, src: int, comm: ncclComm_t, stream: cudaStream_t
+    ) -> None:
+        self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream))
 
     def ncclCommDestroy(self, comm: ncclComm_t) -> None:
         self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
 
 
 __all__ = [
-    "NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId",
-    "ncclComm_t", "cudaStream_t", "buffer_type"
+    "NCCLLibrary",
+    "ncclDataTypeEnum",
+    "ncclRedOpTypeEnum",
+    "ncclUniqueId",
+    "ncclComm_t",
+    "cudaStream_t",
+    "buffer_type",
 ]