Commit d45a1b2

Author: Weichao Luo (committed)
Commit message: fix style.
1 parent 4c8fd1a commit d45a1b2

File tree: 9 files changed, +318 −226 lines

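The hunks shown below all apply the same treatment: wrapped signatures and call sites are re-broken to one argument per line with a trailing comma, and multi-line string arguments get a closing parenthesis on its own line. The pattern matches what an auto-formatter such as black produces at a wide line length, though the commit message only says "fix style.", so the exact tool is an inference. A minimal before/after sketch of the convention (simplified, untyped signature; the real methods appear in the diffs below):

# Before: arguments packed onto wrapped lines, no trailing comma.
def send_to_decode_node(
    self, move_tasks, mem_managers, dp_size_in_node,
    nccl_comm
):
    ...

# After: one argument per line plus a trailing comma, so a later change to
# any single argument shows up as a one-line diff.
def send_to_decode_node(
    self,
    move_tasks,
    mem_managers,
    dp_size_in_node,
    nccl_comm,
):
    ...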

lightllm/common/deepseek2_mem_manager.py

Lines changed: 20 additions & 8 deletions
@@ -42,8 +42,11 @@ def alloc_kv_move_buffer(self, max_req_total_len):
         return
 
     def send_to_decode_node(
-        self, move_tasks: List[KVMoveTask], mem_managers: List["Deepseek2MemoryManager"], dp_size_in_node: int,
-        nccl_comm: PyNcclCommunicator
+        self,
+        move_tasks: List[KVMoveTask],
+        mem_managers: List["Deepseek2MemoryManager"],
+        dp_size_in_node: int,
+        nccl_comm: PyNcclCommunicator,
     ):
         assert dp_size_in_node == 1
 
@@ -69,8 +72,11 @@ def _get_kv_move_data(self, token_indexes: List[int], layer_index: int):
         return move_buffer
 
     def receive_from_prefill_node(
-        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size_in_node: int,
-        nccl_comm: PyNcclCommunicator
+        self,
+        move_tasks: List[KVMoveTask],
+        mem_managers: List["MemoryManager"],
+        dp_size_in_node: int,
+        nccl_comm: PyNcclCommunicator,
     ):
         assert dp_size_in_node == 1
 
@@ -102,8 +108,11 @@ def _write_kv_move_data(self, token_indexes: torch.Tensor, buffer_tensor: torch.
         return
 
     def send_to_decode_node_p2p(
-        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size_in_node: int,
-        nccl_comm: PyNcclCommunicator
+        self,
+        move_tasks: List[KVMoveTask],
+        mem_managers: List["MemoryManager"],
+        dp_size_in_node: int,
+        nccl_comm: PyNcclCommunicator,
     ):
         """
         使用 p2p triton kernel 进行数据复制和传输的实现方式。
@@ -155,8 +164,11 @@ def _get_kv_move_data_p2p(
         return move_buffer
 
     def receive_from_prefill_node_p2p(
-        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size_in_node: int,
-        nccl_comm: PyNcclCommunicator
+        self,
+        move_tasks: List[KVMoveTask],
+        mem_managers: List["MemoryManager"],
+        dp_size_in_node: int,
+        nccl_comm: PyNcclCommunicator,
     ):
         if not hasattr(self, "mem_ptrs_dict"):
             self.mem_ptrs_dict = {}
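For context on how the re-wrapped signature reads at a call site, here is a hedged sketch of a prefill-side caller. Only the parameter list and the dp_size_in_node == 1 assertion come from the diff above; the function name and the way the arguments are obtained are assumptions for illustration.

# Hypothetical caller sketch; everything outside the method signature is assumed.
def push_kv_to_decode(mem_manager, move_tasks, mem_managers, nccl_comm):
    # The method body asserts dp_size_in_node == 1, so only the single-DP case is valid here.
    mem_manager.send_to_decode_node(
        move_tasks=move_tasks,        # List[KVMoveTask] scheduled for transfer
        mem_managers=mem_managers,    # List["Deepseek2MemoryManager"], one per rank (assumed)
        dp_size_in_node=1,
        nccl_comm=nccl_comm,          # PyNcclCommunicator carrying the transfer
    )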

lightllm/common/mem_manager.py

Lines changed: 18 additions & 6 deletions
@@ -87,8 +87,11 @@ def alloc_kv_move_buffer(self, max_req_total_len):
         return
 
     def send_to_decode_node(
-        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size_in_node: int,
-        nccl_comm: PyNcclCommunicator
+        self,
+        move_tasks: List[KVMoveTask],
+        mem_managers: List["MemoryManager"],
+        dp_size_in_node: int,
+        nccl_comm: PyNcclCommunicator,
     ):
         assert dp_size_in_node == 1
 
@@ -124,7 +127,10 @@ def _get_kv_move_data(self, token_indexes: List[int], layer_index: int):
         return move_buffer
 
     def receive_from_prefill_node(
-        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size_in_node: int,
+        self,
+        move_tasks: List[KVMoveTask],
+        mem_managers: List["MemoryManager"],
+        dp_size_in_node: int,
         nccl_comm: PyNcclCommunicator,
     ):
         assert dp_size_in_node == 1
@@ -158,8 +164,11 @@ def _write_kv_move_data(self, token_indexes: torch.Tensor, buffer_tensor: torch.
         return
 
     def send_to_decode_node_p2p(
-        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size_in_node: int,
-        nccl_comm: PyNcclCommunicator
+        self,
+        move_tasks: List[KVMoveTask],
+        mem_managers: List["MemoryManager"],
+        dp_size_in_node: int,
+        nccl_comm: PyNcclCommunicator,
     ):
         """
         使用 p2p triton kernel 进行数据复制和传输的实现方式。
@@ -190,7 +199,10 @@ def _get_kv_move_data_p2p(self, token_indexes: torch.Tensor, layer_index: int, k
         return move_buffer
 
     def receive_from_prefill_node_p2p(
-        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size_in_node: int,
+        self,
+        move_tasks: List[KVMoveTask],
+        mem_managers: List["MemoryManager"],
+        dp_size_in_node: int,
         nccl_comm: PyNcclCommunicator,
     ):
         assert dp_size_in_node == 1
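The base MemoryManager methods mirror the Deepseek2 variants above, so the decode side is symmetric. A hedged sketch of the receiving call, with the same caveats (only the signature and the assertion are taken from the diff; the surrounding names are assumed):

# Hypothetical receiver sketch; names outside the method signature are assumed.
def pull_kv_from_prefill(mem_manager, move_tasks, mem_managers, nccl_comm):
    mem_manager.receive_from_prefill_node(
        move_tasks=move_tasks,
        mem_managers=mem_managers,
        dp_size_in_node=1,    # the method asserts dp_size_in_node == 1
        nccl_comm=nccl_comm,
    )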

lightllm/distributed/pynccl.py

Lines changed: 81 additions & 57 deletions
@@ -33,19 +33,27 @@
 from torch.distributed import ProcessGroup, ReduceOp, TCPStore
 
 from lightllm.distributed.pynccl_wrapper import (
-    NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum,
-    ncclRedOpTypeEnum, ncclUniqueId)
+    NCCLLibrary,
+    buffer_type,
+    cudaStream_t,
+    ncclComm_t,
+    ncclDataTypeEnum,
+    ncclRedOpTypeEnum,
+    ncclUniqueId,
+)
 
 logger = logging.getLogger(__name__)
 
 _current_stream = None
 
+
 def current_stream() -> torch.cuda.Stream:
     global _current_stream
     if _current_stream is None:
         _current_stream = torch.cuda.current_stream()
     return _current_stream
 
+
 @dataclasses.dataclass
 class StatelessP2PProcessGroup:
     """A dataclass to hold a metadata store, and the rank, world_size of the
@@ -94,18 +102,13 @@ def expire_data(self):
 
     def recv_obj(self) -> Any:
         """Receive an object from a source rank."""
-        obj = pickle.loads(
-            self.store.get(
-                f"send_to/{self.dest_id}/{self.recv_src_counter}"))
+        obj = pickle.loads(self.store.get(f"send_to/{self.dest_id}/{self.recv_src_counter}"))
         self.recv_src_counter += 1
         return obj
 
     @staticmethod
     def create(
-        src_id: int,
-        dest_id: int,
-        is_server: bool,
-        store: torch._C._distributed_c10d.Store
+        src_id: int, dest_id: int, is_server: bool, store: torch._C._distributed_c10d.Store
     ) -> "StatelessP2PProcessGroup":
         """A replacement for `torch.distributed.init_process_group` that does not
         pollute the global state.
@@ -121,12 +124,11 @@ def create(
         used for exchanging metadata. With this function, process A and process B
         can call `StatelessProcessGroup.create` to form a group, and then process A, B,
         C, and D can call `StatelessProcessGroup.create` to form another group.
-        """ # noqa
+        """  # noqa
         return StatelessP2PProcessGroup(src_id=src_id, dest_id=dest_id, is_server=is_server, store=store)
 
 
 class PyNcclCommunicator:
-
     def __init__(
         self,
         group: Union[ProcessGroup, StatelessP2PProcessGroup],
@@ -146,8 +148,9 @@ def __init__(
         """
         if not isinstance(group, StatelessP2PProcessGroup):
             assert dist.is_initialized()
-            assert dist.get_backend(group) != dist.Backend.NCCL, (
-                "PyNcclCommunicator should be attached to a non-NCCL group.")
+            assert (
+                dist.get_backend(group) != dist.Backend.NCCL
+            ), "PyNcclCommunicator should be attached to a non-NCCL group."
             # note: this rank is the rank in the group
             self.rank = dist.get_rank(group)
             self.world_size = dist.get_world_size(group)
@@ -207,8 +210,7 @@ def __init__(
         # `torch.cuda.device` is a context manager that changes the
         # current cuda device to the specified one
         with torch.cuda.device(device):
-            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
-                self.world_size, self.unique_id, self.rank)
+            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(self.world_size, self.unique_id, self.rank)
 
         stream = current_stream()
         # A small all_reduce for warmup.
@@ -220,103 +222,120 @@ def __init__(
     def destroy(self):
         self.nccl.ncclCommDestroy(self.comm)
 
-    def all_reduce(self,
-                   in_tensor: torch.Tensor,
-                   op: ReduceOp = ReduceOp.SUM,
-                   stream=None) -> torch.Tensor:
+    def all_reduce(self, in_tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None) -> torch.Tensor:
         if self.disabled:
             return None
         # nccl communicator created on a specific device
         # will only work on tensors on the same device
         # otherwise it will cause "illegal memory access"
         assert in_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {in_tensor.device}")
+            f"but the input tensor is on {in_tensor.device}"
+        )
 
         out_tensor = torch.empty_like(in_tensor)
 
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()),
-                                buffer_type(out_tensor.data_ptr()),
-                                in_tensor.numel(),
-                                ncclDataTypeEnum.from_torch(in_tensor.dtype),
-                                ncclRedOpTypeEnum.from_torch(op), self.comm,
-                                cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclAllReduce(
+            buffer_type(in_tensor.data_ptr()),
+            buffer_type(out_tensor.data_ptr()),
+            in_tensor.numel(),
+            ncclDataTypeEnum.from_torch(in_tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
         return out_tensor
 
-    def all_gather(self,
-                   output_tensor: torch.Tensor,
-                   input_tensor: torch.Tensor,
-                   stream=None):
+    def all_gather(self, output_tensor: torch.Tensor, input_tensor: torch.Tensor, stream=None):
         if self.disabled:
             return
         # nccl communicator created on a specific device
         # will only work on tensors on the same device
        # otherwise it will cause "illegal memory access"
         assert input_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {input_tensor.device}")
+            f"but the input tensor is on {input_tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
         self.nccl.ncclAllGather(
             buffer_type(input_tensor.data_ptr()),
-            buffer_type(output_tensor.data_ptr()), input_tensor.numel(),
-            ncclDataTypeEnum.from_torch(input_tensor.dtype), self.comm,
-            cudaStream_t(stream.cuda_stream))
-
-    def reduce_scatter(self,
-                       output_tensor: torch.Tensor,
-                       input_tensor: torch.Tensor,
-                       op: ReduceOp = ReduceOp.SUM,
-                       stream=None):
+            buffer_type(output_tensor.data_ptr()),
+            input_tensor.numel(),
+            ncclDataTypeEnum.from_torch(input_tensor.dtype),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def reduce_scatter(
+        self, output_tensor: torch.Tensor, input_tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None
+    ):
         if self.disabled:
             return
         # nccl communicator created on a specific device
         # will only work on tensors on the same device
         # otherwise it will cause "illegal memory access"
         assert input_tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {input_tensor.device}")
+            f"but the input tensor is on {input_tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
         self.nccl.ncclReduceScatter(
             buffer_type(input_tensor.data_ptr()),
-            buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
+            buffer_type(output_tensor.data_ptr()),
+            output_tensor.numel(),
             ncclDataTypeEnum.from_torch(input_tensor.dtype),
-            ncclRedOpTypeEnum.from_torch(op), self.comm,
-            cudaStream_t(stream.cuda_stream))
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def send(self, tensor: torch.Tensor, dst: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
-                           ncclDataTypeEnum.from_torch(tensor.dtype), dst,
-                           self.comm, cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclSend(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            dst,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def recv(self, tensor: torch.Tensor, src: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
-                           ncclDataTypeEnum.from_torch(tensor.dtype), src,
-                           self.comm, cudaStream_t(stream.cuda_stream))
+        self.nccl.ncclRecv(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
 
     def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
-            f"but the input tensor is on {tensor.device}")
+            f"but the input tensor is on {tensor.device}"
+        )
         if stream is None:
             stream = current_stream()
         if src == self.rank:
@@ -326,7 +345,12 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
         else:
             sendbuff = buffer_type()
         recvbuff = buffer_type(tensor.data_ptr())
-        self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(),
-                                ncclDataTypeEnum.from_torch(tensor.dtype), src,
-                                self.comm, cudaStream_t(stream.cuda_stream))
-
+        self.nccl.ncclBroadcast(
+            sendbuff,
+            recvbuff,
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
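The reformatted PyNcclCommunicator methods keep their original behaviour: all_reduce allocates and returns a fresh output tensor, send/recv are rank-addressed, and every call falls back to current_stream() when no stream is passed. A hedged usage sketch follows; how the communicator is constructed is not fully visible in these hunks, so comm is taken as given, and the driver function itself is an assumption.

import torch
from torch.distributed import ReduceOp

# `comm` is an already-constructed PyNcclCommunicator; only the method
# signatures used here are taken from the diff above.
def warmup_and_exchange(comm, rank: int, peer: int):
    x = torch.ones(16, device=comm.device)

    # all_reduce returns a new tensor (out_tensor = torch.empty_like(in_tensor) internally).
    y = comm.all_reduce(x, op=ReduceOp.SUM)

    # Point-to-point: the tensor must live on comm.device, or the device assert fires.
    if rank == 0:
        comm.send(y, dst=peer)
    else:
        comm.recv(y, src=peer)
    return y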
