ModelTC
diff --git a/‎lightllm/common/basemodel/basemodel.py‎
Lines changed: 1 addition & 1 deletion b/‎lightllm/common/basemodel/basemodel.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lightllm/common/deepseek2_mem_manager.py‎
Lines changed: 17 additions & 8 deletions b/‎lightllm/common/deepseek2_mem_manager.py‎
Lines changed: 17 additions & 8 deletions
diff --git a/‎lightllm/common/mem_manager.py‎
Lines changed: 20 additions & 13 deletions b/‎lightllm/common/mem_manager.py‎
Lines changed: 20 additions & 13 deletions
diff --git a/‎lightllm/server/router/manager.py‎
Lines changed: 2 additions & 2 deletions b/‎lightllm/server/router/manager.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎lightllm/server/router/model_infer/mode_backend/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎lightllm/server/router/model_infer/mode_backend/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/__init__.PY‎ b/‎lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/__init__.PY‎
diff --git a/‎…inues_batch/decode_node_impl/__init__.py‎ ‎…tch/pd_mode/decode_node_impl/__init__.py‎lightllm/server/router/model_infer/mode_backend/continues_batch/decode_node_impl/__init__.py renamed to lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/__init__.py b/‎…inues_batch/decode_node_impl/__init__.py‎ ‎…tch/pd_mode/decode_node_impl/__init__.py‎lightllm/server/router/model_infer/mode_backend/continues_batch/decode_node_impl/__init__.py renamed to lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/__init__.py
diff --git a/‎…es_batch/decode_node_impl/decode_impl.py‎ ‎…/pd_mode/decode_node_impl/decode_impl.py‎lightllm/server/router/model_infer/mode_backend/continues_batch/decode_node_impl/decode_impl.py renamed to lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_impl.py
Lines changed: 2 additions & 2 deletions b/‎…es_batch/decode_node_impl/decode_impl.py‎ ‎…/pd_mode/decode_node_impl/decode_impl.py‎lightllm/server/router/model_infer/mode_backend/continues_batch/decode_node_impl/decode_impl.py renamed to lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_impl.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎…ch/decode_node_impl/decode_infer_rpyc.py‎ ‎…de/decode_node_impl/decode_infer_rpyc.py‎lightllm/server/router/model_infer/mode_backend/continues_batch/decode_node_impl/decode_infer_rpyc.py renamed to lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py b/‎…ch/decode_node_impl/decode_infer_rpyc.py‎ ‎…de/decode_node_impl/decode_infer_rpyc.py‎lightllm/server/router/model_infer/mode_backend/continues_batch/decode_node_impl/decode_infer_rpyc.py renamed to lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_infer_rpyc.py
@@ -158,7 +158,7 @@ def _init_mem_manager(self):
     def _init_kv_move_buffer(self):
         # p d 分离的推理模式下才需要做这一步初始化
         if self.run_mode in ["prefill", "decode"]:
-            self.mem_manager.alloc_kv_move_buffer(self.max_seq_length)
+            self.mem_manager.alloc_kv_move_buffer(self.mem_manager.size)
 
     def _check_mem_size(self):
         self.max_total_token_num = self.mem_manager.size
 
@@ -1,6 +1,7 @@
 import torch
 import os
 
+from lightllm.server.pd_io_struct import KVMoveTask
 from .mem_manager import MemoryManager
 from typing import List
 
@@ -24,18 +25,22 @@ def alloc_kv_move_buffer(self, max_req_total_len):
         return
 
     def send_to_decode_node(
-        self, token_indexes: List[int], mem_managers: List["Deepseek2MemoryManager"], dp_size: int, dp_index: int
+        self, move_tasks: List[KVMoveTask], mem_managers: List["Deepseek2MemoryManager"], dp_size: int
     ):
         assert dp_size == 1
-        assert dp_index == 0
 
         # 先将数据发送到指定的一张卡上的buffer，再发送。
         import torch.distributed as dist
 
+        move_token_indexes = []
+        for task in move_tasks:
+            if task.move_kv_len != 0:
+                move_token_indexes.extend(task.prefill_token_indexes[-task.move_kv_len :])
+
         cur_device_index = self.kv_buffer.get_device()
         cur_mem = mem_managers[cur_device_index]
         for layer_index in range(cur_mem.layer_num):
-            move_buffer = cur_mem._get_kv_move_data(token_indexes, layer_index)
+            move_buffer = cur_mem._get_kv_move_data(move_token_indexes, layer_index)
             dist.send(move_buffer, dst=1)
         return
 
@@ -48,29 +53,33 @@ def _get_kv_move_data(self, token_indexes: List[int], layer_index: int):
         return move_buffer
 
     def receive_from_prefill_node(
-        self, token_indexes: List[int], mem_managers: List["MemoryManager"], dp_size: int, dp_index: int
+        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size: int
     ):
         assert dp_size == 1
-        assert dp_index == 0
 
         # 先将数据接受到指定的一张卡上的buffer，再复制到其他的卡上。
         import torch.distributed as dist
 
+        move_token_indexes = []
+        for task in move_tasks:
+            if task.move_kv_len != 0:
+                move_token_indexes.extend(task.decode_token_indexes[-task.move_kv_len :])
+
         cur_device_index = self.kv_buffer.get_device()
-        token_num = len(token_indexes)
+        token_num = len(move_token_indexes)
         move_size = self.kv_buffer.numel() // self.layer_num // self.size * token_num
         recive_buffer = self.kv_move_buffer.view(-1)[0:move_size].view(1, token_num, self.head_num, self.head_dim)
         for layer_index in range(self.layer_num):
             dist.recv(recive_buffer, src=0)
             for i, mem in enumerate(mem_managers):
                 if i == cur_device_index:
-                    mem._write_kv_move_data(token_indexes, recive_buffer, layer_index)
+                    mem._write_kv_move_data(move_token_indexes, recive_buffer, layer_index)
                 else:
                     new_recive_buffer = mem.kv_move_buffer.view(-1)[0:move_size].view(recive_buffer.shape)
                     from torch.cuda import comm
 
                     comm.broadcast(recive_buffer, out=[new_recive_buffer])
-                    mem._write_kv_move_data(token_indexes, new_recive_buffer, layer_index)
+                    mem._write_kv_move_data(move_token_indexes, new_recive_buffer, layer_index)
         return
 
     def _write_kv_move_data(self, token_indexes: torch.Tensor, buffer_tensor: torch.Tensor, layer_index):
 
@@ -3,6 +3,7 @@
 import torch
 import torch.distributed as dist
 from typing import List
+from lightllm.server.pd_io_struct import KVMoveTask
 from lightllm.utils.log_utils import init_logger
 from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
 from lightllm.utils.profile_max_tokens import get_available_gpu_memory, get_total_gpu_memory
@@ -79,25 +80,27 @@ def alloc_kv_move_buffer(self, max_req_total_len):
         )
         return
 
-    def send_to_decode_node(
-        self, token_indexes: List[int], mem_managers: List["MemoryManager"], dp_size: int, dp_index: int
-    ):
+    def send_to_decode_node(self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size: int):
         """
-        dp_size 和 dp_index 是为 deepseekv2 类型，可以 dp 和 tp 混合模式运行的模型定制的参数，
+        dp_size 是为 deepseekv2 类型，可以 dp 和 tp 混合模式运行的模型定制的参数，
-        普通tp模式下, dp_size 一定等于 1, dp_index 一定等于 0, 同时普通模式下, 这两个参数并不会
+        普通tp模式下, dp_size 一定等于 1, 同时普通模式下, 该参数并不会
         被真正使用
         """
         assert dp_size == 1
-        assert dp_index == 0
 
         # 先将数据发送到指定的一张卡上的buffer，再发送。
         import torch.distributed as dist
 
+        move_token_indexes = []
+        for task in move_tasks:
+            if task.move_kv_len != 0:
+                move_token_indexes.extend(task.prefill_token_indexes[-task.move_kv_len :])
+
         cur_device_index = self.kv_buffer.get_device()
         cur_mem = mem_managers[cur_device_index]
         for i, mem in enumerate(mem_managers):
             for layer_index in range(mem.layer_num):
-                move_buffer = mem._get_kv_move_data(token_indexes, layer_index)
+                move_buffer = mem._get_kv_move_data(move_token_indexes, layer_index)
                 if i == cur_device_index:
                     dist.send(move_buffer, dst=1)
                 else:
@@ -118,34 +121,38 @@ def _get_kv_move_data(self, token_indexes: List[int], layer_index: int):
         return move_buffer
 
     def receive_from_prefill_node(
-        self, token_indexes: List[int], mem_managers: List["MemoryManager"], dp_size: int, dp_index: int
+        self, move_tasks: List[KVMoveTask], mem_managers: List["MemoryManager"], dp_size: int
     ):
         """
-        dp_size 和 dp_index 是为 deepseekv2 类型，可以 dp 和 tp 混合模式运行的模型定制的参数，
-        普通tp模式下, dp_size 一定等于 1, dp_index 一定等于 0, 同时普通模式下, 这两个参数并不会
+        dp_size 是为 deepseekv2 类型，可以 dp 和 tp 混合模式运行的模型定制的参数，
+        普通tp模式下, dp_size 一定等于 1, 同时普通模式下, 该参数并不会
         被真正使用
         """
         assert dp_size == 1
-        assert dp_index == 0
 
         # 先将数据接受到指定的一张卡上的buffer，再复制到其他的卡上。
         import torch.distributed as dist
 
+        move_token_indexes = []
+        for task in move_tasks:
+            if task.move_kv_len != 0:
+                move_token_indexes.extend(task.decode_token_indexes[-task.move_kv_len :])
+
         cur_device_index = self.kv_buffer.get_device()
-        token_num = len(token_indexes)
+        token_num = len(move_token_indexes)
         move_size = self.kv_buffer.numel() // self.layer_num // self.size * token_num
         recive_buffer = self.kv_move_buffer.view(-1)[0:move_size].view(1, token_num, 2 * self.head_num, self.head_dim)
         for i, mem in enumerate(mem_managers):
             for layer_index in range(mem.layer_num):
                 dist.recv(recive_buffer, src=0)
                 if i == cur_device_index:
-                    mem._write_kv_move_data(token_indexes, recive_buffer, layer_index)
+                    mem._write_kv_move_data(move_token_indexes, recive_buffer, layer_index)
                 else:
                     new_recive_buffer = mem.kv_move_buffer.view(-1)[0:move_size].view(recive_buffer.shape)
                     from torch.cuda import comm
 
                     comm.broadcast(recive_buffer, out=[new_recive_buffer])
-                    mem._write_kv_move_data(token_indexes, new_recive_buffer, layer_index)
+                    mem._write_kv_move_data(move_token_indexes, new_recive_buffer, layer_index)
         return
 
     def _write_kv_move_data(self, token_indexes: torch.Tensor, buffer_tensor: torch.Tensor, layer_index):
 
@@ -153,15 +153,15 @@ async def wait_to_model_ready(self):
 
         if self.args.run_mode == "prefill":
             # 启动 prefill kv move 管理进程
-            from lightllm.server.router.model_infer.mode_backend.continues_batch.prefill_node_impl import (
+            from lightllm.server.router.model_infer.mode_backend.continues_batch.pd_mode.prefill_node_impl import (
                 start_prefill_kv_move_manager_process,
             )
 
             start_prefill_kv_move_manager_process(self.args, self.info_queue, self.mem_queues)
 
         if self.args.run_mode == "decode":
             # 启动 decode kv move 管理进程
-            from lightllm.server.router.model_infer.mode_backend.continues_batch.decode_node_impl import (
+            from lightllm.server.router.model_infer.mode_backend.continues_batch.pd_mode.decode_node_impl import (
                 start_decode_kv_move_manager_process,
             )
 
 
@@ -7,5 +7,5 @@
 from .continues_batch.impl_for_token_healing import TokenHealingBackend
 from .continues_batch.impl_for_simple_constraint_mode import SimpleConstraintBackend
 from .continues_batch.impl_for_first_token_constraint_mode import FirstTokenConstraintBackend
-from .continues_batch.prefill_node_impl.prefill_impl import ContinuesBatchBackendForPrefillNode
-from .continues_batch.decode_node_impl.decode_impl import ContinuesBatchBackendForDecodeNode
+from .continues_batch.pd_mode.prefill_node_impl.prefill_impl import ContinuesBatchBackendForPrefillNode
+from .continues_batch.pd_mode.decode_node_impl.decode_impl import ContinuesBatchBackendForDecodeNode
@@ -11,8 +11,8 @@
 from lightllm.server.io_struct import ReqRunStatus, FinishStatus
 from lightllm.server.pd_io_struct import UpKVStatus
 from lightllm.utils.log_utils import init_logger
-from ..pre_process import prepare_prefill_inputs, prepare_decode_inputs
-from ..post_process import sample
+from ...pre_process import prepare_prefill_inputs, prepare_decode_inputs
+from ...post_process import sample
 from .up_status import UpStatusManager
 from rpyc.utils.server import ThreadedServer
 from lightllm.common.basemodel.infer_lock import g_infer_state_lock, g_router_lock
Original file line number	Diff line number	Diff line change
`@@ -153,15 +153,15 @@ async def wait_to_model_ready(self):`
`153`	`153`
`154`	`154`	`if self.args.run_mode == "prefill":`
`155`	`155`	`# 启动 prefill kv move 管理进程`
`156`		`- from lightllm.server.router.model_infer.mode_backend.continues_batch.prefill_node_impl import (`
	`156`	`+ from lightllm.server.router.model_infer.mode_backend.continues_batch.pd_mode.prefill_node_impl import (`
`157`	`157`	`start_prefill_kv_move_manager_process,`
`158`	`158`	`)`
`159`	`159`
`160`	`160`	`start_prefill_kv_move_manager_process(self.args, self.info_queue, self.mem_queues)`
`161`	`161`
`162`	`162`	`if self.args.run_mode == "decode":`
`163`	`163`	`# 启动 decode kv move 管理进程`
`164`		`- from lightllm.server.router.model_infer.mode_backend.continues_batch.decode_node_impl import (`
	`164`	`+ from lightllm.server.router.model_infer.mode_backend.continues_batch.pd_mode.decode_node_impl import (`
`165`	`165`	`start_decode_kv_move_manager_process,`
`166`	`166`	`)`
`167`	`167`