Commit 678bb5f

inference overlap

1 parent af6f547 commit 678bb5f

File tree

13 files changed: +319 -86 lines changed

lightllm/common/basemodel/basemodel.py

Lines changed: 13 additions & 4 deletions

@@ -17,10 +17,11 @@
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.common.basemodel.cuda_graph import CudaGraph
 from lightllm.common.quantization import Quantcfg
+from lightllm.common.basemodel.triton_kernel.gather_token_id import gather_token_from_cpu
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.dist_utils import get_dp_world_size
 from lightllm.utils.envs_utils import get_env_start_args
-from lightllm.distributed.communication_op import CustomProcessGroup, dist_group_manager
+from lightllm.distributed.communication_op import dist_group_manager
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
 from lightllm.utils.envs_utils import set_model_init_status

@@ -237,6 +238,7 @@ def _init_custom(self):

     @torch.no_grad()
     def forward(self, model_input: ModelInput):
+        model_input.to_cuda()
         assert model_input.mem_indexes.is_cuda

         if model_input.is_prefill:

@@ -339,13 +341,20 @@ def _prefill(
             infer_state.mem_index,
         )

-        infer_state.init_some_extra_state(self, model_input.input_ids)
+        infer_state.init_some_extra_state(self, model_input)
         return self._context_forward(model_input.input_ids, infer_state)

     def _decode(
         self,
         model_input: ModelInput,
     ) -> ModelOutput:
+        # for overlap mode
+        if model_input.input_ids is None:
+            model_input.input_ids = gather_token_from_cpu(
+                self.req_manager.req_sampling_params_manager.req_to_next_token_ids_cpu,
+                model_input.b_req_idx,
+            )
+
         if self.graph is not None and self.graph.can_run(model_input.batch_size, model_input.max_len_in_batch):
             find_graph_batch_size = self.graph.find_closest_graph_batch_size(model_input.batch_size)
             padded_model_input = self._create_padded_decode_model_input(model_input, find_graph_batch_size)

@@ -356,7 +365,7 @@ def _decode(
             infer_state.b_seq_len,
             infer_state.mem_index,
         )
-        infer_state.init_some_extra_state(self, padded_model_input.input_ids)
+        infer_state.init_some_extra_state(self, padded_model_input)

         if self.graph.need_capture(find_graph_batch_size):
             infer_state.is_cuda_graph = True

@@ -377,7 +386,7 @@ def _decode(
             infer_state.b_seq_len,
             infer_state.mem_index,
         )
-        infer_state.init_some_extra_state(self, model_input.input_ids)
+        infer_state.init_some_extra_state(self, model_input)
         model_output = self._token_forward(model_input.input_ids, infer_state)

         return model_output
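
The overlap-mode guard added to _decode is the consumer half of the new pipeline: when the router submits a decode step before the previous step's sampled tokens have been copied back to the host, input_ids arrives as None and is rebuilt on the GPU from the pinned per-request table. A minimal sketch of that guard as a standalone helper; resolve_decode_input_ids is a hypothetical name, and the import path is the one added to basemodel.py above.

from lightllm.common.basemodel.triton_kernel.gather_token_id import gather_token_from_cpu


def resolve_decode_input_ids(model_input, req_sampling_params_manager):
    # Sketch only, not part of the commit: mirrors the guard inside BaseModel._decode.
    # In overlap mode the previous step's sampled token ids still live only in the
    # pinned CPU table, so input_ids is rebuilt on the GPU from that table.
    if model_input.input_ids is None:
        model_input.input_ids = gather_token_from_cpu(
            req_sampling_params_manager.req_to_next_token_ids_cpu,
            model_input.b_req_idx,
        )
    return model_input.input_ids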

lightllm/common/basemodel/batch_objs.py

Lines changed: 9 additions & 0 deletions

@@ -24,6 +24,15 @@ class ModelInput:
     # input for the draft model
     deepseekv3_mtp_draft_input_hiddens: Optional[torch.Tensor] = None

+    def to_cuda(self):
+        if self.input_ids is not None:
+            self.input_ids = self.input_ids.cuda(non_blocking=True)
+        self.mem_indexes = self.mem_indexes.cuda(non_blocking=True)
+        self.b_req_idx = self.b_req_idx.cuda(non_blocking=True)
+        self.b_seq_len = self.b_seq_len.cuda(non_blocking=True)
+        if self.b_ready_cache_len is not None:
+            self.b_ready_cache_len = self.b_ready_cache_len.cuda(non_blocking=True)
+

 @dataclass
 class ModelOutput:
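
A brief aside, not from the commit: to_cuda() relies on non_blocking=True, which only overlaps the host-to-device copy with already-queued GPU work when the source tensor is pinned; from pageable memory PyTorch falls back to an effectively blocking transfer. A minimal, runnable illustration (requires a CUDA device):

import torch

pinned = torch.zeros(4096, dtype=torch.int32, pin_memory=True)
pageable = torch.zeros(4096, dtype=torch.int32)

a = pinned.cuda(non_blocking=True)    # true asynchronous H2D copy, can overlap with queued kernels
b = pageable.cuda(non_blocking=True)  # staged through pageable memory, effectively blocking
torch.cuda.synchronize()              # make both copies observable before using a and b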

lightllm/common/basemodel/infer_struct.py

Lines changed: 6 additions & 5 deletions

@@ -6,6 +6,7 @@
 from .triton_kernel.gen_prefill_params import gen_prefill_params
 from .triton_kernel.gen_decode_params import gen_decode_params
 from .triton_kernel.multimodal_emb import mark_multimodal_obj
+from .batch_objs import ModelInput


 class InferStateInfo:

@@ -64,7 +65,7 @@ def __init__(self):
         # used as input; other models and scenarios do not use it
         self.deepseekv3_mtp_draft_input_hiddens: Optional[torch.Tensor] = None

-    def init_some_extra_state(self, model, input_ids: torch.Tensor):
+    def init_some_extra_state(self, model, model_input: ModelInput):
         if self.is_prefill:
             (
                 self.b_q_seq_len,

@@ -75,7 +76,7 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
                 self.max_q_seq_len,
                 self.max_kv_seq_len,
             ) = gen_prefill_params(
-                input_token_num=input_ids.shape[0],
+                input_token_num=model_input.input_ids.shape[0],
                 b_ready_cache_len=self.b_ready_cache_len,
                 b_seq_len=self.b_seq_len,
             )

@@ -87,10 +88,10 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
                 self.b_kv_seq_len,
                 self.b1_cu_kv_seq_len,
                 self.position_ids,
-                self.max_q_seq_len,
-                self.max_kv_seq_len,
-            ) = gen_decode_params(b_seq_len=self.b_seq_len)
+            ) = gen_decode_params(self.b_seq_len)
             self.b_start_loc = self.b1_cu_kv_seq_len[0:-1]
+            self.max_q_seq_len = 1
+            self.max_kv_seq_len = model_input.max_len_in_batch

     def copy_for_cuda_graph(self, new_infer_state: "InferStateInfo"):
         for attr_name, attr_value in vars(new_infer_state).items():
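
The decode branch now sets max_q_seq_len and max_kv_seq_len itself instead of receiving them from gen_decode_params (see the gen_decode_params.py change below). The motivation, sketched here rather than stated in the commit: computing these maxima on the GPU ends in .max().item(), which forces a device-to-host synchronization and defeats the overlap, while during decode both values are already known on the host.

import torch

b_seq_len = torch.randint(1, 512, (64,), dtype=torch.int32, device="cuda")

# old path: the reduction plus .item() stalls the host until the GPU catches up
max_kv_seq_len_sync = b_seq_len.max().item()

# new path: decode queries always have length 1, and the scheduler already
# tracks the largest kv length on the host as model_input.max_len_in_batch
max_q_seq_len = 1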

lightllm/common/basemodel/triton_kernel/gather_token_id.py

Lines changed: 209 additions & 0 deletions

@@ -0,0 +1,209 @@
+import torch
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _fwd_kernel_gather_and_scatter(
+    probs_idx,
+    probs_sort,
+    req_to_next_token_ids,
+    req_to_next_token_probs,
+    sampled_index,
+    b_req_idx,
+    probs_idx_stride,
+    probs_sort_stride,
+    req_to_next_token_ids_stride,
+    req_to_next_token_probs_stride,
+):
+    cur_index = tl.program_id(0)
+    cur_req_idx = tl.load(b_req_idx + cur_index)
+    cur_sampled_index = tl.load(sampled_index + cur_index)
+    cur_token_index = tl.load(probs_idx + cur_index * probs_idx_stride + cur_sampled_index)
+    cur_token_probs = tl.load(probs_sort + cur_index * probs_sort_stride + cur_sampled_index)
+    tl.store(req_to_next_token_ids + cur_req_idx * req_to_next_token_ids_stride, cur_token_index)
+    tl.store(req_to_next_token_probs + cur_req_idx * req_to_next_token_probs_stride, tl.log(cur_token_probs))
+    return
+
+
+@torch.no_grad()
+def gather_and_scatter_token_to_cpu(
+    probs_idx: torch.Tensor,
+    probs_sort: torch.Tensor,
+    req_to_next_token_ids: torch.Tensor,
+    req_to_next_token_probs: torch.Tensor,
+    sampled_index: torch.Tensor,
+    b_req_idx: torch.Tensor,
+):
+    """
+    Gather each request's sampled next token id and the log of its probability (GPU tensors)
+    and scatter them into req_to_next_token_ids and req_to_next_token_probs (pinned CPU tensors).
+    Args:
+        probs_idx: (batch_size, vocab_size)
+        probs_sort: (batch_size, vocab_size)
+        req_to_next_token_ids: (max_req_num,)
+        req_to_next_token_probs: (max_req_num,)
+        sampled_index: (batch_size,)
+        b_req_idx: (batch_size,)
+    """
+    assert probs_idx.shape == probs_sort.shape
+    assert sampled_index.shape[0] == b_req_idx.shape[0]
+    batch_size = b_req_idx.shape[0]
+    grid = (batch_size,)
+    num_warps = 1
+
+    _fwd_kernel_gather_and_scatter[grid](
+        probs_idx,
+        probs_sort,
+        req_to_next_token_ids,
+        req_to_next_token_probs,
+        sampled_index,
+        b_req_idx,
+        probs_idx.stride(0),
+        probs_sort.stride(0),
+        req_to_next_token_ids.stride(0),
+        req_to_next_token_probs.stride(0),
+        num_warps=num_warps,
+        num_stages=1,
+    )
+    return
+
+
+@triton.jit
+def _fwd_kernel_scatter(
+    token_info,
+    req_to_token_info,
+    b_req_idx,
+    req_to_token_info_stride,
+):
+    cur_index = tl.program_id(0)
+    cur_req_idx = tl.load(b_req_idx + cur_index)
+    cur_token_info = tl.load(token_info + cur_index)
+    tl.store(req_to_token_info + cur_req_idx * req_to_token_info_stride, cur_token_info)
+    return
+
+
+@torch.no_grad()
+def scatter_token_to_cpu(token_info: torch.Tensor, req_to_token_info: torch.Tensor, b_req_idx: torch.Tensor):
+    """
+    Scatter token_info (GPU tensor) into req_to_token_info (pinned CPU tensor).
+    Args:
+        token_info: (batch_size,)
+        req_to_token_info: (max_req_num,)
+        b_req_idx: (batch_size,)
+    """
+    assert token_info.shape[0] == b_req_idx.shape[0]
+    batch_size = b_req_idx.shape[0]
+    grid = (batch_size,)
+    num_warps = 1
+
+    _fwd_kernel_scatter[grid](
+        token_info,
+        req_to_token_info,
+        b_req_idx,
+        req_to_token_info.stride(0),
+        num_warps=num_warps,
+        num_stages=1,
+    )
+    return
+
+
+@triton.jit
+def _fwd_kernel_gather(
+    req_to_token_info,
+    output,
+    b_req_idx,
+):
+    cur_index = tl.program_id(0)
+    cur_req_idx = tl.load(b_req_idx + cur_index)
+    cur_token_info = tl.load(req_to_token_info + cur_req_idx)
+    tl.store(output + cur_index, cur_token_info)
+    return
+
+
+def gather_token_from_cpu(req_to_token_info: torch.Tensor, b_req_idx: torch.Tensor):
+    """
+    Gather per-request token_info from req_to_token_info (pinned CPU tensor) into a new GPU tensor.
+    Args:
+        req_to_token_info: (max_req_num,)
+        b_req_idx: (batch_size,)
+    Returns:
+        output: (batch_size,)
+    """
+    batch_size = b_req_idx.shape[0]
+    output = torch.empty_like(b_req_idx)
+    grid = (batch_size,)
+    num_warps = 1
+    _fwd_kernel_gather[grid](
+        req_to_token_info,
+        output,
+        b_req_idx,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+    return output
+
+
+def _top_p_top_k(probs: torch.Tensor, top_ps: torch.Tensor, top_ks: torch.Tensor):
+    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+
+    probs_sort[torch.arange(0, probs.shape[-1], device="cuda").view(1, -1) >= top_ks.view(-1, 1)] = 0.0
+
+    return probs_sort, probs_idx
+
+
+def test_gather_and_scatter_token_to_cpu():
+    batch_size = 30
+    vocab_size = 60000
+    req_to_next_token_ids = torch.ones((1000,), dtype=torch.int32, pin_memory=True)
+    req_to_next_token_probs = torch.ones((1000,), dtype=torch.float32, pin_memory=True)
+    req_ids = torch.arange(20, 20 + batch_size, dtype=torch.int32).cuda()
+    probs = torch.randn((batch_size, vocab_size)).cuda()
+    top_ps = torch.rand((batch_size,)).cuda()
+    top_ks = torch.ones((batch_size,), dtype=torch.int32).cuda()
+    probs_sort, probs_idx = _top_p_top_k(probs, top_ps, top_ks)
+    sampled_index = torch.multinomial(probs_sort, num_samples=1, replacement=True)
+    batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index)
+    batch_next_token_probs = torch.gather(probs_sort, dim=1, index=sampled_index)
+
+    gather_and_scatter_token_to_cpu(
+        probs_idx, probs_sort, req_to_next_token_ids, req_to_next_token_probs, sampled_index, req_ids
+    )
+    diff_ids = (req_to_next_token_ids[20 : 20 + batch_size].cuda() - batch_next_token_ids.view(-1)).abs().max()
+    diff_probs = (req_to_next_token_probs[20 : 20 + batch_size].cuda() - batch_next_token_probs.view(-1).log()).abs().max()
+    assert diff_ids < 1e-6
+    assert diff_probs < 1e-6
+    print("test_gather_and_scatter_token_to_cpu passed")
+
+
+def test_scatter_token_to_cpu():
+    batch_size = 30
+    req_to_token_info = torch.zeros((1000,), dtype=torch.float32, pin_memory=True)
+    token_info = torch.randn((batch_size,)).cuda()
+    req_ids = torch.arange(20, 20 + batch_size, dtype=torch.int32).cuda()
+    scatter_token_to_cpu(token_info, req_to_token_info, req_ids)
+    diff = (req_to_token_info[20 : 20 + batch_size].cuda() - token_info).abs().max()
+    assert diff < 1e-6
+    print("test_scatter_token_to_cpu passed")
+
+
+def test_gather_token_from_cpu():
+    batch_size = 30
+    req_to_token_info = torch.zeros((1000,), dtype=torch.int32, pin_memory=True)
+    token_info = torch.randint(0, 60000, (batch_size,), dtype=torch.int32).cuda()
+    req_ids = torch.arange(20, 20 + batch_size, dtype=torch.int32).cuda()
+    scatter_token_to_cpu(token_info, req_to_token_info, req_ids)
+    output = gather_token_from_cpu(req_to_token_info, req_ids)
+    diff = (token_info - output).abs().max()
+    assert diff < 1e-6
+    print("test_gather_token_from_cpu passed")
+
+
+if __name__ == "__main__":
+    test_gather_and_scatter_token_to_cpu()
+    test_scatter_token_to_cpu()
+    test_gather_token_from_cpu()
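
A usage sketch for the producer half, not part of the commit: after sampling, each request's token id and log-probability go straight into the pinned tables added in req_manager.py, keyed by request index, and the next decode step reads them back through gather_token_from_cpu (as BaseModel._decode does above). publish_sampled_tokens is a hypothetical wrapper name; the attribute path mirrors the one used in basemodel.py, and the module path is inferred from the import added there.

from lightllm.common.basemodel.triton_kernel.gather_token_id import gather_and_scatter_token_to_cpu


def publish_sampled_tokens(req_manager, probs_idx, probs_sort, sampled_index, b_req_idx):
    # Write the sampled token id and its log-prob for every request into the
    # pinned CPU tables; the GPU drives the copy, so no host sync is needed here.
    sampling_mgr = req_manager.req_sampling_params_manager
    gather_and_scatter_token_to_cpu(
        probs_idx,
        probs_sort,
        sampling_mgr.req_to_next_token_ids_cpu,
        sampling_mgr.req_to_next_token_probs_cpu,
        sampled_index,
        b_req_idx,
    )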

lightllm/common/basemodel/triton_kernel/gen_decode_params.py

Lines changed: 2 additions & 3 deletions

@@ -10,6 +10,5 @@ def gen_decode_params(b_seq_len: torch.Tensor):
     position_ids = b_seq_len - 1
     b_q_seq_len = torch.ones_like(b_seq_len)
     b1_cu_q_seq_len, b1_cu_kv_seq_len = gen_cumsum_pad0_tensor(b_q_seq_len, b_kv_seq_len)
-    max_q_seq_len = b_q_seq_len.max().item()
-    max_kv_seq_len = b_kv_seq_len.max().item()
-    return b_q_seq_len, b1_cu_q_seq_len, b_kv_seq_len, b1_cu_kv_seq_len, position_ids, max_q_seq_len, max_kv_seq_len
+
+    return b_q_seq_len, b1_cu_q_seq_len, b_kv_seq_len, b1_cu_kv_seq_len, position_ids

lightllm/common/req_manager.py

Lines changed: 6 additions & 0 deletions

@@ -110,6 +110,12 @@ def __init__(self, max_request_num):
         self.req_to_presence_penalty = torch.zeros(max_request_num + 1, dtype=torch.float32, device="cuda")
         self.req_to_frequency_penalty = torch.zeros(max_request_num + 1, dtype=torch.float32, device="cuda")
         self.req_to_repetition_penalty = torch.zeros(max_request_num + 1, dtype=torch.float32, device="cuda")
+        self.req_to_next_token_ids_cpu = torch.zeros(
+            max_request_num + 1, dtype=torch.int32, device="cpu", pin_memory=True
+        )
+        self.req_to_next_token_probs_cpu = torch.zeros(
+            max_request_num + 1, dtype=torch.float32, device="cpu", pin_memory=True
+        )
         self.req_to_exponential_decay_length_penalty = torch.zeros(
             max_request_num + 1, dtype=torch.float32, device="cuda"
         )
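
A short aside, not from the commit: the two new tables are allocated in pinned host memory so that the Triton kernels in gather_token_id.py can read and write them directly from device code (zero-copy host access) instead of issuing explicit device-to-host copies. A quick sanity check that the allocation really is pinned:

import torch

max_request_num = 1000
ids_cpu = torch.zeros(max_request_num + 1, dtype=torch.int32, device="cpu", pin_memory=True)
probs_cpu = torch.zeros(max_request_num + 1, dtype=torch.float32, device="cpu", pin_memory=True)
assert ids_cpu.is_pinned() and probs_cpu.is_pinned()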
