 import os
 import torch
 import copy
+import bisect
+from typing import Optional
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.distributed import dist_group_manager, lightllm_capture_graph, CustomProcessGroup
-from lightllm.common.basemodel.microbatch_overlap_objs import DecodeMicroBatch
+from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
+from .infer_struct import InferStateInfo
+

 logger = init_logger(__name__)

@@ -17,15 +21,48 @@ def __init__(self, max_batch_size=8, max_len_in_batch=8192):
         self.mempool = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
         self.max_batch_size = max_batch_size
         self.graph_max_len_in_batch = max_len_in_batch
-        self.enable_decode_microbatch_overlap = get_env_start_args().enable_decode_microbatch_overlap
+        self.args = get_env_start_args()
+        self.enable_decode_microbatch_overlap = self.args.enable_decode_microbatch_overlap
+
+        # generate the cuda graph batch sizes:
+        # graphs are captured for batch_size in [1, 2, 3, ..., graph_split_batch_size]
+        # and then in [graph_split_batch_size + graph_grow_step_size,
+        #  graph_split_batch_size + 2 * graph_grow_step_size, ..., self.max_batch_size]
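+        # e.g. with hypothetical settings graph_split_batch_size=4, graph_grow_step_size=4,
+        # max_batch_size=16, this yields [1, 2, 3, 4, 8, 12, 16]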
+        graph_split_batch_size = self.args.graph_split_batch_size
+        max_batch_size = self.max_batch_size
+        graph_grow_step_size = self.args.graph_grow_step_size
+
+        batch_sizes = [i for i in range(1, graph_split_batch_size + 1)]
+        for _batch_size in range(graph_split_batch_size + graph_grow_step_size, max_batch_size, graph_grow_step_size):
+            batch_sizes.append(_batch_size)
+
+        batch_sizes = list(set([e for e in batch_sizes if e < max_batch_size]))
+        batch_sizes.append(max_batch_size)
+        batch_sizes.sort()
+
+        self.cuda_graph_batch_sizes = batch_sizes
+        assert batch_sizes[-1] == self.max_batch_size
+        logger.info(f"cuda graph batch_sizes: {self.cuda_graph_batch_sizes}")

     def can_run(self, batch_size, max_len_in_batch):
         return batch_size <= self.max_batch_size and max_len_in_batch <= self.graph_max_len_in_batch

     def need_capture(self, batch_size):
-        return batch_size not in self.graph
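+        # the request batch is mapped to the nearest captured batch size, so the graph dict is keyed by that size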
+        find_batch_size = self.find_closest_graph_batch_size(batch_size)
+        if find_batch_size is not None:
+            return find_batch_size not in self.graph
+        else:
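+            # unreachable after can_run() has passed: max_batch_size is always the last captured batch size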
+            assert False, "dead code"

-    def _capture_decode(self, decode_func, input_ids, infer_state):
+    def find_closest_graph_batch_size(self, batch_size):
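+        # bisect_left gives the index of the smallest captured batch size that is >= batch_size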
+        index = bisect.bisect_left(self.cuda_graph_batch_sizes, batch_size)
+        if index < len(self.cuda_graph_batch_sizes):
+            find_batch_size = self.cuda_graph_batch_sizes[index]
+            return find_batch_size
+        else:
+            return None
+
+    def _capture_decode(self, decode_func, input_ids: torch.Tensor, infer_state: InferStateInfo):
         dist_group: CustomProcessGroup = infer_state.dist_group
         graph_obj = torch.cuda.CUDAGraph()
         batch_size = input_ids.shape[0]
@@ -46,12 +83,19 @@ def _capture_decode(self, decode_func, input_ids, infer_state):

         with lightllm_capture_graph(dist_group):
             with torch.cuda.graph(graph_obj, pool=self.mempool):
-                predict_logics = decode_func(input_ids, infer_state)
-        self.graph[batch_size] = (graph_obj, input_ids, infer_state, predict_logics)
+                model_output = decode_func(input_ids, infer_state)
+        self.graph[batch_size] = (graph_obj, input_ids, infer_state, model_output)
         graph_obj.replay()
-        return predict_logics
+        return model_output

-    def _capture_decode_overlap(self, decode_func, input_ids, infer_state, input_ids1, infer_state1):
+    def _capture_decode_overlap(
+        self,
+        decode_func,
+        input_ids: torch.Tensor,
+        infer_state: InferStateInfo,
+        input_ids1: torch.Tensor,
+        infer_state1: InferStateInfo,
+    ):
         dist_group: CustomProcessGroup = infer_state.dist_group
         dist_group1 = infer_state1.dist_group
         graph_obj = torch.cuda.CUDAGraph()
@@ -68,20 +112,27 @@ def _capture_decode_overlap(self, decode_func, input_ids, infer_state, input_ids
         with lightllm_capture_graph(dist_group1):
             with lightllm_capture_graph(dist_group):
                 with torch.cuda.graph(graph_obj, pool=self.mempool):
-                    predict_logics, predict_logics1 = decode_func(input_ids, infer_state, input_ids1, infer_state1)
+                    model_output, model_output1 = decode_func(input_ids, infer_state, input_ids1, infer_state1)
         self.graph[batch_size] = (
             graph_obj,
             input_ids,
             infer_state,
             input_ids1,
             infer_state1,
-            predict_logics,
-            predict_logics1,
+            model_output,
+            model_output1,
         )
         graph_obj.replay()
-        return predict_logics, predict_logics1
+        return model_output, model_output1

-    def capture_decode(self, decode_func, input_ids, infer_state, input_ids1=None, infer_state1=None):
+    def capture_decode(
+        self,
+        decode_func,
+        input_ids: torch.Tensor,
+        infer_state: InferStateInfo,
+        input_ids1: Optional[torch.Tensor] = None,
+        infer_state1: Optional[InferStateInfo] = None,
+    ):
         """
         Capture the cuda graph for the decoding stage.
         input_ids1 and infer_state1 are used for the overlap mode.
@@ -92,31 +143,37 @@ def capture_decode(self, decode_func, input_ids, infer_state, input_ids1=None, i
             assert input_ids1 is None and infer_state1 is None
             return self._capture_decode(decode_func, input_ids, infer_state)

-    def _replay(self, input_ids, infer_state):
+    def _replay(self, input_ids: torch.Tensor, infer_state: InferStateInfo):
         batch_size = input_ids.shape[0]
-        graph_obj, graph_input_ids, graph_infer_state, graph_predict_logics = self.graph[batch_size]
+        graph_obj, graph_input_ids, graph_infer_state, graph_output = self.graph[batch_size]
         graph_input_ids.copy_(input_ids)
         graph_infer_state.copy_for_cuda_graph(infer_state)
         graph_obj.replay()
-        return graph_predict_logics
+        return graph_output

-    def _replay_overlap(self, input_ids, infer_state, input_ids1, infer_state1):
+    def _replay_overlap(
+        self,
+        input_ids: torch.Tensor,
+        infer_state: InferStateInfo,
+        input_ids1: torch.Tensor,
+        infer_state1: InferStateInfo,
+    ):
         batch_size = input_ids.shape[0]
         (
             graph_obj,
             graph_input_ids,
             graph_infer_state,
             graph_input_ids1,
             graph_infer_state1,
-            graph_predict_logics,
-            graph_predict_logics1,
+            graph_model_output,
+            graph_model_output1,
         ) = self.graph[batch_size]
         graph_input_ids.copy_(input_ids)
         graph_infer_state.copy_for_cuda_graph(infer_state)
         graph_input_ids1.copy_(input_ids1)
         graph_infer_state1.copy_for_cuda_graph(infer_state1)
         graph_obj.replay()
-        return graph_predict_logics, graph_predict_logics1
+        return graph_model_output, graph_model_output1

     def replay(self, input_ids, infer_state, input_ids1=None, infer_state1=None):
         if self.enable_decode_microbatch_overlap:
@@ -128,59 +185,50 @@ def replay(self, input_ids, infer_state, input_ids1=None, infer_state1=None):
     @torch.no_grad()
     def warmup(self, model):
         logger.info("Begin capture cudagraph, use the --disable_cudagraph to disable it.")
-        for batch_size in range(self.max_batch_size, self.max_batch_size - 1, -1):
-            # dummy prefill
-            prefill_input_len = 1
-            dummy_input_ids = torch.ones((batch_size,), dtype=torch.int32, device="cuda")
+        # local import, only used for the type annotation below
+        from .basemodel import TpPartBaseModel
+
+        model: TpPartBaseModel = model
+
+        # decode cuda graph init
+        for batch_size in self.cuda_graph_batch_sizes[::-1]:
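+            # build a dummy decode batch: every request reports a length of 2 tokens
+            # and points at the placeholder HOLD_REQUEST_ID slot of the request manager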
+            seq_len = 2
+            total_token_num = batch_size * seq_len
+            max_len_in_batch = self.graph_max_len_in_batch
+            input_ids = torch.tensor([1 for _ in range(batch_size)], dtype=torch.int32, device="cuda")
+            mem_indexes = model.mem_manager.alloc(len(input_ids)).cuda()
             b_req_idx = torch.tensor(
-                [model.req_manager.alloc() for _ in range(batch_size)], dtype=torch.int32, device="cuda"
-            )
-            mem_indexes = model.mem_manager.alloc(len(dummy_input_ids)).cuda()
-            b_seq_len = torch.ones(batch_size, dtype=torch.int32, device="cuda")
-            b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
-            total_token_num = prefill_input_len * batch_size
-            logics = model.forward(
-                batch_size,
-                total_token_num,
-                prefill_input_len,
-                dummy_input_ids,
-                mem_indexes,
-                b_req_idx,
-                b_seq_len,
-                b_ready_cache_len=b_ready_cache_len,
-                is_prefill=True,
-                multimodal_params=[],
+                [model.req_manager.HOLD_REQUEST_ID for _ in range(batch_size)], dtype=torch.int32, device="cuda"
             )
-            mem_indexes = None
-            prob_out = torch.softmax(logics, dim=-1)
-            logics = None
-            predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
-            prob_out = None
-            predict_ids = predict_ids.detach().cpu().numpy()
-            torch.cuda.empty_cache()
+            b_seq_len = torch.empty(batch_size, dtype=torch.int32, device="cuda")
+            b_seq_len.fill_(seq_len)

-            # dummy decoding, capture the cudagraph
-            total_token_num += batch_size
-            b_seq_len += 1
-            mem_indexes = model.mem_manager.alloc(len(predict_ids)).cuda()
-            logics = model.forward(
-                batch_size,
-                total_token_num,
-                prefill_input_len + 1,
-                torch.from_numpy(predict_ids).cuda().reshape(-1),
-                mem_indexes,
-                b_req_idx,
-                b_seq_len,
+            model_input = ModelInput(
+                batch_size=batch_size,
+                total_token_num=total_token_num,
+                max_len_in_batch=max_len_in_batch,
+                input_ids=input_ids,
+                mem_indexes=mem_indexes,
+                b_req_idx=b_req_idx,
+                b_seq_len=b_seq_len,
                 is_prefill=False,
+                **model._gen_special_model_input(batch_size),
             )
-            mem_indexes = None
+            model_output: ModelOutput = model.forward(model_input)
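+            # drop the python references so the dummy tensors can actually be freed below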
+            del model_output
+            del input_ids
+            del mem_indexes
+            del b_req_idx
+            del b_seq_len
+
             model.mem_manager.free_all()
             model.req_manager.free_all()
             # release local tensors
             for var_name, var_value in list(locals().items()):
                 if isinstance(var_value, torch.Tensor):
                     del locals()[var_name]
             torch.cuda.empty_cache()
+
         logger.info(
             f"Capture cudagraph success, batch_size <={self.max_batch_size} "
             f"and max_len_in_batch <= {self.graph_max_len_in_batch} will infer with cudagraph."
@@ -189,64 +237,52 @@ def warmup(self, model):
     @torch.no_grad()
     def warmup_overlap(self, model):
         logger.info("Begin capture overlap cudagraph, use the --disable_cudagraph to disable it.")
-        for batch_size in range(self.max_batch_size, 0, -1):
+        # local import, only used for the type annotation below
+        from .basemodel import TpPartBaseModel
+
+        model: TpPartBaseModel = model
+
+        for batch_size in self.cuda_graph_batch_sizes[::-1]:
             decode_batches = []
             for micro_batch_index in [0, 1]:
-                # dummy prefill
-                prefill_input_len = 1
-                dummy_input_ids = torch.ones((batch_size,), dtype=torch.int32, device="cuda")
+                # dummy decoding, capture the cudagraph
+                seq_len = 2
+                total_token_num = batch_size * seq_len
+                max_len_in_batch = self.graph_max_len_in_batch
+                input_ids = torch.tensor([1 for _ in range(batch_size)], dtype=torch.int32, device="cuda")
+                mem_indexes = model.mem_manager.alloc(len(input_ids)).cuda()
                 b_req_idx = torch.tensor(
-                    [model.req_manager.alloc() for _ in range(batch_size)], dtype=torch.int32, device="cuda"
-                )
-                mem_indexes = model.mem_manager.alloc(len(dummy_input_ids)).cuda()
-                b_seq_len = torch.ones(batch_size, dtype=torch.int32, device="cuda")
-                b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
-                total_token_num = prefill_input_len * batch_size
-                logics = model.forward(
-                    batch_size,
-                    total_token_num,
-                    prefill_input_len,
-                    dummy_input_ids,
-                    mem_indexes,
-                    b_req_idx,
-                    b_seq_len,
-                    b_ready_cache_len=b_ready_cache_len,
-                    is_prefill=True,
-                    multimodal_params=[],
+                    [model.req_manager.HOLD_REQUEST_ID for _ in range(batch_size)], dtype=torch.int32, device="cuda"
                 )
-                mem_indexes = None
-                prob_out = torch.softmax(logics, dim=-1)
-                logics = None
-                predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
-                prob_out = None
-                predict_ids = predict_ids.detach().cpu().numpy()
-                torch.cuda.empty_cache()
-
-                # dummy decoding, capture the cudagraph
-                total_token_num += batch_size
-                b_seq_len += 1
-                mem_indexes = model.mem_manager.alloc(len(predict_ids)).cuda()
+                b_seq_len = torch.empty(batch_size, dtype=torch.int32, device="cuda")
+                b_seq_len.fill_(seq_len)

-                micro_batch = DecodeMicroBatch(
+                micro_batch = ModelInput(
+                    is_prefill=False,
                     batch_size=batch_size,
                     total_token_num=total_token_num,
-                    max_len_in_batch=prefill_input_len + 1,
-                    input_ids=torch.from_numpy(predict_ids).cuda().reshape(-1),
+                    max_len_in_batch=max_len_in_batch,
+                    input_ids=input_ids,
                     mem_indexes=mem_indexes,
                     b_req_idx=b_req_idx,
                     b_seq_len=b_seq_len,
+                    **model._gen_special_model_input(batch_size),
                 )
                 decode_batches.append(micro_batch)
+                del micro_batch

                 for var_name, var_value in list(locals().items()):
                     if isinstance(var_value, torch.Tensor):
                         del locals()[var_name]
                 torch.cuda.empty_cache()
+
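+            # running the two dummy micro batches triggers capture of the overlapped cuda graph for this batch size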
             _, _ = model.microbatch_overlap_decode(decode_batches[0], decode_batches[1])

             model.mem_manager.free_all()
             model.req_manager.free_all()

+            del decode_batches
+
             # release local tensors
             for var_name, var_value in list(locals().items()):
                 if isinstance(var_value, torch.Tensor):