
Commit 112a28e

merge main branch and fix conflict

2 parents: fe6b167 + 74ed914

129 files changed: +13331 additions, -3283 deletions


.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -11,4 +11,4 @@ repos:
     hooks:
       - id: flake8
         additional_dependencies: [flake8-typing-imports==1.9.0]
-        args: ['--config=.flake8', '--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606']
+        args: ['--config=.flake8', '--max-line-length=120', '--ignore=TYP001, E722, C901, E203, E266, E402, E302, E241, E902, E731, F403, E701, F405, F401, W292, W293, W503, W606, E231']

README.md

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ LightLLM is a Python-based LLM (Large Language Model) inference and serving fram
 [English Docs](https://lightllm-en.readthedocs.io/en/latest/) | [中文文档](https://lightllm-cn.readthedocs.io/en/latest/) | [Blogs](https://modeltc.github.io/lightllm-blog/)

 ## News
-- [2025/05] LightLLM paper on constrained decoding accepted by [ACL25](https://openreview.net/pdf?id=g1aBeiyZEi) (Pre $^3$: Enabling Deterministic Pushdown Automata for Faster Structured LLM Generation)
+- [2025/05] LightLLM paper on constrained decoding accepted by [ACL25](https://arxiv.org/pdf/2506.03887) (Pre $^3$: Enabling Deterministic Pushdown Automata for Faster Structured LLM Generation). For a more accessible overview of the research with key insights and examples, check out our blog post: [LightLLM Blog](https://www.light-ai.top/lightllm-blog/2025/06/15/pre3.html)
 - [2025/04] LightLLM paper on request scheduler published in [ASPLOS’25](https://dl.acm.org/doi/10.1145/3676641.3716011) (Past-Future Scheduler for LLM Serving under SLA Guarantees)
 - [2025/02] 🔥 LightLLM v1.0.0 release, achieving the **fastest DeepSeek-R1** serving performance on single H200 machine.

docs/CN/source/tutorial/api_server_args_zh.rst

Lines changed: 0 additions & 4 deletions

@@ -274,10 +274,6 @@ Attention type selection parameters

    Cache server capacity for multimodal resources, default is ``200``

-.. option:: --cache_reserved_ratio
-
-   Reserved capacity ratio after cache server cleanup, default is ``0.5``
-
 .. option:: --visual_infer_batch_size

    Number of images processed in each inference batch, default is ``1``

docs/EN/source/tutorial/api_server_args_zh.rst

Lines changed: 0 additions & 4 deletions

@@ -273,10 +273,6 @@ Multimodal Parameters

    Cache server capacity for multimodal resources, default is ``200``

-.. option:: --cache_reserved_ratio
-
-   Reserved capacity ratio after cache server cleanup, default is ``0.5``
-
 .. option:: --visual_infer_batch_size

    Number of images processed in each inference batch, default is ``1``

lightllm/common/basemodel/basemodel.py

Lines changed: 36 additions & 1 deletion

@@ -18,12 +18,14 @@
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.common.basemodel.cuda_graph import CudaGraph
 from lightllm.common.quantization import Quantcfg
+from lightllm.common.basemodel.triton_kernel.gather_token_id import gather_token
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.dist_utils import get_dp_world_size, get_global_world_size, get_global_rank
 from lightllm.utils.envs_utils import get_env_start_args
-from lightllm.distributed.communication_op import CustomProcessGroup, dist_group_manager
+from lightllm.distributed.communication_op import dist_group_manager
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
+from lightllm.utils.envs_utils import set_model_init_status


 logger = init_logger(__name__)

@@ -104,6 +106,7 @@ def __init__(self, kvargs):
         self._init_cudagraph()
         self._check_max_len_infer()
         torch.cuda.empty_cache()
+        set_model_init_status(True)
         return

     def _init_config(self):

@@ -236,6 +239,7 @@ def _init_custom(self):

     @torch.no_grad()
     def forward(self, model_input: ModelInput):
+        model_input.to_cuda()
         assert model_input.mem_indexes.is_cuda

         if model_input.is_prefill:

@@ -345,6 +349,14 @@ def _decode(
         self,
         model_input: ModelInput,
     ) -> ModelOutput:
+        # for overlap mode
+        if model_input.input_ids is None:
+            model_input.input_ids = gather_token(
+                self.req_manager.req_sampling_params_manager.req_to_next_token_ids,
+                model_input.b_req_idx,
+                model_input.b_mtp_index,
+            )
+
         # collect global max batch_size
         world_size = get_global_world_size()
         rank = get_global_rank()

@@ -466,6 +478,9 @@ def _token_forward(self, input_ids, infer_state: InferStateInfo):

     @torch.no_grad()
     def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: ModelInput):
+        model_input0.to_cuda()
+        model_input1.to_cuda()
+
         assert model_input0.mem_indexes.is_cuda
         assert model_input1.mem_indexes.is_cuda
         input_ids0, input_ids1 = model_input0.input_ids, model_input1.input_ids

@@ -503,6 +518,22 @@ def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: Mod

     @torch.no_grad()
     def microbatch_overlap_decode(self, model_input0: ModelInput, model_input1: ModelInput):
+        model_input0.to_cuda()
+        model_input1.to_cuda()
+
+        if model_input0.input_ids is None:
+            model_input0.input_ids = gather_token(
+                self.req_manager.req_sampling_params_manager.req_to_next_token_ids,
+                model_input0.b_req_idx,
+                model_input0.b_mtp_index,
+            )
+        if model_input1.input_ids is None:
+            model_input1.input_ids = gather_token(
+                self.req_manager.req_sampling_params_manager.req_to_next_token_ids,
+                model_input1.b_req_idx,
+                model_input1.b_mtp_index,
+            )
+
         assert model_input0.batch_size == model_input1.batch_size
         assert model_input0.mem_indexes.is_cuda
         assert model_input1.mem_indexes.is_cuda

@@ -686,6 +717,7 @@ def _check_max_len_infer(self):
         b_seq_len[:] = self.batch_max_tokens
         b_ready_cache_len = torch.zeros(1, dtype=torch.int32, device="cuda")
         total_token_num = self.batch_max_tokens
+        b_mtp_index = torch.zeros(1, dtype=torch.int32, device="cuda")
         model_input = ModelInput(
             batch_size=1,
             total_token_num=total_token_num,

@@ -694,6 +726,7 @@ def _check_max_len_infer(self):
             mem_indexes=mem_indexes,
             b_req_idx=b_req_idx,
             b_seq_len=b_seq_len,
+            b_mtp_index=b_mtp_index,
             is_prefill=True,
             b_ready_cache_len=b_ready_cache_len,
         )

@@ -741,13 +774,15 @@ def _init_padded_req(self):
         b_seq_len = torch.ones(batch_size, dtype=torch.int32, device="cuda")
         b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
         total_token_num = prefill_input_len * batch_size
+        b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
         model_input = ModelInput(
             batch_size=batch_size,
             total_token_num=total_token_num,
             max_len_in_batch=prefill_input_len,
             input_ids=dummy_input_ids,
             mem_indexes=mem_indexes,
             b_req_idx=b_req_idx,
+            b_mtp_index=b_mtp_index,
             b_seq_len=b_seq_len,
             b_ready_cache_len=b_ready_cache_len,
             is_prefill=True,
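
Note on the overlap-mode change above: in _decode and microbatch_overlap_decode, a ModelInput may now arrive with input_ids set to None, in which case gather_token reads each request's next token id straight from the GPU-resident sampling table instead of round-tripping through the host. Below is a minimal pure-PyTorch sketch of the indexing that kernel is assumed to perform; the table layout (one row per request slot, one column per MTP position) is an assumption for illustration, not the actual Triton implementation.

import torch

def gather_token_sketch(req_to_next_token_ids: torch.Tensor,
                        b_req_idx: torch.Tensor,
                        b_mtp_index: torch.Tensor) -> torch.Tensor:
    # Assumed layout: row = request slot, column = MTP draft position.
    # Returns the next input token id for every request in the batch.
    return req_to_next_token_ids[b_req_idx.long(), b_mtp_index.long()]

# Hypothetical usage mirroring the _decode path:
# if model_input.input_ids is None:
#     model_input.input_ids = gather_token_sketch(
#         req_to_next_token_ids, model_input.b_req_idx, model_input.b_mtp_index
#     )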

lightllm/common/basemodel/batch_objs.py

Lines changed: 20 additions & 1 deletion

@@ -1,6 +1,7 @@
 import torch
 from dataclasses import dataclass, field
 from typing import Optional
+from typing import List


 @dataclass

@@ -10,20 +11,38 @@ class ModelInput:
     total_token_num: int
     max_len_in_batch: int
     input_ids: torch.Tensor
-    mem_indexes: torch.Tensor
     b_req_idx: torch.Tensor
+    b_mtp_index: torch.Tensor
     b_seq_len: torch.Tensor
+    mem_indexes: torch.Tensor = None
     is_prefill: bool = False
     b_ready_cache_len: torch.Tensor = None
     multimodal_params: list = field(default_factory=list)

+    # CPU-side variables
+    mem_indexes_cpu: torch.Tensor = None
+    # Parameters used during the prefill stage; they are not inputs to inference
+    # itself, but variables for resource management outside of inference.
+    b_prefill_has_output_cpu: List[bool] = None  # marks whether each prefill request has output
+
     # Dedicated variables used by some special models, in special modes, to pass
     # special input variables. They are only used and take effect in those model modes.

     # deepseekv3_mtp_draft_input_hiddens is the input to the draft model in the
     # mtp mode of the deepseekv3 model
     deepseekv3_mtp_draft_input_hiddens: Optional[torch.Tensor] = None

+    def to_cuda(self):
+        if self.input_ids is not None:
+            self.input_ids = self.input_ids.cuda(non_blocking=True)
+        if self.mem_indexes is None:
+            self.mem_indexes = self.mem_indexes_cpu.cuda(non_blocking=True)
+        self.b_req_idx = self.b_req_idx.cuda(non_blocking=True)
+        self.b_seq_len = self.b_seq_len.cuda(non_blocking=True)
+        self.b_mtp_index = self.b_mtp_index.cuda(non_blocking=True)
+        if self.b_ready_cache_len is not None:
+            self.b_ready_cache_len = self.b_ready_cache_len.cuda(non_blocking=True)
+

 @dataclass
 class ModelOutput:
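
To illustrate the new CPU/GPU split in ModelInput: tensors can now be built on the host (with mem_indexes left unset and mem_indexes_cpu carrying the slot indexes), and to_cuda() promotes everything to the GPU with non-blocking copies right before the forward pass. A hedged usage sketch with illustrative shapes and values (not taken from the repository):

import torch
from lightllm.common.basemodel.batch_objs import ModelInput

batch_size = 4
model_input = ModelInput(
    batch_size=batch_size,
    total_token_num=batch_size,
    max_len_in_batch=1,
    input_ids=torch.randint(0, 32000, (batch_size,), dtype=torch.int64),  # host tensor
    b_req_idx=torch.arange(batch_size, dtype=torch.int32),
    b_mtp_index=torch.zeros(batch_size, dtype=torch.int32),
    b_seq_len=torch.ones(batch_size, dtype=torch.int32),
    mem_indexes_cpu=torch.arange(batch_size, dtype=torch.int64),  # promoted to mem_indexes by to_cuda()
    is_prefill=False,
)
model_input.to_cuda()  # non-blocking host-to-device copies
assert model_input.mem_indexes.is_cuda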

lightllm/common/basemodel/cuda_graph.py

Lines changed: 4 additions & 0 deletions

@@ -221,6 +221,7 @@ def warmup(self, model):
             )
             b_seq_len = torch.empty(batch_size, dtype=torch.int32, device="cuda")
             b_seq_len.fill_(seq_len)
+            b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")

             model_input = ModelInput(
                 batch_size=batch_size,

@@ -230,6 +231,7 @@ def warmup(self, model):
                 mem_indexes=mem_indexes,
                 b_req_idx=b_req_idx,
                 b_seq_len=b_seq_len,
+                b_mtp_index=b_mtp_index,
                 is_prefill=False,
                 **model._gen_special_model_input(batch_size),
             )

@@ -275,13 +277,15 @@ def warmup_overlap(self, model):
             )
             b_seq_len = torch.empty(batch_size, dtype=torch.int32, device="cuda")
             b_seq_len.fill_(seq_len)
+            b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")

             micro_batch = ModelInput(
                 is_prefill=False,
                 batch_size=batch_size,
                 total_token_num=total_token_num,
                 max_len_in_batch=max_len_in_batch,
                 input_ids=input_ids,
+                b_mtp_index=b_mtp_index,
                 mem_indexes=mem_indexes,
                 b_req_idx=b_req_idx,
                 b_seq_len=b_seq_len,
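
The warmup changes above mean every ModelInput construction site now supplies a b_mtp_index tensor: one int32 entry per request, zero-filled here because no MTP draft position is in play during plain decode warmup. A minimal sketch of the placeholder (the meaning of non-zero indexes is an assumption based on the deepseekv3 MTP fields elsewhere in this commit):

import torch

batch_size = 8  # illustrative
# One entry per request; zeros serve as the default/placeholder MTP position,
# matching how warmup and _check_max_len_infer build their dummy inputs.
b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")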

lightllm/common/basemodel/infer_struct.py

Lines changed: 27 additions & 3 deletions

@@ -5,6 +5,8 @@
 from typing import Tuple, Any, Optional
 from .triton_kernel.gen_prefill_params import gen_prefill_params
 from .triton_kernel.gen_decode_params import gen_decode_params
+from .triton_kernel.multimodal_emb import mark_multimodal_obj
+from .batch_objs import ModelInput


 class InferStateInfo:

@@ -86,9 +88,10 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
                 self.b_kv_seq_len,
                 self.b1_cu_kv_seq_len,
                 self.position_ids,
-                self.max_q_seq_len,
-                self.max_kv_seq_len,
-            ) = gen_decode_params(b_seq_len=self.b_seq_len)
+            ) = gen_decode_params(self.b_seq_len)
+            self.max_q_seq_len = 1
+            # TODO: check the correctness
+            self.max_kv_seq_len = self.max_len_in_batch
             self.b_start_loc = self.b1_cu_kv_seq_len[0:-1]

     def copy_for_cuda_graph(self, new_infer_state: "InferStateInfo"):

@@ -98,3 +101,24 @@ def copy_for_cuda_graph(self, new_infer_state: "InferStateInfo"):
             if attr_ is not None and attr_.data_ptr() != attr_value.data_ptr():
                 attr_.copy_(attr_value, non_blocking=True)
         return
+
+    def mark_multimodal_objs_for_prefill(self, input_ids: torch.Tensor):
+        """
+        Utility that marks which multimodal objects' tokens need to take part in the
+        computation during chunked prefill. Because the input is split into chunks,
+        not every multimodal object's tokens need to participate.
+        """
+        multi_objs = []
+        for _, p in enumerate(self.multimodal_params):
+            for obj in p["images"] + p["audios"]:
+                multi_objs.append(obj)
+
+        if multi_objs:
+            obj_start_ids = torch.tensor([e["token_id"] for e in multi_objs], dtype=torch.int64, device="cuda")
+            obj_token_lens = torch.tensor([e["token_num"] for e in multi_objs], dtype=torch.int64, device="cuda")
+            marks = mark_multimodal_obj(
+                obj_start_token_ids=obj_start_ids, obj_token_lens=obj_token_lens, input_ids=input_ids
+            )
+            marks_array = marks.detach().cpu().numpy()
+            for mark, obj in zip(marks_array, multi_objs):
+                obj["_prefill_"] = mark > 0
+        return
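
For readers unfamiliar with the mark_multimodal_obj Triton kernel used by the new helper, here is a rough pure-PyTorch sketch of what it is assumed to compute: for every multimodal object, report whether any of its placeholder token ids (token_id through token_id + token_num - 1) occur in the current chunk's input_ids. This illustrates the assumed semantics only; it is not the kernel's actual code.

import torch

def mark_multimodal_obj_sketch(obj_start_token_ids: torch.Tensor,
                               obj_token_lens: torch.Tensor,
                               input_ids: torch.Tensor) -> torch.Tensor:
    # marks[i] == 1 iff any token id in [start_i, start_i + len_i) appears in input_ids.
    starts = obj_start_token_ids.view(-1, 1)   # (num_objs, 1)
    ends = starts + obj_token_lens.view(-1, 1)  # (num_objs, 1)
    ids = input_ids.view(1, -1)                 # (1, num_tokens)
    hit = ((ids >= starts) & (ids < ends)).any(dim=1)
    return hit.to(torch.int64)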

lightllm/common/basemodel/layer_infer/cache_tensor_manager.py

Lines changed: 11 additions & 0 deletions

@@ -93,6 +93,10 @@ def __init__(self):
         self.cuda_graph_cur_batch_size = None
         self.is_cuda_graph = False
         self.managed_total_tensor_bytes = 0
+        # Flag added to guard against misuse that could leak GPU memory.
+        # If the caller has not properly invoked cache_env_in and cache_env_out,
+        # a call to alloc_tensor degrades to plain torch.empty allocation.
+        self.cache_env_ok = False

     def cache_env_in(
         self, is_cuda_graph: bool = False, cur_batch_size: int = 0, cuda_graph_max_batch_size: int = 0

@@ -107,6 +111,7 @@ def cache_env_in(
             assert self.inner_cuda_graph_manager.cuda_graph_max_batch_size == cuda_graph_max_batch_size
             self.cuda_graph_cur_batch_size = cur_batch_size
             assert cur_batch_size != 0
+        self.cache_env_ok = True
         return

     def cache_env_out(self):

@@ -115,6 +120,7 @@ def cache_env_out(self):
         self.free_shape_dtype_to_bufs.clear()
         self.calcu_shape_cache.clear()
         self.changed_ptr.clear()
+        self.cache_env_ok = False
         return

     def alloc_tensor(

@@ -129,6 +135,11 @@ def alloc_tensor(
         # convert the shape type
         if isinstance(shape, list):
             shape = torch.Size(shape)
+
+        # fall back when the cache manager is not being used properly
+        if not self.cache_env_ok:
+            return torch.empty(shape, dtype=data_type, device=device, requires_grad=False)
+
         # under cuda graph, allocation is taken over by the cuda graph manager
         if self.is_cuda_graph:
             return self.inner_cuda_graph_manager.alloc_tensor_for_cuda_graph(
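
The cache_env_ok flag makes alloc_tensor safe to call outside the manager's enter/exit protocol: instead of handing out (and potentially leaking) cached buffers, it degrades to a plain torch.empty. A hedged usage sketch of the intended pairing; the keyword names follow what is visible in this diff, and the exact alloc_tensor signature beyond that is an assumption:

import torch
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager

# Inside a managed forward pass, allocations are served (and reused) by the cache manager.
g_cache_manager.cache_env_in()
buf = g_cache_manager.alloc_tensor([8, 4096], data_type=torch.float16, device="cuda")
g_cache_manager.cache_env_out()

# Outside a cache_env_in/cache_env_out pair, the same call now falls back to torch.empty
# instead of risking a GPU-memory leak through the cache.
stray = g_cache_manager.alloc_tensor([8, 4096], data_type=torch.float16, device="cuda")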
