
Commit 242590f

token_healing mode and out constraint mode use chuncked prefill. (#846)
1 parent 59bfff7 commit 242590f

17 files changed (+243 −228 lines)

docs/CN/source/getting_started/quickstart.rst

Lines changed: 1 addition & 5 deletions
@@ -113,7 +113,6 @@
 $ --tokenizer_mode fast \
 $ --pd_master_ip /your/host/ip \
 $ --pd_master_port 60011 \
-$ --use_dynamic_prompt_cache \
 $ --max_req_total_len 16000 \
 $ --running_max_req_size 128 \
 $ --disable_cudagraph
@@ -133,8 +132,7 @@
 $ --graph_max_batch_size 16 \
 $ --tokenizer_mode fast \
 $ --pd_master_ip /your/host/ip \
-$ --pd_master_port 60011 \
-$ --use_dynamic_prompt_cache
+$ --pd_master_port 60011

 .. note::
     The tp sizes of the prefill and decoding stages must stay consistent. The number of prefill and decode nodes can vary, and prefill and decode can be deployed across machines.
@@ -215,7 +213,6 @@ $ --config_server_port 60088 \
 $ --nccl_port 2732 \
 $ --max_total_token_num 400000 \
 $ --tokenizer_mode fast \
-$ --use_dynamic_prompt_cache \
 $ --max_req_total_len 16000 \
 $ --running_max_req_size 128 \
 $ --disable_cudagraph \
@@ -236,7 +233,6 @@ $ --config_server_port 60088 \
 $ --graph_max_len_in_batch 2048 \
 $ --graph_max_batch_size 16 \
 $ --tokenizer_mode fast \
-$ --use_dynamic_prompt_cache \
 $ --config_server_host <config_server_host> \
 $ --config_server_port <config_server_port>

docs/EN/source/getting_started/quickstart.rst

Lines changed: 1 addition & 3 deletions
@@ -110,7 +110,6 @@ Open a new terminal and run the prefill service
 $ --tokenizer_mode fast \
 $ --pd_master_ip /your/host/ip \
 $ --pd_master_port 60011 \
-$ --use_dynamic_prompt_cache \
 $ --max_req_total_len 16000 \
 $ --running_max_req_size 128 \
 $ --disable_cudagraph
@@ -130,8 +129,7 @@ Open a new terminal and run the decoding service
 $ --graph_max_batch_size 16 \
 $ --tokenizer_mode fast \
 $ --pd_master_ip /your/host/ip \
-$ --pd_master_port 60011 \
-$ --use_dynamic_prompt_cache
+$ --pd_master_port 60011

 .. note::
     The tp size for the prefill and decoding stages should remain consistent.

lightllm/server/api_cli.py

Lines changed: 4 additions & 1 deletion
@@ -184,7 +184,10 @@ def make_argument_parser() -> argparse.ArgumentParser:
         disabling it allows the router_max_wait_tokens parameter to work more effectively.""",
     )

-    parser.add_argument("--use_dynamic_prompt_cache", action="store_true", help="use_dynamic_prompt_cache test")
+    parser.add_argument(
+        "--use_dynamic_prompt_cache", action="store_true", help="This argument is deprecated and no longer in use."
+    )
+    parser.add_argument("--disable_dynamic_prompt_cache", action="store_true", help="disable dynamic prompt cache")

     parser.add_argument("--chunked_prefill_size", type=int, default=8192, help="chunked prefill size")
     parser.add_argument("--disable_chunked_prefill", action="store_true", help="whether to disable chunked prefill")

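The practical effect of this change is that the dynamic prompt cache (radix cache) is now on by default and is turned off with the new flag, while the old flag is kept only so existing launch scripts do not break. Below is a minimal sketch of the flag semantics, assuming a plain argparse parser; only the two flag names and their help strings come from the diff, the rest is illustrative.

import argparse

parser = argparse.ArgumentParser()
# Deprecated no-op flag, kept for backward compatibility with old launch scripts.
parser.add_argument(
    "--use_dynamic_prompt_cache", action="store_true",
    help="This argument is deprecated and no longer in use.",
)
# New switch: the cache is enabled unless this is passed.
parser.add_argument(
    "--disable_dynamic_prompt_cache", action="store_true",
    help="disable dynamic prompt cache",
)

args = parser.parse_args([])
print(not args.disable_dynamic_prompt_cache)  # True: cache enabled by default

args = parser.parse_args(["--disable_dynamic_prompt_cache"])
print(not args.disable_dynamic_prompt_cache)  # False: cache explicitly disabled
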
lightllm/server/api_start.py

Lines changed: 5 additions & 6 deletions
@@ -95,18 +95,17 @@ def normal_or_p_d_start(args):
     assert [
         args.disable_chunked_prefill,
         args.diverse_mode,
-        args.token_healing_mode,
         args.use_reward_model,
         args.return_all_prompt_logprobs,
-        args.output_constraint_mode != "none",
     ].count(True) <= 1
-    # Some modes cannot run together with dynamic_prompt_cache yet, TODO.
-    if args.use_dynamic_prompt_cache:
-        assert args.token_healing_mode is False

     # chunked prefill must be enabled together with dynamic_prompt_cache
     if not args.disable_chunked_prefill:
-        assert args.use_dynamic_prompt_cache is True
+        assert args.disable_dynamic_prompt_cache is False
+    if args.output_constraint_mode != "none":
+        assert args.disable_dynamic_prompt_cache is False
+    if args.token_healing_mode:
+        assert args.disable_dynamic_prompt_cache is False

     # Some modes cannot yet work with the advanced dynamic scheduling algorithm, TODO.
     if args.diverse_mode:

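Two validation patterns are at work here: the `[...].count(True) <= 1` idiom enforces that at most one mutually exclusive mode is enabled, and the new `assert args.disable_dynamic_prompt_cache is False` checks express that chunked prefill, output-constraint mode, and token-healing mode all need the dynamic prompt cache. A standalone sketch follows, assuming a SimpleNamespace stands in for the parsed arguments; the field names mirror the diff, but the helper function name is made up.

from types import SimpleNamespace

def check_start_args(args):
    # At most one mutually exclusive mode may be enabled; token_healing_mode and
    # output_constraint_mode were dropped from this list by the commit because
    # they now run on the chunked-prefill path.
    assert [
        args.disable_chunked_prefill,
        args.diverse_mode,
        args.use_reward_model,
        args.return_all_prompt_logprobs,
    ].count(True) <= 1

    # These modes all rely on the dynamic prompt cache, so it must not be disabled.
    if not args.disable_chunked_prefill:
        assert args.disable_dynamic_prompt_cache is False
    if args.output_constraint_mode != "none":
        assert args.disable_dynamic_prompt_cache is False
    if args.token_healing_mode:
        assert args.disable_dynamic_prompt_cache is False

# Illustrative args object (not the real parsed namespace).
args = SimpleNamespace(
    disable_chunked_prefill=False,
    diverse_mode=False,
    use_reward_model=False,
    return_all_prompt_logprobs=False,
    disable_dynamic_prompt_cache=False,
    output_constraint_mode="none",
    token_healing_mode=True,
)
check_start_args(args)  # passes: token healing with chunked prefill and the cache enabled
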
lightllm/server/core/objs/req.py

Lines changed: 23 additions & 23 deletions
@@ -272,29 +272,6 @@ def get_first_router_need_tokens(self):
         return self.input_len + self.shm_cur_output_len


-class TokenHealingReq(NormalReq):
-    _pack_ = 4
-
-    def post_init(
-        self,
-    ):
-        for prefix_token_num in range(2, -1, -1):
-            if self.input_len > prefix_token_num:
-                self.input_len -= prefix_token_num
-                self.prefix_token_ids.set_token_ids(
-                    self.shm_prompt_ids.arr[self.input_len : (self.input_len + prefix_token_num)]
-                )
-                break
-
-        # Because the original output-token budget is partly consumed by the decode steps
-        # spent on the in-between prefix completion, a few extra decode steps are added by
-        # default. In token healing mode the estimated lifetime of the generated tokens can
-        # be inaccurate, so to ease the resulting memory-estimation problems for the
-        # scheduler, the generated-token length is extended by the prefix size + 6.
-        self.sample_params.max_new_tokens = self.sample_params.max_new_tokens + self.prefix_token_ids.size + 6
-        return
-
-
 class ChunkedPrefillReq(Req):
     _pack_ = 4

@@ -333,3 +310,26 @@ def get_decode_need_tokens(self):
     def get_first_router_need_tokens(self):

         return min(self.input_len + self.shm_cur_output_len, self.chunked_prefill_size)
+
+
+class TokenHealingReq(ChunkedPrefillReq):
+    _pack_ = 4
+
+    def post_init(
+        self,
+    ):
+        for prefix_token_num in range(2, -1, -1):
+            if self.input_len > prefix_token_num:
+                self.input_len -= prefix_token_num
+                self.prefix_token_ids.set_token_ids(
+                    self.shm_prompt_ids.arr[self.input_len : (self.input_len + prefix_token_num)]
+                )
+                break
+
+        # Because the original output-token budget is partly consumed by the decode steps
+        # spent on the in-between prefix completion, a few extra decode steps are added by
+        # default. In token healing mode the estimated lifetime of the generated tokens can
+        # be inaccurate, so to ease the resulting memory-estimation problems for the
+        # scheduler, the generated-token length is extended by the prefix size + 6.
+        self.sample_params.max_new_tokens = self.sample_params.max_new_tokens + self.prefix_token_ids.size + 6
+        return

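TokenHealingReq now inherits from ChunkedPrefillReq, so token-healing requests go through the chunked-prefill scheduling path; its post_init logic is unchanged: it moves up to the last two prompt tokens into prefix_token_ids so decoding can regenerate ("heal") the boundary, and pads max_new_tokens by the prefix size plus 6. A small sketch of just the trimming rule on a plain Python list follows; the function name and list-based types are illustrative stand-ins, not the shared-memory request object.

def split_healing_prefix(prompt_ids, max_prefix=2):
    # Pick the largest prefix_token_num in {2, 1, 0} that still leaves a
    # non-empty prompt, shorten the prompt by that many tokens, and keep the
    # removed tail as the healing prefix.
    for prefix_token_num in range(max_prefix, -1, -1):
        if len(prompt_ids) > prefix_token_num:
            input_len = len(prompt_ids) - prefix_token_num
            return prompt_ids[:input_len], prompt_ids[input_len:input_len + prefix_token_num]
    return prompt_ids, []

print(split_healing_prefix([11, 12, 13, 14, 15]))  # ([11, 12, 13], [14, 15])
print(split_healing_prefix([11, 12]))              # ([11], [12])
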
lightllm/server/core/objs/start_args_type.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ class StartArgs:
     router_max_new_token_len: int = field(default=1024)
     router_max_wait_tokens: int = field(default=6)
     disable_aggressive_schedule: bool = field(default=False)
-    use_dynamic_prompt_cache: bool = field(default=False)
+    disable_dynamic_prompt_cache: bool = field(default=False)
     chunked_prefill_size: int = field(default=8192)
     disable_chunked_prefill: bool = field(default=False)
     diverse_mode: bool = field(default=False)

lightllm/server/router/manager.py

Lines changed: 3 additions & 3 deletions
@@ -163,7 +163,7 @@ async def wait_to_model_ready(self):
                 "is_token_healing": self.args.token_healing_mode,
                 "return_all_prompt_logprobs": self.args.return_all_prompt_logprobs,
                 "use_reward_model": self.args.use_reward_model,
-                "use_dynamic_prompt_cache": self.args.use_dynamic_prompt_cache,
+                "disable_dynamic_prompt_cache": self.args.disable_dynamic_prompt_cache,
                 "data_type": self.args.data_type,
                 "eos_id": self.eos_id,
                 "diverse_mode": self.args.diverse_mode,
@@ -182,7 +182,7 @@ async def wait_to_model_ready(self):
         if self.max_total_token_num is None:
             self.max_total_token_num = await self.model_rpc_client.get_max_total_token_num()
         self.args.max_total_token_num = self.max_total_token_num
-        if self.args.use_dynamic_prompt_cache:
+        if not self.args.disable_dynamic_prompt_cache:
             self.radix_cache_client = RadixCacheReadOnlyClient(
                 get_unique_server_name(),
                 self.max_total_token_num,
@@ -425,7 +425,7 @@ def _can_decode(self, batch: Batch, dp_index: int):
         )

     def get_used_tokens(self, dp_index):
-        if self.args.use_dynamic_prompt_cache:
+        if not self.args.disable_dynamic_prompt_cache:
             return (
                 self.max_total_token_num
                 - self.read_only_statics_mem_manager.get_unrefed_token_num(dp_index)

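The router-side changes are a mechanical inversion of the old flag: every `if self.args.use_dynamic_prompt_cache` becomes `if not self.args.disable_dynamic_prompt_cache`, so the radix-cache client and the cache-aware token accounting are used unless the cache was explicitly disabled. A rough sketch of that branching follows; the class and the fallback accounting are illustrative, since the diff only shows the beginning of the cache-enabled expression.

class UsedTokenCounter:
    def __init__(self, max_total_token_num, disable_dynamic_prompt_cache):
        self.max_total_token_num = max_total_token_num
        self.disable_dynamic_prompt_cache = disable_dynamic_prompt_cache

    def get_used_tokens(self, unrefed_token_num, batch_used_tokens):
        if not self.disable_dynamic_prompt_cache:
            # Cache enabled: everything that is not currently un-referenced in
            # the shared memory manager counts as used.
            return self.max_total_token_num - unrefed_token_num
        # Cache disabled: fall back to the batch's own bookkeeping (illustrative).
        return batch_used_tokens

counter = UsedTokenCounter(max_total_token_num=400000, disable_dynamic_prompt_cache=False)
print(counter.get_used_tokens(unrefed_token_num=350000, batch_used_tokens=42000))  # 50000
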
lightllm/server/router/model_infer/mode_backend/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -3,12 +3,12 @@
 from .continues_batch.impl_for_reward_model import RewardModelBackend
 from .chunked_prefill.impl import ChunkedPrefillBackend
 from .diverse_backend.impl import DiversehBackend
-from .continues_batch.impl_for_token_healing import TokenHealingBackend
-from .continues_batch.impl_for_outlines_constraint_mode import OutlinesConstraintBackend
+from .chunked_prefill.impl_for_token_healing import TokenHealingBackend
+from .chunked_prefill.impl_for_outlines_constraint_mode import OutlinesConstraintBackend
 from .chunked_prefill.impl_for_first_token_constraint_mode import FirstTokenConstraintBackend
 from .dp_backend.impl import DPChunkedPrefillBackend
 from .continues_batch.pd_mode.prefill_node_impl.prefill_impl import ChunckedPrefillForPrefillNode
 from .continues_batch.pd_mode.decode_node_impl.decode_impl import ContinuesBatchBackendForDecodeNode
-from .continues_batch.impl_for_xgrammar_mode import XgrammarBackend
+from .chunked_prefill.impl_for_xgrammar_mode import XgrammarBackend
 from .continues_batch.pd_mode.prefill_node_impl.prefill_impl_for_dp_chuncked import DPChunkedForPrefillNode
 from .continues_batch.pd_mode.decode_node_impl.decode_impl_for_dp import DPForDecodeNode

lightllm/server/router/model_infer/mode_backend/base_backend.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ def init_model(self, kvargs):
         self.disable_chunked_prefill = kvargs.get("disable_chunked_prefill", False)
         self.chunked_prefill_size = kvargs.get("chunked_prefill_size", None)
         self.return_all_prompt_logprobs = kvargs.get("return_all_prompt_logprobs", False)
-        self.use_dynamic_prompt_cache = kvargs.get("use_dynamic_prompt_cache", False)
+        self.use_dynamic_prompt_cache = not kvargs.get("disable_dynamic_prompt_cache", False)
         self.eos_id: List[int] = kvargs.get("eos_id", [2])
         self.disable_cudagraph = kvargs.get("disable_cudagraph", False)

lightllm/server/router/model_infer/mode_backend/continues_batch/impl_for_outlines_constraint_mode.py renamed to lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_for_outlines_constraint_mode.py

Lines changed: 65 additions & 58 deletions
@@ -1,10 +1,9 @@
 import os
 import shutil
 import torch
-from .impl import ContinuesBatchBackend
-from lightllm.utils.infer_utils import calculate_time, mark_start, mark_end
+from .impl import ChunkedPrefillBackend
 from lightllm.server.core.objs import FinishStatus
-from lightllm.server.router.model_infer.infer_batch import g_infer_context, InferReq, InferSamplingParams
+from lightllm.server.router.model_infer.infer_batch import g_infer_context, InferReq
 from lightllm.server.router.model_infer.mode_backend.generic_pre_process import (
     prepare_prefill_inputs,
     prepare_decode_inputs,
@@ -17,7 +16,7 @@
 logger = init_logger(__name__)


-class OutlinesConstraintBackend(ContinuesBatchBackend):
+class OutlinesConstraintBackend(ChunkedPrefillBackend):
     def __init__(self) -> None:
         super().__init__()

@@ -45,63 +44,23 @@ def init_custom(self):
         logger.info(f"eos_ids {self.tokenizer.eos_token_ids}")
         return

-    def prefill(self, reqs: List[Tuple]):
-
-        req_ids = self._init_reqs(reqs)
-
-        # import here; when this mode is not used, the code still runs without these dependencies
-        from outlines.fsm.guide import RegexGuide
-
-        req_objs = self._trans_req_ids_to_req_objs(req_ids)
-        kwargs, run_reqs = prepare_prefill_inputs(req_objs, is_chuncked_mode=False, is_multimodal=self.is_multimodal)
-        run_reqs: List[InferReq] = run_reqs
-
-        logics = self.model.forward(**kwargs)
-
-        # For logit positions that cannot satisfy the prefix match, set the logits to a large negative value so their probability is masked to 0
-        mask = torch.ones_like(logics, dtype=torch.bool)
-        for i, run_obj in enumerate(run_reqs):
-            run_obj: InferReq = run_obj
-            sample_params = run_obj.sampling_param
-            if sample_params.regular_constraint is not None:
-                sample_params.regex_guide = RegexGuide.from_regex(sample_params.regular_constraint, self.tokenizer)
-            self._mask_req_out_token(i, run_obj, mask)
-
-        logics[mask] = -1000000.0
-
-        next_token_ids, next_token_probs = sample(logics, run_reqs, self.eos_id)
-        next_token_ids = next_token_ids.detach().cpu().numpy()
-        next_token_logprobs = torch.log(next_token_probs).detach().cpu().numpy()
-
-        self._post_handle(
-            run_reqs,
-            next_token_ids,
-            next_token_logprobs,
-            is_chuncked_mode=False,
-            do_filter_finished_reqs=False,
-            extra_post_req_handle_func=self._update_state_fsm,
-        )
-
-        return
-
     def decode(self):
         uninit_reqs, aborted_reqs, ok_finished_reqs, prefill_reqs, decode_reqs = self._get_classed_reqs(
             g_infer_context.infer_req_ids
         )
-        assert len(uninit_reqs) == 0
-        assert len(prefill_reqs) == 0

         if aborted_reqs:
             g_infer_context.filter_reqs(aborted_reqs)

+        # decode first
         if decode_reqs:
             kwargs, run_reqs = prepare_decode_inputs(decode_reqs)
-            run_reqs: List[InferReq] = run_reqs
-
             logits = self.model.forward(**kwargs)
+            self._overlap_req_init_and_filter(
+                uninit_reqs=uninit_reqs, ok_finished_reqs=ok_finished_reqs, clear_list=True
+            )

-            self._overlap_req_init_and_filter(uninit_reqs=[], ok_finished_reqs=ok_finished_reqs, clear_list=True)
-
+            self._init_guide_infos(run_reqs)
             all_has_no_constraint = all([not e.sampling_param.has_constraint_setting() for e in run_reqs])
             if not all_has_no_constraint:
                 mask = torch.ones_like(logits, dtype=torch.bool)
@@ -112,7 +71,6 @@ def decode(self):
             next_token_ids, next_token_probs = sample(logits, run_reqs, self.eos_id)
             next_token_ids = next_token_ids.detach().cpu().numpy()
             next_token_logprobs = torch.log(next_token_probs).detach().cpu().numpy()
-
             self._post_handle(
                 run_reqs,
                 next_token_ids,
@@ -121,8 +79,42 @@ def decode(self):
                 do_filter_finished_reqs=False,
                 extra_post_req_handle_func=self._update_state_fsm,
             )
+            logits = None
+
+        # then prefill
+        if len(decode_reqs) == 0 or (self.forward_step % self.max_wait_step == 0) or (self.need_prefill_count > 0):
+            if prefill_reqs:
+                self.need_prefill_count -= 1
+                kwargs, run_reqs = prepare_prefill_inputs(
+                    prefill_reqs, is_chuncked_mode=True, is_multimodal=self.is_multimodal
+                )
+                logits = self.model.forward(**kwargs)
+                self._overlap_req_init_and_filter(
+                    uninit_reqs=uninit_reqs, ok_finished_reqs=ok_finished_reqs, clear_list=True
+                )
+                # For logit positions that cannot satisfy the prefix match, set the logits to a large negative value so their probability is masked to 0
+                self._init_guide_infos(run_reqs)
+                mask = torch.ones_like(logits, dtype=torch.bool)
+                for i, run_obj in enumerate(run_reqs):
+                    self._mask_req_out_token(i, run_obj, mask)

-        self._overlap_req_init_and_filter(uninit_reqs=[], ok_finished_reqs=ok_finished_reqs, clear_list=True)
+                logits[mask] = -1000000.0
+
+                next_token_ids, next_token_probs = sample(logits, run_reqs, self.eos_id)
+                next_token_ids = next_token_ids.detach().cpu().numpy()
+                next_token_logprobs = torch.log(next_token_probs).detach().cpu().numpy()
+                self._post_handle(
+                    run_reqs,
+                    next_token_ids,
+                    next_token_logprobs,
+                    is_chuncked_mode=True,
+                    do_filter_finished_reqs=False,
+                    extra_post_req_handle_func=self._update_state_fsm,
+                )
+                logits = None
+
+        self._overlap_req_init_and_filter(uninit_reqs=uninit_reqs, ok_finished_reqs=ok_finished_reqs, clear_list=True)
+        self.forward_step += 1
         return

     def _update_state_fsm(self, req_obj: InferReq, next_token_id, next_token_logprob):
@@ -138,13 +130,28 @@ def _update_state_fsm(self, req_obj: InferReq, next_token_id, next_token_logprob
     def _mask_req_out_token(self, i, run_obj: InferReq, mask):
         from outlines.fsm.guide import RegexGuide

-        sample_params = run_obj.sampling_param
-        if sample_params.regular_constraint is not None:
-            regex_guide: RegexGuide = sample_params.regex_guide
-            ok_token_id_list = regex_guide.get_next_instruction(sample_params.fsm_current_state).tokens
-            mask[i, ok_token_id_list] = False
-        elif sample_params.allowed_token_ids is not None:
-            mask[i, sample_params.allowed_token_ids] = False
+        if run_obj.get_chuncked_input_token_len() == run_obj.get_cur_total_len():
+            # this run_obj is ready to gen next token.
+            sample_params = run_obj.sampling_param
+            if sample_params.regular_constraint is not None:
+                regex_guide: RegexGuide = sample_params.regex_guide
+                ok_token_id_list = regex_guide.get_next_instruction(sample_params.fsm_current_state).tokens
+                mask[i, ok_token_id_list] = False
+            elif sample_params.allowed_token_ids is not None:
+                mask[i, sample_params.allowed_token_ids] = False
+            else:
+                mask[i, :] = False
         else:
+            # no constraint
             mask[i, :] = False
         return
+
+    def _init_guide_infos(self, run_reqs: List[InferReq]):
+        from outlines.fsm.guide import RegexGuide
+
+        for i, run_obj in enumerate(run_reqs):
+            run_obj: InferReq = run_obj
+            sample_params = run_obj.sampling_param
+            if sample_params.regular_constraint is not None:
+                if not hasattr(sample_params, "regex_guide"):
+                    sample_params.regex_guide = RegexGuide.from_regex(sample_params.regular_constraint, self.tokenizer)

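The rewritten backend follows ChunkedPrefillBackend's decode-then-prefill loop: decode requests run first, a chunked prefill pass runs when there is nothing to decode (or every max_wait_step steps, or while need_prefill_count is positive), and _mask_req_out_token only applies the regex/allow-list constraint once a request's whole prompt has been prefilled, since a mid-chunk request is not yet sampling a real output token. Below is a condensed sketch of that masking decision using plain-Python stand-ins for the model outputs and request state; all names here are illustrative, not lightllm's real InferReq or model interfaces.

NEG_INF = -1000000.0

def constraint_mask(prefilled_len, total_len, allowed_token_ids, vocab_size):
    # True means "mask this token out". Only constrain a request whose prompt is
    # fully prefilled; a request still mid chunked-prefill keeps its logits as-is.
    if prefilled_len == total_len and allowed_token_ids is not None:
        mask = [True] * vocab_size
        for tid in allowed_token_ids:
            mask[tid] = False
        return mask
    return [False] * vocab_size

def apply_mask(logits, mask):
    return [NEG_INF if m else x for x, m in zip(logits, mask)]

logits = [0.1, 0.5, 0.2, 0.9]
print(apply_mask(logits, constraint_mask(8, 8, [1, 3], 4)))  # [-1000000.0, 0.5, -1000000.0, 0.9]
print(apply_mask(logits, constraint_mask(4, 8, [1, 3], 4)))  # [0.1, 0.5, 0.2, 0.9] (still prefilling)
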