
Commit fcc8540

Author: niushengxiao
Parent: c9e31b3

feat: move stop string matching to detokenization

14 files changed: +106 -54 lines

lightllm/server/api_openai.py
1 addition, 0 deletions

@@ -539,6 +539,7 @@ async def _collect_generation_results(
             earliest_stop_index = actual_stop_index

     if earliest_stop_index < len(final_text):
+        logger.info(f"removed stop sequence in tail: '{final_text[earliest_stop_index:]}'")
         final_text = final_text[:earliest_stop_index]

     return {
lightllm/server/core/objs/io_objs/__init__.py
1 addition, 1 deletion

@@ -1 +1 @@
-from .group_req import GroupReqIndexes, GroupReqObjs, AbortedReqCmd
+from .group_req import GroupReqIndexes, GroupReqObjs, AbortedReqCmd, StopStrMatchedReqCmd

lightllm/server/core/objs/io_objs/group_req.py
5 additions, 0 deletions

@@ -31,3 +31,8 @@ def to_group_req_index(self):
 @dataclass
 class AbortedReqCmd:
     req_id: int
+
+
+@dataclass
+class StopStrMatchedReqCmd:
+    req_id: int

lightllm/server/core/objs/req.py
7 additions, 1 deletion

@@ -32,6 +32,9 @@ def get_status(self):
     def is_finished(self):
         return self.FINISHED_STOP <= self.status <= self.FINISHED_LENGTH

+    def is_stoped(self):
+        return self.status == self.FINISHED_STOP
+
     def get_finish_reason(self):
         if self.status == self.FINISHED_STOP:
             return "stop"

@@ -97,6 +100,8 @@ class Req(ctypes.Structure):
         ("mtp_accepted_token_num", ctypes.c_int),
         # mtp_step stores a constant used by mtp for fast access; it is not initialized from external input
         ("_mtp_step", ctypes.c_int),
+        # stop_str_matched marks whether a stop string has been matched
+        ("stop_str_matched", ctypes.c_bool),
     ]

     def get_str(self):

@@ -150,6 +155,7 @@ def init(
         self.shm_prompt_ids.arr[0 : len(prompt_ids)] = prompt_ids
         self.mtp_accepted_token_num = 0
         self._mtp_step = get_env_start_args().mtp_step
+        self.stop_str_matched = False

         self.post_init()

@@ -207,7 +213,7 @@ def can_release(self):
         ref_count_ok = self.ref_count == 1
         can_released_mark = self.can_released_mark

-        if self.is_aborted and can_released_mark and ref_count_ok:
+        if (self.is_aborted or self.stop_str_matched) and can_released_mark and ref_count_ok:
             return True

         if self.finish_status.is_finished() and can_released_mark and ref_count_ok and self.out_tokens_queue.is_empty():

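With the new shared-memory flag, can_release treats a stop-string match the same way as an abort: the request slot may be reclaimed once the release mark is set and only one reference remains. A minimal sketch of that fast-path condition, assuming simplified stand-in fields (ReqState and can_release_early are hypothetical; the real Req is a ctypes structure in shared memory):

from dataclasses import dataclass


@dataclass
class ReqState:
    # Simplified stand-ins for the shared-memory fields consulted by Req.can_release().
    is_aborted: bool = False
    stop_str_matched: bool = False
    can_released_mark: bool = False
    ref_count: int = 1


def can_release_early(req: ReqState) -> bool:
    # Mirrors the changed fast path: a stop-string-matched request is released
    # the same way an aborted one is, once the release mark is set and the
    # reference count has dropped back to one.
    ref_count_ok = req.ref_count == 1
    return (req.is_aborted or req.stop_str_matched) and req.can_released_mark and ref_count_ok


print(can_release_early(ReqState(stop_str_matched=True, can_released_mark=True)))   # True
print(can_release_early(ReqState(stop_str_matched=True, can_released_mark=False)))  # False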
lightllm/server/detokenization/decode_req.py
8 additions, 1 deletion

@@ -36,7 +36,11 @@ def init_token_healing_prefix_str(self, token_id_to_token: Dict[int, str], token
         return

     def need_detoken(self):
-        if (not self.req.is_aborted) and len(self.output_ids) < self.req.candetoken_out_len:
+        if (
+            (not self.req.is_aborted)
+            and (not self.req.stop_str_matched)
+            and len(self.output_ids) < self.req.candetoken_out_len
+        ):
             return True
         return False

@@ -55,6 +59,9 @@ def get_decode_tokens(self):
     def can_set_release_mark(self):
         if self.req.is_aborted:
             return True
+        if self.req.stop_str_matched:
+            # the httpserver must finish handling the request before it can be released here
+            return self.req.out_tokens_queue.is_empty()
         if (
             self.req.finish_status.is_finished()
             and self.req.candetoken_out_len == len(self.output_ids)

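The detokenizer-side gating has one ordering subtlety: a stop-string-matched request may still have text sitting in out_tokens_queue that the httpserver has not streamed yet, so the release mark is deferred until that queue drains. A toy illustration of the same gating, with FakeShmReq and a deque as hypothetical stand-ins for the real shared-memory request and token queue:

from collections import deque


class FakeShmReq:
    # Minimal stand-in for the shared-memory Req consulted by DecodeReq.can_set_release_mark().
    def __init__(self):
        self.is_aborted = False
        self.stop_str_matched = False
        self.out_tokens_queue = deque()


def can_set_release_mark(req: FakeShmReq) -> bool:
    if req.is_aborted:
        return True
    if req.stop_str_matched:
        # the httpserver must drain the remaining streamed text before release
        return len(req.out_tokens_queue) == 0
    return False  # the normal finished-request path is omitted in this sketch


req = FakeShmReq()
req.stop_str_matched = True
req.out_tokens_queue.append("pending tail text")
print(can_set_release_mark(req))  # False: text is still waiting to be streamed
req.out_tokens_queue.popleft()
print(can_set_release_mark(req))  # True: safe to mark for release now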
lightllm/server/detokenization/manager.py
36 additions, 0 deletions

@@ -101,6 +101,32 @@ def handle_loop(self):
             logger.exception(f"detoken process has exception {str(e)}")
             return

+    def _stop_sequences_str_matched(self, decode_req, tokenizer):
+        stop_sequences_str = (
+            decode_req.req.sample_params.stop_sequences.to_string()
+            if decode_req.req.sample_params.stop_sequences
+            else []
+        )
+        if not stop_sequences_str or tokenizer is None:
+            return False
+
+        max_stop_str_len = max(len(stop_str) for stop_str in stop_sequences_str) if stop_sequences_str else 0
+        if max_stop_str_len == 0:
+            return False
+
+        output_len = len(decode_req.output_ids)
+        tail_token_len = min(decode_req.req.input_len + output_len, max_stop_str_len + 10)  # +10 for safety
+        if tail_token_len > 0:
+            tail_token_ids = decode_req.req.shm_prompt_ids.arr[
+                (decode_req.req.input_len + output_len - tail_token_len) : (decode_req.req.input_len + output_len)
+            ]
+            tail_str = tokenizer.decode(tail_token_ids, skip_special_tokens=False)
+            for stop_str in stop_sequences_str:
+                if stop_str in tail_str:
+                    logger.info(f"Found stop sequence in tail: stop_str='{stop_str}', " f"tail_str='{tail_str}'")
+                    return True
+        return False
+
     def gen_token_out(self):
         exist_need_detoken = False
         exist_decode = False

@@ -111,6 +137,9 @@ def gen_token_out(self):
             special = new_token_id in self.all_special_ids
             count_output_tokens = len(decode_req.output_ids)

+            if decode_req.req.stop_str_matched:
+                continue
+
             exist_decode = True
             new_text = decode_token(
                 self.tokenizer,

@@ -131,6 +160,13 @@ def gen_token_out(self):
                     logger.error(
                         f"error token healing state, prefix_str {decode_req.prefix_str} new_text {new_text}"
                     )
+
+            # stop string matching
+            if not decode_req.req.finish_status.is_stoped() and self._stop_sequences_str_matched(
+                decode_req, self.tokenizer
+            ):
+                decode_req.req.stop_str_matched = True
+
             decode_req.req.out_tokens_queue.push(new_text, src_index, special, count_output_tokens)

             if decode_req.need_detoken():

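The matching logic itself is unchanged, only relocated from the inference worker into the detokenization process: it decodes just the last max_stop_str_len + 10 tokens and looks for any stop string inside that tail, which also catches stop strings that straddle token boundaries. Below is a self-contained sketch of the same bounded-tail idea; CharTokenizer and stop_str_matched are hypothetical stand-ins so the example runs without a real tokenizer, and the "+ 10" simply mirrors the commit's own "+10 for safety" margin:

from typing import List, Sequence


class CharTokenizer:
    """Hypothetical stand-in: one token id per character, just to make the sketch runnable."""

    def decode(self, token_ids: Sequence[int], skip_special_tokens: bool = False) -> str:
        return "".join(chr(t) for t in token_ids)


def stop_str_matched(all_token_ids: List[int], stop_strs: List[str], tokenizer) -> bool:
    if not stop_strs:
        return False
    max_stop_str_len = max(len(s) for s in stop_strs)
    if max_stop_str_len == 0:
        return False
    # Decode only a bounded tail of the sequence; the "+ 10" mirrors the safety
    # margin used in the commit ("# +10 for safety").
    tail_len = min(len(all_token_ids), max_stop_str_len + 10)
    tail_str = tokenizer.decode(all_token_ids[len(all_token_ids) - tail_len:], skip_special_tokens=False)
    return any(s in tail_str for s in stop_strs)


tokens = [ord(c) for c in "The answer is 42.\nObservation:"]
print(stop_str_matched(tokens, ["Observation:"], CharTokenizer()))  # True
print(stop_str_matched(tokens, ["</answer>"], CharTokenizer()))     # False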
lightllm/server/httpserver/manager.py
12 additions, 3 deletions

@@ -661,7 +661,12 @@ async def handle_loop(self):
                 for _ in range(read_token_count):
                     if not req.out_tokens_queue.is_empty():

-                        text, src_index, special, count_output_tokens = req.out_tokens_queue.peek()
+                        (
+                            text,
+                            src_index,
+                            special,
+                            count_output_tokens,
+                        ) = req.out_tokens_queue.peek()
                         req.cumlogprob += float(req.shm_logprobs.arr[src_index])
                         metadata = {
                             "id": int(req.shm_prompt_ids.arr[src_index]),

@@ -679,10 +684,14 @@ async def handle_loop(self):

                         req.out_tokens_queue.pop_no_ret()

-                        if req.finish_token_index != src_index:
+                        if not req.stop_str_matched and req.finish_token_index != src_index:
                             token_list.append((req_id, text, metadata, FinishStatus()))
                         else:
-                            finish_status = FinishStatus(req.finish_status.status)
+                            finish_status = FinishStatus(
+                                req.finish_status.FINISHED_STOP
+                                if req.stop_str_matched
+                                else req.finish_status.status
+                            )
                             token_list.append((req_id, text, metadata, finish_status))
                     else:
                         break

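On the streaming side, a request whose output was cut by the detokenizer must still be reported to the client with finish reason "stop", even though the inference worker never set FINISHED_STOP itself. A toy sketch of that decision per drained token (the integer constants and status_for_streamed_token are illustrative, not the real FinishStatus API):

# Illustrative constants; the real FinishStatus is a ctypes-backed shared-memory object.
NO_FINISH = 0
FINISHED_STOP = 1
FINISHED_LENGTH = 2


def status_for_streamed_token(stop_str_matched: bool, is_finish_token: bool, worker_status: int) -> int:
    # Mirrors the changed branch: a stop-string-matched request is finalized on the
    # next token the httpserver drains, instead of waiting for finish_token_index,
    # and it is reported as FINISHED_STOP regardless of the worker's own status.
    if not stop_str_matched and not is_finish_token:
        return NO_FINISH
    return FINISHED_STOP if stop_str_matched else worker_status


print(status_for_streamed_token(False, False, NO_FINISH))       # 0: ordinary streamed token
print(status_for_streamed_token(False, True, FINISHED_LENGTH))  # 2: normal finish, reason "length"
print(status_for_streamed_token(True, False, NO_FINISH))        # 1: detokenizer matched a stop string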
lightllm/server/router/manager.py
22 additions, 1 deletion

@@ -15,7 +15,7 @@
 from .batch import Batch, Req
 from .model_infer.model_rpc import start_model_process, ModelRpcClient
 from .req_queue import build_req_queue
-from lightllm.server.core.objs.io_objs import GroupReqIndexes, AbortedReqCmd
+from lightllm.server.core.objs.io_objs import GroupReqIndexes, AbortedReqCmd, StopStrMatchedReqCmd
 from lightllm.server.core.objs import ShmReqManager, StartArgs
 from .dynamic_prompt.radix_cache import RadixCacheReadOnlyClient
 from .shm_reqs_io_buffer import ShmReqsIOBuffer

@@ -277,8 +277,11 @@ async def _step(self):

         self._filter_reqs_from_running_batch()
         aborted_reqs = self._get_aborted_reqs_from_running_batch()
+        stop_str_matched_reqs = self._get_stop_str_reqs_from_running_batch()
         if aborted_reqs:
             await self._aborted_reqs(aborted_reqs=aborted_reqs)
+        if stop_str_matched_reqs:
+            await self._stop_str_matched_reqs(stop_str_matched_reqs=stop_str_matched_reqs)
         return

     async def _add_batch(self, batch: Batch):

@@ -301,6 +304,15 @@ async def _aborted_reqs(self, aborted_reqs: List[Req]):
         self.shm_reqs_io_buffer.set_ready()
         return

+    async def _stop_str_matched_reqs(self, stop_str_matched_reqs: List[Req]):
+        cmds = [StopStrMatchedReqCmd(req_id=r.request_id) for r in stop_str_matched_reqs]
+        while not self.shm_reqs_io_buffer.is_empty():
+            await asyncio.sleep(0.02)
+
+        self.shm_reqs_io_buffer.write_obj(cmds)
+        self.shm_reqs_io_buffer.set_ready()
+        return
+
     def _add_new_batch_to_running_batch(self, new_batch: Batch):
         if self.running_batch is None:
             self.running_batch = new_batch

@@ -325,6 +337,15 @@ def _get_aborted_reqs_from_running_batch(self) -> List[Req]:
             ans.append(req)
         return ans

+    def _get_stop_str_reqs_from_running_batch(self) -> List[Req]:
+        ans = []
+        if self.running_batch is None:
+            return ans
+        for req in self.running_batch.reqs:
+            if req.stop_str_matched:
+                ans.append(req)
+        return ans
+
     def _get_paused_req_num(self) -> int:
         if self.running_batch is None:
             return 0

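The router's part is only to relay the flag: on each scheduling step it collects running requests whose stop_str_matched bit is set and writes a batch of StopStrMatchedReqCmd into the shared command buffer, mirroring the existing abort path. A minimal asyncio sketch of that relay, with FakeReq and FakeCmdBuffer as hypothetical in-memory stand-ins for the real request objects and ShmReqsIOBuffer:

import asyncio
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class StopStrMatchedReqCmd:
    req_id: int


@dataclass
class FakeReq:
    request_id: int
    stop_str_matched: bool = False


class FakeCmdBuffer:
    """In-memory stand-in for the shared-memory command buffer."""

    def __init__(self):
        self._obj: Optional[list] = None

    def is_empty(self) -> bool:
        return self._obj is None

    def write_obj(self, obj: list) -> None:
        self._obj = obj

    def set_ready(self) -> None:
        print("ready:", self._obj)


async def relay_stop_str_matched(running_reqs: List[FakeReq], buf: FakeCmdBuffer) -> None:
    matched = [r for r in running_reqs if r.stop_str_matched]
    if not matched:
        return
    cmds = [StopStrMatchedReqCmd(req_id=r.request_id) for r in matched]
    while not buf.is_empty():  # wait until the previous command batch has been consumed
        await asyncio.sleep(0.02)
    buf.write_obj(cmds)
    buf.set_ready()


asyncio.run(relay_stop_str_matched([FakeReq(1), FakeReq(2, stop_str_matched=True)], FakeCmdBuffer()))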
lightllm/server/router/model_infer/infer_batch.py
3 additions, 27 deletions

@@ -319,7 +319,6 @@ def _init_all_state(self):
         g_infer_context.req_manager.req_sampling_params_manager.init_req_sampling_params(self)

         self.stop_sequences = self.sampling_param.shm_param.stop_sequences.to_list()
-        self.stop_sequences_str = self.sampling_param.shm_param.stop_sequences.to_string()
         # management object used only in token healing mode
         if self.shm_req.prefix_token_ids.size != 0:
             self.prefix_token_ids = self.shm_req.prefix_token_ids.get_token_ids()

@@ -380,10 +379,8 @@ def update_mtp_accepted_token_num(self, accept_token_num: int):
     def get_last_gen_token(self):
         return self.shm_req.shm_prompt_ids.arr[self.shm_req.input_len + self.cur_output_len - 1]

-    def update_finish_status(self, eos_ids, output_len: int, tokenizer=None):
-        if self._stop_sequences_matched(output_len=output_len) or self._stop_sequences_str_matched(
-            tokenizer, output_len
-        ):
+    def update_finish_status(self, eos_ids, output_len: int):
+        if self._stop_sequences_matched(output_len=output_len):
             self.finish_status.set_status(FinishStatus.FINISHED_STOP)
         elif (
             output_len > 0

@@ -408,26 +405,6 @@ def _stop_sequences_matched(self, output_len: int):
             return True
         return False

-    def _stop_sequences_str_matched(self, tokenizer, output_len):
-        if not self.stop_sequences_str or tokenizer is None:
-            return False
-
-        max_stop_str_len = max(len(stop_str) for stop_str in self.stop_sequences_str) if self.stop_sequences_str else 0
-        if max_stop_str_len == 0:
-            return False
-
-        tail_token_len = min(self.shm_req.input_len + output_len, max_stop_str_len + 10)  # +10 for safety
-        if tail_token_len > 0:
-            tail_token_ids = self.shm_req.shm_prompt_ids.arr[
-                (self.shm_req.input_len + output_len - tail_token_len) : (self.shm_req.input_len + output_len)
-            ]
-            tail_str = tokenizer.decode(tail_token_ids, skip_special_tokens=False)
-            for stop_str in self.stop_sequences_str:
-                if stop_str in tail_str:
-                    logger.info(f"Found stop sequence in tail: stop_str='{stop_str}', tail_str='{tail_str}'")
-                    return True
-        return False
-
     def prefill_need_token_num(self, is_chuncked_prefill: bool):
         if is_chuncked_prefill:
             input_token_ids = self.get_chuncked_input_token_ids()

@@ -506,7 +483,6 @@ def handle(
         eos_ids: List[int],
         extra_post_req_handle_func: Optional[Callable[[InferReq, int, float], None]],
         is_master_in_dp: bool,
-        tokenizer=None,
     ):
         if self.output_len <= 0:
             return

@@ -528,7 +504,7 @@ def handle(
             return

         # update the request's finished status
-        req_obj.update_finish_status(eos_ids=eos_ids, output_len=self.output_len, tokenizer=tokenizer)
+        req_obj.update_finish_status(eos_ids=eos_ids, output_len=self.output_len)

         if extra_post_req_handle_func is not None:
             extra_post_req_handle_func(req_obj, next_token_id, next_token_logprob)

lightllm/server/router/model_infer/mode_backend/base_backend.py
8 additions, 12 deletions

@@ -19,19 +19,18 @@
 from lightllm.utils.dist_utils import init_distributed_env
 from lightllm.utils.envs_utils import get_unique_server_name
 from lightllm.server.core.objs import ShmReqManager, StartArgs
-from lightllm.server.core.objs.io_objs import AbortedReqCmd
+from lightllm.server.core.objs.io_objs import AbortedReqCmd, StopStrMatchedReqCmd
 from lightllm.server.router.model_infer.infer_batch import g_infer_context
 from lightllm.server.router.model_infer.pin_mem_manager import g_pin_mem_manager
 from lightllm.utils.dist_utils import get_global_rank, get_global_world_size, get_dp_size
 from lightllm.utils.dist_utils import get_dp_world_size, get_global_dp_rank, get_current_rank_in_dp
 from lightllm.utils.dist_utils import get_current_device_id, get_current_rank_in_node, get_node_world_size
 from lightllm.utils.dist_utils import get_dp_rank_in_node, create_new_group_for_current_node
-from lightllm.utils.envs_utils import get_env_start_args, enable_stop_string_match
+from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.distributed import dist_group_manager
 from lightllm.server.router.shm_reqs_io_buffer import ShmReqsIOBuffer
 from lightllm.server.router.model_infer.mode_backend.overlap_events import OverlapEventManager, OverlapEventPack
 from lightllm.models.deepseek_mtp.model import Deepseek3MTPModel
-from lightllm.server.tokenizer import get_tokenizer


 class ModeBackend:

@@ -322,6 +321,12 @@ def _read_reqs_buffer_and_init_reqs(self):
                     if obj.req_id in g_infer_context.requests_mapping:
                         req: InferReq = g_infer_context.requests_mapping[obj.req_id]
                         req.infer_aborted = True
+            elif isinstance(cmds[0], StopStrMatchedReqCmd):
+                for obj in cmds:
+                    obj: StopStrMatchedReqCmd = obj
+                    if obj.req_id in g_infer_context.requests_mapping:
+                        req: InferReq = g_infer_context.requests_mapping[obj.req_id]
+                        req.infer_aborted = True
             else:
                 self._init_reqs(reqs=cmds)
             return

@@ -507,14 +512,6 @@ def _post_handle(
         extra_post_req_handle_func provides extra post-processing once a request's output is determined; it is mainly
         used by constrained-output modes to update the request's internal state machine and add extra stop conditions.
         """
-        if enable_stop_string_match():
-            if not hasattr(self, "tokenizer"):
-                self.tokenizer = get_tokenizer(
-                    self.args.model_dir, self.args.tokenizer_mode, trust_remote_code=self.args.trust_remote_code
-                )
-        else:
-            self.tokenizer = None
-
         for req_obj, next_token_id, next_token_logprob, pack in zip(
             run_reqs, next_token_ids, next_token_logprobs, run_reqs_update_packs
         ):

@@ -526,7 +523,6 @@ def _post_handle(
                 eos_ids=self.eos_id,
                 extra_post_req_handle_func=extra_post_req_handle_func,
                 is_master_in_dp=self.is_master_in_dp,
-                tokenizer=self.tokenizer,
             )

         g_infer_context.req_manager.req_sampling_params_manager.update_reqs_token_counter(

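On the inference backend, the new command is handled exactly like an abort: the request's infer_aborted flag is set so it drops out of scheduling, while the tail already pushed to the output queue is presumably left for the httpserver to finish streaming. A small sketch of the dispatch-by-command-type pattern (the mapping and FakeInferReq are simplified stand-ins for g_infer_context.requests_mapping and InferReq):

from dataclasses import dataclass
from typing import Dict, List, Union


@dataclass
class AbortedReqCmd:
    req_id: int


@dataclass
class StopStrMatchedReqCmd:
    req_id: int


class FakeInferReq:
    def __init__(self):
        self.infer_aborted = False


def apply_cmds(cmds: List[Union[AbortedReqCmd, StopStrMatchedReqCmd]],
               requests_mapping: Dict[int, FakeInferReq]) -> None:
    # A batch holds commands of a single type, so inspecting cmds[0] is enough.
    if cmds and isinstance(cmds[0], (AbortedReqCmd, StopStrMatchedReqCmd)):
        for cmd in cmds:
            req = requests_mapping.get(cmd.req_id)
            if req is not None:
                # both command types make the backend stop scheduling the request
                req.infer_aborted = True


mapping = {5: FakeInferReq()}
apply_cmds([StopStrMatchedReqCmd(req_id=5)], mapping)
print(mapping[5].infer_aborted)  # True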