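fix: remove the Cython fast post-handle path and its build dependency

- base_backend: drop the `_post_handle` dispatcher, `_fast_post_handle`
  (which called `cython_fast_impl`) and the now-unused `import time`; the
  former `_python_post_handle` body becomes `_post_handle`.
- infer_batch: in `update_finish_status`, test `ignore_eos` after the
  EOS-token membership check instead of before it.
- generic_post_process: `sample` returns `batch_next_token_ids.view(-1)`
  instead of the int64 copy.
- requirements.txt / setup.py: remove `Cython`, the `cythonize`
  `ext_modules` entry and `zip_safe=False`.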

hiworldwzj · web-flow · commit 315a495bf0cf · 2025-05-14T14:27:53.000+08:00
diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py
@@ -338,8 +338,8 @@ def update_finish_status(self, eos_ids):
             self.finish_status.set_status(FinishStatus.FINISHED_STOP)
         elif (
             self.cur_output_len > 0
-            and self.sampling_param.shm_param.ignore_eos is False
             and self.get_last_gen_token() in eos_ids
+            and self.sampling_param.shm_param.ignore_eos is False
         ):
             self.finish_status.set_status(FinishStatus.FINISHED_STOP)
         elif self.cur_output_len >= self.sampling_param.shm_param.max_new_tokens:
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -4,7 +4,6 @@
 import rpyc
 import torch
 import socket
-import time
 from datetime import timedelta
 from typing import Dict, List, Tuple, Callable, Optional
 from transformers.configuration_utils import PretrainedConfig
@@ -251,81 +250,6 @@ def _post_handle(
         is_chuncked_mode: bool,
         do_filter_finished_reqs: bool,
         extra_post_req_handle_func: Optional[Callable[[InferReq, int, float], None]] = None,
-    ) -> List[int]:
-        """
-        extra_post_req_handle_func 用于提供在一个请求确定输出的时候，给出额外的后处理操作，主要是用于
-        约束输出等模式，设置自己请求内部的状态机的状态，并添加额外的停止判定条件等。
-        """
-        if not hasattr(self, "_post_handle_impl"):
-            try:
-                finished_req_ids = self._fast_post_handle(
-                    run_reqs,
-                    next_token_ids,
-                    next_token_logprobs,
-                    is_chuncked_mode,
-                    do_filter_finished_reqs,
-                    extra_post_req_handle_func,
-                )
-                self._post_handle_impl = self._fast_post_handle
-                self.logger.info("use _fast_post_handle")
-                return finished_req_ids
-            except:
-                finished_req_ids = self._python_post_handle(
-                    run_reqs,
-                    next_token_ids,
-                    next_token_logprobs,
-                    is_chuncked_mode,
-                    do_filter_finished_reqs,
-                    extra_post_req_handle_func,
-                )
-                self.logger.info("use _python_post_handle")
-                self._post_handle_impl = self._python_post_handle
-                return finished_req_ids
-        else:
-            return self._post_handle_impl(
-                run_reqs,
-                next_token_ids,
-                next_token_logprobs,
-                is_chuncked_mode,
-                do_filter_finished_reqs,
-                extra_post_req_handle_func,
-            )
-
-    def _fast_post_handle(
-        self,
-        run_reqs: List[InferReq],
-        next_token_ids,
-        next_token_logprobs,
-        is_chuncked_mode: bool,
-        do_filter_finished_reqs: bool,
-        extra_post_req_handle_func: Optional[Callable[[InferReq, int, float], None]] = None,
-    ):
-        from . import cython_fast_impl
-
-        start = time.time()
-        finished_req_ids = cython_fast_impl.fast_post_handle(
-            self,
-            run_reqs,
-            next_token_ids,
-            next_token_logprobs,
-            is_chuncked_mode,
-            do_filter_finished_reqs,
-            extra_post_req_handle_func,
-        )
-        cost_time = time.time() - start
-        if self.is_master_in_dp and cost_time > 0.001:
-            self.logger.info(f"post handle cost time {cost_time} s, batch_size: {len(run_reqs)}")
-        return finished_req_ids
-
-    # 一些可以复用的通用功能函数
-    def _python_post_handle(
-        self,
-        run_reqs: List[InferReq],
-        next_token_ids,
-        next_token_logprobs,
-        is_chuncked_mode: bool,
-        do_filter_finished_reqs: bool,
-        extra_post_req_handle_func: Optional[Callable[[InferReq, int, float], None]] = None,
     ) -> List[int]:
         """
         extra_post_req_handle_func 用于提供在一个请求确定输出的时候，给出额外的后处理操作，主要是用于
diff --git a/lightllm/server/router/model_infer/mode_backend/generic_post_process.py b/lightllm/server/router/model_infer/mode_backend/generic_post_process.py
@@ -65,7 +65,7 @@ def sample(logits, reqs, eos_id: List[int] = [2]):
         int64_batch_next_token_ids = torch.empty_like(batch_next_token_ids, dtype=torch.int64)
         int64_batch_next_token_ids[:] = batch_next_token_ids
         batch_next_token_probs = torch.gather(probs, dim=1, index=int64_batch_next_token_ids.view(-1, 1))
-        return int64_batch_next_token_ids.view(-1), batch_next_token_probs.view(-1)
+        return batch_next_token_ids.view(-1), batch_next_token_probs.view(-1)
     else:
         assert False, "dead path"
 
diff --git a/requirements.txt b/requirements.txt
@@ -88,4 +88,3 @@ flashinfer-python==0.2.4
 sgl-kernel
 httpx==0.28.1
 librosa==0.11.0
-Cython
diff --git a/setup.py b/setup.py
@@ -1,5 +1,4 @@
 from setuptools import setup, find_packages
-from Cython.Build import cythonize
 
 package_data = {"lightllm": ["common/all_kernel_configs/*/*.json"]}
 setup(
@@ -29,10 +28,4 @@
         "triton",
     ],
     package_data=package_data,
-    ext_modules=cythonize(
-        [
-            "lightllm/server/router/model_infer/mode_backend/cython_fast_impl.pyx",
-        ]
-    ),
-    zip_safe=False,
 )