ModelTC
diff --git a/‎lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_for_return_all_prompt_logprobs.py‎
Lines changed: 68 additions & 0 deletions b/‎lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_for_return_all_prompt_logprobs.py‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎…continues_batch/impl_for_reward_model.py‎ ‎…chunked_prefill/impl_for_reward_model.py‎lightllm/server/router/model_infer/mode_backend/continues_batch/impl_for_reward_model.py renamed to lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_for_reward_model.py
Lines changed: 14 additions & 33 deletions b/‎…continues_batch/impl_for_reward_model.py‎ ‎…chunked_prefill/impl_for_reward_model.py‎lightllm/server/router/model_infer/mode_backend/continues_batch/impl_for_reward_model.py renamed to lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_for_reward_model.py
Lines changed: 14 additions & 33 deletions
diff --git a/‎lightllm/server/router/model_infer/mode_backend/continues_batch/impl.py‎
Lines changed: 0 additions & 100 deletions b/‎lightllm/server/router/model_infer/mode_backend/continues_batch/impl.py‎
Lines changed: 0 additions & 100 deletions
diff --git a/‎lightllm/server/router/model_infer/mode_backend/continues_batch/impl_mtp.py‎
Lines changed: 0 additions & 136 deletions b/‎lightllm/server/router/model_infer/mode_backend/continues_batch/impl_mtp.py‎
Lines changed: 0 additions & 136 deletions
@@ -0,0 +1,68 @@
+import torch
+from .impl import ChunkedPrefillBackend
+from typing import List
+from lightllm.server.router.model_infer.infer_batch import InferReq
+from lightllm.server.router.model_infer.mode_backend.pre import prepare_prefill_inputs
+from lightllm.server.router.model_infer.mode_backend.generic_post_process import sample
+from lightllm.server.router.model_infer.mode_backend.overlap_events import OverlapEventPack
+
+
+class ReturnPromptLogProbBackend(ChunkedPrefillBackend):  # prefill path that also records prompt-token logprobs
+    def __init__(self) -> None:
+        super().__init__()
+        self.prefill = self.return_all_prompt_logprobs_prefill  # `prefill` now refers to the method defined below
+        return
+    # Runs one prefill forward pass, stores prompt-token logprobs in shm_logprobs (input_len > 1), then samples.
+    def return_all_prompt_logprobs_prefill(
+        self,
+        event_pack: OverlapEventPack,  # not referenced in this method body
+        prefill_reqs: List[InferReq]):
+
+        # In return-all-prompt-logprobs mode the dynamic prompt cache must not be enabled.
+        assert self.radix_cache is None
+        assert self.disable_chunked_prefill is True  # this path also requires chunked prefill to be disabled
+        # Given the assert above, is_chuncked_mode is passed as False.
+        model_input, run_reqs = prepare_prefill_inputs(
+            prefill_reqs, is_chuncked_mode=not self.disable_chunked_prefill, is_multimodal=self.is_multimodal
+        )
+        # NOTE(review): the indexing below treats model_output.logits as one row per input token - confirm.
+        model_output = self.model.forward(model_input)
+        prompt_all_logits = model_output.logits
+
+        input_ids = model_input.input_ids
+        b_ready_cache_len = model_input.b_ready_cache_len
+        b_seq_len = model_input.b_seq_len
+        last_index = torch.cumsum(b_seq_len, dim=0, dtype=torch.long) - 1  # cumulative lengths minus one
+        logits = prompt_all_logits[last_index, :]  # these rows are what gets masked and sampled further down
+        # Slice bounds per request: b_start_loc is the exclusive prefix sum of b_q_seq_len.
+        b_q_seq_len = b_seq_len - b_ready_cache_len
+        b_start_loc = torch.cumsum(b_q_seq_len, dim=0, dtype=torch.long) - b_q_seq_len
+        b_start_loc = b_start_loc.cpu().numpy()  # host-side numpy copies for the Python loop below
+        b_q_seq_len = b_q_seq_len.cpu().numpy()
+
+        for req_obj, start_loc, q_seq_len in zip(run_reqs, b_start_loc, b_q_seq_len):
+            req_obj: InferReq = req_obj  # re-binding only attaches a type annotation
+            cur_ids: torch.Tensor = input_ids[start_loc : start_loc + q_seq_len]
+            cur_logits = prompt_all_logits[start_loc : start_loc + q_seq_len]
+            cur_logprobs = torch.log_softmax(cur_logits, dim=-1, dtype=torch.float)[0:-1, :]  # last row dropped
+            cur_logprobs = torch.gather(cur_logprobs, dim=1, index=cur_ids[1:].view(-1, 1)).detach().cpu().numpy()  # row i paired with token i + 1
+            # Only slots 1..input_len-1 are written; slot 0 is not touched here.
+            if req_obj.shm_req.input_len > 1:
+                req_obj.shm_req.shm_logprobs.arr[1 : req_obj.shm_req.input_len] = cur_logprobs.flatten()
+        # Optional hook, invoked with the requests and the selected logits before sampling.
+        if self.prefill_mask_func is not None:
+            self.prefill_mask_func(run_reqs, logits)
+
+        next_token_ids, next_token_probs = sample(logits, run_reqs, self.eos_id)
+        next_token_ids = next_token_ids.detach().cpu().numpy()
+        next_token_logprobs = torch.log(next_token_probs).detach().cpu().numpy()  # probabilities -> log-probabilities
+
+        update_packs = self._pre_post_handle(run_reqs, is_chuncked_mode=not self.disable_chunked_prefill)
+        self._post_handle(
+            run_reqs=run_reqs,
+            next_token_ids=next_token_ids,
+            next_token_logprobs=next_token_logprobs,
+            run_reqs_update_packs=update_packs,
+            extra_post_req_handle_func=self.extra_post_req_handle_func,
+        )
+        return
@@ -1,35 +1,24 @@
 import torch
 from typing import List, Tuple
-from .impl import ContinuesBatchBackend
-from lightllm.server.router.model_infer.infer_batch import InferReq, InferSamplingParams, g_infer_context
+from .impl import ChunkedPrefillBackend
+from lightllm.server.router.model_infer.infer_batch import InferReq
 from lightllm.server.router.model_infer.mode_backend.pre import prepare_prefill_inputs
-from lightllm.server.core.objs import FinishStatus
+from lightllm.server.router.model_infer.mode_backend.overlap_events import OverlapEventPack
 
-
-class RewardModelBackend(ContinuesBatchBackend):
+class RewardModelBackend(ChunkedPrefillBackend):
     def __init__(self) -> None:
         super().__init__()
 
-    def decode(self):
-        uninit_reqs, aborted_reqs, ok_finished_reqs, prefill_reqs, decode_reqs = self._get_classed_reqs(
-            g_infer_context.infer_req_ids
-        )
-
-        if aborted_reqs:
-            g_infer_context.filter_reqs(aborted_reqs)
-
-        if prefill_reqs:
-            self._prefill_reqs(req_objs=prefill_reqs)
-
-        if decode_reqs:
-            self.normal_decode(decode_reqs=decode_reqs, uninit_reqs=uninit_reqs, ok_finished_reqs=ok_finished_reqs)
-
-        self._overlap_req_init_and_filter(uninit_reqs=uninit_reqs, ok_finished_reqs=ok_finished_reqs, clear_list=True)
+        self.prefill = self.reward_prefill
         return
 
-    def _prefill_reqs(self, req_objs: List[InferReq]):
+    def reward_prefill(self,
+                       event_pack: OverlapEventPack,
+                       prefill_reqs: List[InferReq]):
+        
+        assert self.disable_chunked_prefill is True
         model_input, run_reqs = prepare_prefill_inputs(
-            req_objs, is_chuncked_mode=False, is_multimodal=self.is_multimodal
+            prefill_reqs, is_chuncked_mode=not self.disable_chunked_prefill, is_multimodal=self.is_multimodal
         )
 
         model_output = self.model.forward(model_input)
@@ -39,20 +28,14 @@ def _prefill_reqs(self, req_objs: List[InferReq]):
         next_token_id = 1
         next_token_logprob = 1.0
 
-        finished_req_ids = []
-
         for req_obj, score in zip(run_reqs, scores):
             # prefill and decode is same
             req_obj: InferReq = req_obj
             req_obj.cur_kv_len = req_obj.get_cur_total_len()
-
-            req_obj.set_next_gen_token_id(next_token_id, next_token_logprob)
+            
             req_obj.cur_output_len += 1
-
-            req_obj.update_finish_status(self.eos_id)
-
-            if req_obj.finish_status.is_finished() or req_obj.shm_req.router_aborted:
-                finished_req_ids.append(req_obj.shm_req.request_id)
+            req_obj.set_next_gen_token_id(next_token_id, next_token_logprob, output_len=req_obj.cur_output_len)
+            req_obj.update_finish_status(self.eos_id, output_len=req_obj.cur_output_len)
 
             if self.is_master_in_dp:
                 # 写入 reward_score
@@ -69,6 +52,4 @@ def _prefill_reqs(self, req_objs: List[InferReq]):
                     req_obj.shm_req.finish_status = req_obj.finish_status
 
                 req_obj.shm_req.candetoken_out_len = req_obj.cur_output_len
-
-        g_infer_context.filter(finished_req_ids)
         return