@@ -102,8 +102,15 @@ def prefill_normal(
             prefill_reqs, is_chuncked_mode=not self.disable_chunked_prefill, is_multimodal=self.is_multimodal
         )
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
-            _, next_token_ids_cpu, next_token_logprobs_cpu, _ = self._main_model_forward(
-                model_input, run_reqs, self.prefill_mask_func
+            model_output = self.model.forward(model_input)
+            next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu = self._sample_and_scatter_token(
+                logits=model_output.logits,
+                b_req_idx=model_input.b_req_idx,
+                b_mtp_index=model_input.b_mtp_index,
+                run_reqs=run_reqs,
+                is_prefill=True,
+                b_prefill_has_output_cpu=model_input.b_prefill_has_output_cpu,
+                mask_func=self.prefill_mask_func,
             )
             sync_event = torch.cuda.Event()
             sync_event.record()
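Note: both the forward pass and the sampling/scatter work above run under torch.cuda.stream(g_infer_context.get_overlap_stream()), and the recorded sync_event is what lets the consumer stream wait for those results. A minimal standalone sketch of that pattern, with hypothetical names (run_on_side_stream is not part of lightllm):

# Sketch of the overlap-stream pattern, assumed names only: run work on a
# side stream and record an event the consumer stream can wait on before
# touching the outputs.
import torch

def run_on_side_stream(fn, *args):
    side_stream = torch.cuda.Stream()  # stands in for g_infer_context.get_overlap_stream()
    with torch.cuda.stream(side_stream):
        out = fn(*args)                # e.g. model forward + sampling
        sync_event = torch.cuda.Event()
        sync_event.record()            # recorded on the side stream
    return out, sync_event

# consumer side, e.g. on the default stream:
#   torch.cuda.current_stream().wait_event(sync_event)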
@@ -133,8 +140,14 @@ def decode_normal(
     ):
         model_input, run_reqs = prepare_decode_inputs(decode_reqs)
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
-            _, next_token_ids_cpu, next_token_logprobs_cpu, _ = self._main_model_forward(
-                model_input, run_reqs, self.decode_mask_func
+            model_output = self.model.forward(model_input)
+            next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu = self._sample_and_scatter_token(
+                logits=model_output.logits,
+                b_req_idx=model_input.b_req_idx,
+                b_mtp_index=model_input.b_mtp_index,
+                run_reqs=run_reqs,
+                is_prefill=False,
+                mask_func=self.decode_mask_func,
             )
             sync_event = torch.cuda.Event()
             sync_event.record()
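Note: the decode path hands back next_token_ids_cpu and next_token_logprobs_cpu, i.e. host copies staged while the overlap stream is still running. A minimal sketch of the non-blocking device-to-host copy into pinned buffers that presumably backs _async_copy_next_token_infos_to_pin_mem (hypothetical helper name async_copy_to_pinned; not the lightllm implementation):

# Sketch only, assumed buffer handling: copy the sampled ids/logprobs into
# pinned host tensors with non_blocking=True so the transfer overlaps with
# other GPU work and is complete once the recorded sync event has fired.
import torch

def async_copy_to_pinned(next_token_ids: torch.Tensor, next_token_logprobs: torch.Tensor):
    ids_cpu = torch.empty(next_token_ids.shape, dtype=next_token_ids.dtype, pin_memory=True)
    logprobs_cpu = torch.empty(next_token_logprobs.shape, dtype=next_token_logprobs.dtype, pin_memory=True)
    ids_cpu.copy_(next_token_ids, non_blocking=True)
    logprobs_cpu.copy_(next_token_logprobs, non_blocking=True)
    return ids_cpu, logprobs_cpu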
@@ -167,8 +180,15 @@ def prefill_mtp(
             prefill_reqs, is_chuncked_mode=not self.disable_chunked_prefill, is_multimodal=self.is_multimodal
         )
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
-            next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu, model_output = self._main_model_forward(
-                model_input, run_reqs, self.prefill_mask_func
+            model_output = self.model.forward(model_input)
+            next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu = self._sample_and_scatter_token(
+                logits=model_output.logits,
+                b_req_idx=model_input.b_req_idx,
+                b_mtp_index=model_input.b_mtp_index,
+                run_reqs=run_reqs,
+                is_prefill=True,
+                b_prefill_has_output_cpu=model_input.b_prefill_has_output_cpu,
+                mask_func=self.prefill_mask_func,
             )
             # mtp kv fill
             self._draft_prefill_forward(model_input, model_output, self.prefill_mtp_step, next_token_ids)
@@ -201,7 +221,7 @@ def decode_mtp(
         decode_reqs: List[InferReq],
     ):
         if self.is_mtp_eagle:
-            draft_model_input, _, eagle_mem_indexes_cpu = prepare_eagle_decode_inputs(decode_reqs, self.mtp_step)
+            draft_model_input, eagle_mem_indexes_cpu = prepare_eagle_decode_inputs(decode_reqs, self.mtp_step)
             self._decode_mtp_common(
                 event_pack=event_pack,
                 decode_reqs=decode_reqs,
@@ -218,39 +238,6 @@ def decode_mtp(
             )
         return

-    def _main_model_forward(
-        self, model_input: ModelInput, run_reqs: List[InferReq], mask_func: Optional[Callable] = None
-    ):
-        model_output = self.model.forward(model_input)
-        logits = model_output.logits
-
-        if mask_func is not None:
-            mask_func(run_reqs, logits)
-
-        next_token_ids, next_token_logprobs = sample(logits, run_reqs, self.eos_id)
-        b_has_out = None
-        if model_input.is_prefill:
-            b_has_out = g_pin_mem_manager.gen_from_list(
-                key="b_has_out", data=model_input.b_prefill_has_output_cpu, dtype=torch.bool
-            ).cuda(non_blocking=True)
-
-        scatter_token(
-            next_token_ids=next_token_ids,
-            req_to_next_token_ids=self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids,
-            b_req_idx=model_input.b_req_idx,
-            b_mtp_index=model_input.b_mtp_index,
-            b_has_out=b_has_out,
-        )
-        g_infer_context.req_sampling_manager.update_reqs_out_token_counter_gpu(
-            b_req_idx=model_input.b_req_idx,
-            next_token_ids=next_token_ids,
-            mask=b_has_out,
-        )
-        next_token_ids_cpu, next_token_logprobs_cpu = self._async_copy_next_token_infos_to_pin_mem(
-            next_token_ids, next_token_logprobs
-        )
-        return next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu, model_output
-
     def _draft_prefill_forward(
         self, model_input: ModelInput, model_output: ModelOutput, mtp_step: int, next_token_ids: torch.Tensor
     ):
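Note: the _sample_and_scatter_token helper that the call sites above now use is not shown in this commit excerpt. Based on the removed _main_model_forward body, it presumably bundles the optional logits masking, sampling, token scatter, out-token counter update, and pinned-memory copy into one routine keyed off an explicit is_prefill flag. A sketch reconstructed from the removed code follows; the actual helper introduced by this commit may differ.

# Reconstruction sketch only; relies on the same module-level helpers
# (sample, scatter_token, g_pin_mem_manager, g_infer_context, torch) as the
# removed _main_model_forward.
def _sample_and_scatter_token(
    self,
    logits,
    b_req_idx,
    b_mtp_index,
    run_reqs,
    is_prefill,
    b_prefill_has_output_cpu=None,
    mask_func=None,
):
    if mask_func is not None:
        mask_func(run_reqs, logits)

    next_token_ids, next_token_logprobs = sample(logits, run_reqs, self.eos_id)

    # prefill batches can contain chunks that produce no output token
    b_has_out = None
    if is_prefill:
        b_has_out = g_pin_mem_manager.gen_from_list(
            key="b_has_out", data=b_prefill_has_output_cpu, dtype=torch.bool
        ).cuda(non_blocking=True)

    # write the sampled ids back into the per-request next-token table
    scatter_token(
        next_token_ids=next_token_ids,
        req_to_next_token_ids=self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids,
        b_req_idx=b_req_idx,
        b_mtp_index=b_mtp_index,
        b_has_out=b_has_out,
    )
    g_infer_context.req_sampling_manager.update_reqs_out_token_counter_gpu(
        b_req_idx=b_req_idx,
        next_token_ids=next_token_ids,
        mask=b_has_out,
    )
    next_token_ids_cpu, next_token_logprobs_cpu = self._async_copy_next_token_infos_to_pin_mem(
        next_token_ids, next_token_logprobs
    )
    return next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu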