Commit 149c72d

fix overlap
Parent: c02732f

4 files changed: +12 −13 lines

lightllm/models/deepseek2/flashattention_infer_struct.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -53,5 +53,4 @@ def init_some_extra_state(self, model, model_input: ModelInput):
         self.page_table[:, :max_seq_len_k].copy_(
             model.req_manager.req_to_token_indexs[self.b_req_idx, :max_seq_len_k]
         )
-        self.page_table[:, max_seq_len_k:].fill_(0)
         return
```
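For orientation, here is a minimal sketch of the page-table pattern this hunk touches, with made-up shapes (the real tensors come from lightllm's request manager): the valid prefix of each row is copied from the request-to-token index map, and after this commit the tail past `max_seq_len_k` is left untouched instead of being zero-filled on every step.

```python
import torch

# Hypothetical sizes; lightllm derives these from the live batch.
batch_size, max_pages, max_seq_len_k = 4, 16, 10

# Maps (request slot, position) -> KV-cache token index.
req_to_token_indexs = torch.arange(8 * max_pages).reshape(8, max_pages)
b_req_idx = torch.tensor([0, 2, 5, 7])  # request slots in this batch

page_table = torch.empty((batch_size, max_pages), dtype=torch.int64)
# Kept by the commit: copy the valid prefix for every request.
page_table[:, :max_seq_len_k].copy_(
    req_to_token_indexs[b_req_idx, :max_seq_len_k]
)
# Removed by the commit: zero-filling the tail on every step.
#   page_table[:, max_seq_len_k:].fill_(0)
# Entries past max_seq_len_k are now left as-is, which is only safe if
# the attention kernel never reads beyond each sequence's length.
```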

lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -265,7 +265,7 @@ def decode_mtp(
         decode_reqs: List[InferReq],
     ):
         model_input, run_reqs = prepare_decode_inputs(decode_reqs)
-        b_mtp_index_cpu = model_input.b_mtp_index
+        b_mtp_index_cpu = model_input.b_mtp_index_cpu
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
             model_output = self.model.forward(model_input)
             all_next_token_ids = []
```
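The rename does real work: `b_mtp_index_cpu` is now read from an explicit host-side field rather than a device tensor that happened to share the name. A minimal sketch of why that matters around an overlap stream, using hypothetical tensors rather than lightllm's actual `ModelInput`:

```python
import torch

# Hypothetical pair of fields standing in for ModelInput's b_mtp_index*.
b_mtp_index_cpu = torch.tensor([0, 1, 0, 2])  # host copy
b_mtp_index = (
    b_mtp_index_cpu.cuda() if torch.cuda.is_available() else b_mtp_index_cpu
)  # device copy, consumed by the kernels

# Reading the host copy costs no device->host transfer, so the Python
# side can snapshot it without synchronizing against GPU work that is
# about to be queued on a separate (overlap) stream.
snapshot = b_mtp_index_cpu.tolist()

if torch.cuda.is_available():
    overlap_stream = torch.cuda.Stream()
    with torch.cuda.stream(overlap_stream):
        _ = b_mtp_index + 1  # device work proceeds independently
```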

lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -449,7 +449,7 @@ def prefill_mtp(self, event_pack: OverlapEventPack, prefill_reqs: List[InferReq]

     def decode_mtp(self, event_pack: OverlapEventPack, decode_reqs: List[InferReq]):
         model_input, run_reqs, padded_req_num = padded_prepare_decode_inputs(decode_reqs)
-        b_mtp_index_cpu = model_input.b_mtp_index
+        b_mtp_index_cpu = model_input.b_mtp_index_cpu
         req_num = len(run_reqs)

         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
@@ -680,8 +680,8 @@ def decode_overlap_mtp(self, event_pack: OverlapEventPack, decode_reqs: List[Inf
         ) = padded_overlap_prepare_decode_inputs(decode_reqs)
         req_num0, req_num1 = len(run_reqs0), len(run_reqs1)
         all_next_token_ids = []
-        b_mtp_index_cpu0 = micro_input0.b_mtp_index
-        b_mtp_index_cpu1 = micro_input1.b_mtp_index
+        b_mtp_index_cpu0 = micro_input0.b_mtp_index_cpu
+        b_mtp_index_cpu1 = micro_input1.b_mtp_index_cpu
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):

             model_output0, model_output1 = self.model.microbatch_overlap_decode(micro_input0, micro_input1)
```

lightllm/server/router/model_infer/mode_backend/generic_padded_pre_process.py

Lines changed: 8 additions & 8 deletions

```diff
@@ -93,12 +93,12 @@ def padded_prepare_prefill_inputs(
         batch_size=b_seq_len.shape[0],
         total_token_num=total_token_num,
         max_len_in_batch=max_len_in_batch,
-        input_ids=input_ids,
+        input_ids_cpu=input_ids,
         mem_indexes_cpu=mem_indexes,
-        b_req_idx=b_req_idx,
-        b_mtp_index=b_mtp_index,
-        b_seq_len=b_seq_len,
-        b_ready_cache_len=b_ready_cache_len,
+        b_req_idx_cpu=b_req_idx,
+        b_mtp_index_cpu=b_mtp_index,
+        b_seq_len_cpu=b_seq_len,
+        b_ready_cache_len_cpu=b_ready_cache_len,
         is_prefill=True,
         b_prefill_has_output_cpu=b_prefill_has_output,
     )
@@ -180,9 +180,9 @@ def padded_prepare_decode_inputs(
         max_len_in_batch=max_len_in_batch,
         input_ids=None,
         mem_indexes_cpu=mem_indexes,
-        b_req_idx=b_req_idx,
-        b_mtp_index=b_mtp_index,
-        b_seq_len=b_seq_len,
+        b_req_idx_cpu=b_req_idx,
+        b_mtp_index_cpu=b_mtp_index,
+        b_seq_len_cpu=b_seq_len,
         is_prefill=False,
     )
     return model_input, run_reqs, padded_req_num
```
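Taken together, the kwarg renames suggest that `ModelInput`'s host-side tensor fields now carry a uniform `_cpu` suffix, while the decode call site still passes `input_ids=None`, implying a device-side `input_ids` field also exists. A hypothetical sketch of the field layout, inferred from the call sites above rather than taken from lightllm's actual class:

```python
from dataclasses import dataclass
from typing import Optional
import torch

@dataclass
class ModelInput:  # inferred layout; not lightllm's real definition
    batch_size: int
    total_token_num: int
    max_len_in_batch: int
    is_prefill: bool
    # Device-side field, still accepted at the decode call site above.
    input_ids: Optional[torch.Tensor] = None
    # Host-side tensors, now named with an explicit _cpu suffix.
    input_ids_cpu: Optional[torch.Tensor] = None
    mem_indexes_cpu: Optional[torch.Tensor] = None
    b_req_idx_cpu: Optional[torch.Tensor] = None
    b_mtp_index_cpu: Optional[torch.Tensor] = None
    b_seq_len_cpu: Optional[torch.Tensor] = None
    b_ready_cache_len_cpu: Optional[torch.Tensor] = None
    b_prefill_has_output_cpu: Optional[torch.Tensor] = None
```

On this reading, "fix overlap" means binding the host tensors to the right fields at construction time, so that later reads such as `model_input.b_mtp_index_cpu` stay off the device while the overlap stream runs.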
