
Commit f579157

fix
1 parent 19fd574 commit f579157

3 files changed: 6 additions & 8 deletions

3 files changed

+6
-8
lines changed

lightllm/common/basemodel/basemodel.py

Lines changed: 2 additions & 1 deletion
@@ -842,7 +842,8 @@ def _init_padded_req(self):
         )
         b_seq_len = torch.ones(batch_size, dtype=torch.int32, device="cuda")
         b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
-        b_prefill_start_loc = F.pad(torch.cumsum(b_seq_len, dim=0), (1, 0), value=0)[:-1]
+        b_q_seq_len = b_seq_len - b_ready_cache_len
+        b_prefill_start_loc = b_q_seq_len.cumsum(dim=0, dtype=torch.int32) - b_q_seq_len
         total_token_num = prefill_input_len * batch_size
         b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
         model_input = ModelInput(
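Both expressions compute an exclusive prefix sum, giving each request's start offset in the packed token buffer; the new form drops the pad-and-slice and stays in int32. Since b_ready_cache_len is all zeros in _init_padded_req, b_q_seq_len equals b_seq_len here and the values are unchanged in this file; the rewrite aligns this fake-batch path with the real prefill paths below. A minimal standalone sketch (not part of the commit) checking the identity:

import torch
import torch.nn.functional as F

# Standalone check: the removed pad-and-slice and the added cumsum-minus form
# both yield the exclusive prefix sum of the per-request lengths.
x = torch.tensor([3, 1, 4, 2], dtype=torch.int32)

old_style = F.pad(torch.cumsum(x, dim=0), (1, 0), value=0)[:-1]  # removed form
new_style = x.cumsum(dim=0, dtype=torch.int32) - x               # added form

assert torch.equal(old_style, new_style)
print(new_style.tolist())  # [0, 3, 4, 8]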

lightllm/server/router/model_infer/mode_backend/generic_padded_pre_process.py

Lines changed: 2 additions & 4 deletions
@@ -36,7 +36,6 @@ def padded_prepare_prefill_inputs(
     b_ready_cache_len = []
     b_mtp_index = []
     b_prefill_has_output = []
-    b_prefill_start_loc = [0]

     for req in req_objs:

@@ -57,7 +56,6 @@ def padded_prepare_prefill_inputs(
         prefix_total_token_num += req.cur_kv_len
         b_ready_cache_len.append(req.cur_kv_len)
         b_mtp_index.append(0)
-        b_prefill_start_loc.append(b_prefill_start_loc[-1] + input_token_len)

     # padding fake req for prefill
     for _ in range(padded_req_num):
@@ -68,7 +66,6 @@ def padded_prepare_prefill_inputs(
         b_mtp_index.append(0)
         b_prefill_has_output.append(False)
         b_ready_cache_len.append(0)
-        b_prefill_start_loc.append(b_prefill_start_loc[-1] + 1)
         total_token_num += 1
         prefix_total_token_num += 0

@@ -83,7 +80,8 @@ def padded_prepare_prefill_inputs(
     b_seq_len = torch.tensor(b_seq_len, dtype=torch.int32, device="cpu")
     b_mtp_index = torch.tensor(b_mtp_index, dtype=torch.int32, device="cpu")
     b_ready_cache_len = torch.tensor(b_ready_cache_len, dtype=torch.int32, device="cpu")
-    b_prefill_start_loc = torch.tensor(b_prefill_start_loc, dtype=torch.int32, device="cpu")
+    b_q_seq_len = torch.tensor(b_q_seq_len, dtype=torch.int32, device="cpu")
+    b_prefill_start_loc = b_q_seq_len.cumsum(dim=0, dtype=torch.int32) - b_q_seq_len

     # prepare tokens for the dynamic prompt cache
     g_infer_state_lock.acquire()
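Note what the vectorized form also cleans up: the removed list was seeded with [0] and appended once per request (including one +1 per padded fake request), so the resulting tensor held batch_size + 1 entries ending in a running total, while the new tensor has exactly one start offset per request. A sketch of the padded bookkeeping, assuming (it is not shown in this hunk) that b_q_seq_len collects input_token_len per real request and 1 per fake request:

import torch

# Hypothetical numbers; the diff does not show where b_q_seq_len is filled,
# so "1 query token per fake request" is an assumption here.
real_q_lens = [5, 3]       # input_token_len of the real requests
padded_req_num = 2         # fake requests padding the batch

b_q_seq_len = torch.tensor(real_q_lens + [1] * padded_req_num, dtype=torch.int32)
b_prefill_start_loc = b_q_seq_len.cumsum(dim=0, dtype=torch.int32) - b_q_seq_len

# Removed bookkeeping, reconstructed from the deleted lines:
old = [0]
for n in real_q_lens + [1] * padded_req_num:
    old.append(old[-1] + n)

# The list carried a trailing cumulative total; the tensor form is its
# exclusive-prefix-sum part.
assert b_prefill_start_loc.tolist() == old[:-1]  # [0, 5, 8, 9]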

lightllm/server/router/model_infer/mode_backend/generic_pre_process.py

Lines changed: 2 additions & 3 deletions
@@ -20,7 +20,6 @@ def prepare_prefill_inputs(
     b_ready_cache_len = []
     b_mtp_index = []
     b_prefill_has_output = []
-    b_prefill_start_loc = [0]

     for req in req_objs:
         run_reqs.append(req)
@@ -41,7 +40,6 @@ def prepare_prefill_inputs(

         b_seq_len.append(seq_len)
         b_q_seq_len.append(input_token_len)
-        b_prefill_start_loc.append(b_prefill_start_loc[-1] + input_token_len)
         input_ids.append(input_id)
         total_token_num += seq_len
         prefix_total_token_num += req.cur_kv_len
@@ -59,7 +57,8 @@ def prepare_prefill_inputs(
     b_seq_len = torch.tensor(b_seq_len, dtype=torch.int32, device="cpu")
     b_mtp_index = torch.tensor(b_mtp_index, dtype=torch.int32, device="cpu")
     b_ready_cache_len = torch.tensor(b_ready_cache_len, dtype=torch.int32, device="cpu")
-    b_prefill_start_loc = torch.tensor(b_prefill_start_loc, dtype=torch.int32, device="cpu")
+    b_q_seq_len = torch.tensor(b_q_seq_len, dtype=torch.int32, device="cpu")
+    b_prefill_start_loc = b_q_seq_len.cumsum(dim=0, dtype=torch.int32) - b_q_seq_len

     # prepare tokens for the dynamic prompt cache
     g_infer_state_lock.acquire()
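The underlying reason start offsets are derived from b_q_seq_len rather than b_seq_len: on a prompt-cache hit only the uncached query tokens (b_q_seq_len = b_seq_len - b_ready_cache_len, per the basemodel.py hunk above) are packed into the prefill buffer, so offsets computed in full-sequence space would point past the packed data. An illustrative sketch with invented lengths (not from the commit):

import torch

# Invented example: request 0 has 4 of its 6 tokens already in the prompt
# cache, so only 2 query tokens enter the packed prefill buffer.
b_seq_len = torch.tensor([6, 3], dtype=torch.int32)
b_ready_cache_len = torch.tensor([4, 0], dtype=torch.int32)
b_q_seq_len = b_seq_len - b_ready_cache_len  # [2, 3] query tokens

start_loc = b_q_seq_len.cumsum(dim=0, dtype=torch.int32) - b_q_seq_len
print(start_loc.tolist())  # [0, 2]: request 1 starts at offset 2 in the buffer

# A seq_len-based cumsum would put request 1 at offset 6, past the 5 packed tokens.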
