@@ -352,16 +352,16 @@ def _prefill(
         model_input: ModelInput,
     ):
         infer_state = self._create_inferstate(model_input)
-        infer_state.init_some_extra_state(self, model_input.input_ids)
         init_req_to_token_indexes(
             req_to_token_indexs=self.req_manager.req_to_token_indexs,
             b_req_idx=infer_state.b_req_idx,
             b_seq_len=infer_state.b_seq_len,
             b_ready_cache_len=infer_state.b_ready_cache_len,
-            b_start_loc=infer_state.b_start_loc,
+            b_start_loc=model_input.b_prefill_start_loc,
             alloc_mem_index=infer_state.mem_index,
             max_q_seq_len=infer_state.max_q_seq_len,
         )
+        infer_state.init_some_extra_state(self, model_input.input_ids)
         return self._context_forward(model_input.input_ids, infer_state)
 
     def _decode(
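A note on this hunk (an inference from the diff, not stated in it): `init_some_extra_state` now runs after `init_req_to_token_indexes`, and the kernel takes its per-request start offsets from the host-built `model_input.b_prefill_start_loc` instead of `infer_state.b_start_loc`. Judging from the `_init_padded_req` hunk further down, `b_prefill_start_loc` holds the exclusive prefix sum of each request's new-token count. A minimal sketch of that relationship (the helper name is hypothetical):

```python
import torch


def build_b_prefill_start_loc(b_seq_len: torch.Tensor, b_ready_cache_len: torch.Tensor) -> torch.Tensor:
    # b_seq_len[i]: total tokens request i will have after this prefill.
    # b_ready_cache_len[i]: tokens of request i already in the KV cache.
    # Their difference is the number of new tokens each request feeds in,
    # and the exclusive prefix sum of those counts is each request's start
    # offset in the flattened input_ids buffer.
    b_q_seq_len = b_seq_len - b_ready_cache_len
    return b_q_seq_len.cumsum(dim=0, dtype=torch.int32) - b_q_seq_len
```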
@@ -491,28 +491,28 @@ def microbatch_overlap_prefill(self, model_input0: ModelInput, model_input1: Mod
         input_ids0, input_ids1 = model_input0.input_ids, model_input1.input_ids
 
         infer_state0 = self._create_inferstate(model_input0, 0)
-        infer_state0.init_some_extra_state(self, input_ids0)
         init_req_to_token_indexes(
             req_to_token_indexs=self.req_manager.req_to_token_indexs,
             b_req_idx=infer_state0.b_req_idx,
             b_seq_len=infer_state0.b_seq_len,
             b_ready_cache_len=infer_state0.b_ready_cache_len,
-            b_start_loc=infer_state0.b_start_loc,
+            b_start_loc=model_input0.b_prefill_start_loc,
             alloc_mem_index=infer_state0.mem_index,
             max_q_seq_len=infer_state0.max_q_seq_len,
         )
+        infer_state0.init_some_extra_state(self, input_ids0)
 
         infer_state1 = self._create_inferstate(model_input1, 1)
-        infer_state1.init_some_extra_state(self, input_ids1)
         init_req_to_token_indexes(
             req_to_token_indexs=self.req_manager.req_to_token_indexs,
             b_req_idx=infer_state1.b_req_idx,
             b_seq_len=infer_state1.b_seq_len,
             b_ready_cache_len=infer_state1.b_ready_cache_len,
-            b_start_loc=infer_state1.b_start_loc,
+            b_start_loc=model_input1.b_prefill_start_loc,
             alloc_mem_index=infer_state1.mem_index,
             max_q_seq_len=infer_state1.max_q_seq_len,
         )
+        infer_state1.init_some_extra_state(self, input_ids1)
 
         model_output0, model_output1 = self._overlap_tpsp_context_forward(
             input_ids0, infer_state0, input_ids1=input_ids1, infer_state1=infer_state1
@@ -713,6 +713,7 @@ def _check_max_len_infer(self):
             b_seq_len = torch.ones(1, dtype=torch.int32, device="cuda")
             b_seq_len[:] = self.batch_max_tokens
             b_ready_cache_len = torch.zeros(1, dtype=torch.int32, device="cuda")
+            b_prefill_start_loc = torch.zeros(1, dtype=torch.int32, device="cuda")
             total_token_num = self.batch_max_tokens
             b_mtp_index = torch.zeros(1, dtype=torch.int32, device="cuda")
             model_input = ModelInput(
@@ -730,6 +731,7 @@ def _check_max_len_infer(self):
                 b_mtp_index=b_mtp_index,
                 is_prefill=True,
                 b_ready_cache_len=b_ready_cache_len,
+                b_prefill_start_loc=b_prefill_start_loc,
             )
             model_output = self.forward(
                 model_input,
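For this single-request dummy batch the exclusive prefix sum degenerates to `[0]`, so `torch.zeros(1, ...)` is exactly what the general formula would produce. A CPU sketch of that check (the `8192` stand-in for `batch_max_tokens` is made up):

```python
import torch

# One dummy request covering the whole budget, nothing cached yet.
b_seq_len = torch.tensor([8192], dtype=torch.int32)  # stand-in for batch_max_tokens
b_ready_cache_len = torch.zeros(1, dtype=torch.int32)

b_q_seq_len = b_seq_len - b_ready_cache_len
b_prefill_start_loc = b_q_seq_len.cumsum(dim=0, dtype=torch.int32) - b_q_seq_len

# The single request starts at offset 0 of the flattened token buffer.
assert torch.equal(b_prefill_start_loc, torch.zeros(1, dtype=torch.int32))
```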
@@ -787,6 +789,7 @@ def _autotune_warmup(self):
             b_seq_len = torch.ones(1, dtype=torch.int32, device="cuda")
             b_seq_len[:] = input_len
             b_ready_cache_len = torch.zeros(1, dtype=torch.int32, device="cuda")
+            b_prefill_start_loc = torch.zeros(1, dtype=torch.int32, device="cuda")
             total_token_num = input_len
             b_mtp_index = torch.zeros(1, dtype=torch.int32, device="cuda")
             model_input = ModelInput(
@@ -804,6 +807,7 @@ def _autotune_warmup(self):
                 b_mtp_index=b_mtp_index,
                 is_prefill=True,
                 b_ready_cache_len=b_ready_cache_len,
+                b_prefill_start_loc=b_prefill_start_loc,
                 multimodal_params=[],
                 **self._gen_special_model_input(total_token_num),
             )
@@ -847,6 +851,8 @@ def _init_padded_req(self):
         )
         b_seq_len = torch.ones(batch_size, dtype=torch.int32, device="cuda")
         b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
+        b_q_seq_len = b_seq_len - b_ready_cache_len
+        b_prefill_start_loc = b_q_seq_len.cumsum(dim=0, dtype=torch.int32) - b_q_seq_len
         total_token_num = prefill_input_len * batch_size
         b_mtp_index = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
         model_input = ModelInput(
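The `cumsum(...) - b_q_seq_len` idiom added here computes an exclusive prefix sum in one shot: subtracting each element from the inclusive running total shifts every sum back by one slot. A worked example with a hypothetical three-request batch:

```python
import torch

# Hypothetical batch: three requests contributing 3, 5, and 2 new tokens.
b_q_seq_len = torch.tensor([3, 5, 2], dtype=torch.int32)

inclusive = b_q_seq_len.cumsum(dim=0, dtype=torch.int32)  # tensor([ 3,  8, 10])
exclusive = inclusive - b_q_seq_len                       # tensor([0, 3, 8])

# Request 0 occupies flattened slots 0..2, request 1 slots 3..7,
# request 2 slots 8..9.
assert exclusive.tolist() == [0, 3, 8]
```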
@@ -863,6 +869,7 @@ def _init_padded_req(self):
             b_mtp_index=b_mtp_index,
             b_seq_len=b_seq_len,
             b_ready_cache_len=b_ready_cache_len,
+            b_prefill_start_loc=b_prefill_start_loc,
             is_prefill=True,
             multimodal_params=[],
             **self._gen_special_model_input(total_token_num),