 import torch
 import numpy as np
 import torch.distributed as dist
-from lightllm.models.qwen2_vl.infer_struct import Qwen2VLInferStateInfo
+from lightllm.common.basemodel.infer_struct import InferStateInfo
+from lightllm.models.llama.flashattention_infer_struct import FlashAttentionStateInfo
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.models.deepseek2.triton_kernel.repack_kv_index import repack_kv_index
 from lightllm.common.basemodel.batch_objs import ModelInput


-class Qwen2VLFlashAttentionStateInfo(Qwen2VLInferStateInfo):
-    _shared_page_table_buffer = None
-
-    def __init__(self):
-        super().__init__()
-
-    @classmethod
-    def get_page_table_buffer(cls, graph_max_batch_size: int, max_seq_len: int):
-        if cls._shared_page_table_buffer is None:
-            cls._shared_page_table_buffer = [
-                torch.empty(graph_max_batch_size * max_seq_len, dtype=torch.int32).to(get_current_device_id()),
-                torch.empty(graph_max_batch_size * max_seq_len, dtype=torch.int32).to(get_current_device_id()),
-            ]
-        return cls._shared_page_table_buffer
-
+class Qwen2VLFlashAttentionStateInfo(FlashAttentionStateInfo):
     def init_some_extra_state(self, model, input_ids: torch.Tensor):
-        super().init_some_extra_state(model, input_ids)
+        InferStateInfo.init_some_extra_state(self, model, input_ids)
         if self.is_prefill:
-            self.cu_seqlens_q = self.b1_cu_q_seq_len.int()
-            self.cu_seqlens_k = self.b1_cu_kv_seq_len.int()
-            self.page_table = torch.empty(
-                (self.batch_size, self.max_seq_len), dtype=torch.int32, device=input_ids.device
-            )
-            self.page_table.copy_(model.req_manager.req_to_token_indexs[self.b_req_idx, : self.max_seq_len])
+            self.max_seq_len = self.max_kv_seq_len
+            self.q_max_seq_len = self.max_q_seq_len
+            position_ids = self.position_ids
+            self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
+            self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
+            position_ids = None
         else:
-            # Meta information of flash attention for decoding
-            self.cu_seqlens_q = self.b1_cu_q_seq_len.int()
-            self.cu_seqlens_k = self.b1_cu_kv_seq_len.int()
-            max_seq_len_k = self.max_kv_seq_len
-            if self.batch_size <= model.graph_max_batch_size and self.max_len_in_batch <= model.graph_max_len_in_batch:
-                page_buffer = Qwen2VLFlashAttentionStateInfo.get_page_table_buffer(
-                    model.graph_max_batch_size, model.graph_max_len_in_batch
-                )
-                self.page_table = page_buffer[self.microbatch_index][
-                    : self.batch_size * model.graph_max_len_in_batch
-                ].reshape(self.batch_size, model.graph_max_len_in_batch)
-            else:
-                self.page_table = torch.empty(
-                    (self.batch_size, self.max_len_in_batch), dtype=torch.int32, device=input_ids.device
-                )
-
-            self.page_table[:, :max_seq_len_k].copy_(
-                model.req_manager.req_to_token_indexs[self.b_req_idx, :max_seq_len_k],
-                non_blocking=True,
-            )
-            self.page_table[:, max_seq_len_k:].fill_(0)
-
-        if "offline_calibration_fp8kv" in model.mode:
-            if self.is_prefill:
-                device = input_ids.device
-                # q_scale and token_batch_ids are used for per-head quantization of q; created outside the kernels to save resources
-                self.q_scale = torch.empty(
-                    (self.batch_size, self.mem_manager.head_num), dtype=torch.float32, device=device
-                )
-                self.token_batch_ids = torch.repeat_interleave(
-                    torch.arange(self.batch_size, device=device), self.b_q_seq_len
-                )
+            position_ids = self.position_ids
+            self.position_sin = model._sin_cached[:, position_ids, :].unsqueeze(1)
+            self.position_cos = model._cos_cached[:, position_ids, :].unsqueeze(1)
 
-            offline_scales = self.mem_manager.scales
-            head_num = self.mem_manager.head_num
-            # to reduce inference-time computation, k_descale and v_descale are initialized outside the kernels
-            self.k_descale = (
-                offline_scales[:, :head_num]
-                .view(-1, 1, head_num)
-                .expand(offline_scales.shape[0], self.batch_size, head_num)
-                if offline_scales is not None
-                else torch.ones(
-                    (self.mem_manager.layer_num, self.batch_size, head_num),
-                    dtype=torch.float32,
-                    device=input_ids.device,
-                )
-            )
-            self.v_descale = (
-                offline_scales[:, head_num:]
-                .view(-1, 1, head_num)
-                .expand(offline_scales.shape[0], self.batch_size, head_num)
-                if offline_scales is not None
-                else torch.ones(
-                    (self.mem_manager.layer_num, self.batch_size, head_num),
-                    dtype=torch.float32,
-                    device=input_ids.device,
-                )
-            )
+        # init flash attention state
+        self._init_flash_attention_state(model, input_ids)
         return