 from lightllm.models.llama.triton_kernel.silu_and_mul import silu_and_mul_fwd
 
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
+from lightllm.models.llama.flashinfer_struct import LlamaFlashInferStateInfo
 from lightllm.common.basemodel.triton_kernel.destindex_copy_kv import destindex_copy_kv, destindex_copy_quantize_kv
 from lightllm.common.basemodel import TransformerLayerInferTpl
 from lightllm.models.llama.triton_kernel.ppl_quant_copy_kv import destindex_copy_dequantize_kv
@@ -68,8 +69,12 @@ def _bind_attention(self):
             )
             self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_normal, self)
             return
-
-        self._context_attention_kernel = partial(LlamaTransformerLayerInfer._context_attention_kernel, self)
+        elif get_env_start_args().enable_flashinfer_prefill:
+            self._context_attention_kernel = partial(
+                LlamaTransformerLayerInfer._context_attention_flashinfer_kernel, self
+            )
+        else:
+            self._context_attention_kernel = partial(LlamaTransformerLayerInfer._context_attention_kernel, self)
         if "ppl_int8kv" in self.mode:
             self._token_attention_kernel = partial(LlamaTransformerLayerInfer._token_decode_attention_ppl_int8kv, self)
             self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_ppl_int8kv, self)
@@ -119,7 +124,12 @@ def _bind_attention(self):
             )
             self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_normal, self)
         else:
-            self._token_attention_kernel = partial(LlamaTransformerLayerInfer._token_decode_attention_normal, self)
+            if get_env_start_args().enable_flashinfer_decode:
+                self._token_attention_kernel = partial(
+                    LlamaTransformerLayerInfer._token_decode_attention_flashinfer, self
+                )
+            else:
+                self._token_attention_kernel = partial(LlamaTransformerLayerInfer._token_decode_attention_normal, self)
             self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_normal, self)
 
         return
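Note on the binding pattern used throughout `_bind_attention`: `partial(LlamaTransformerLayerInfer.<kernel>, self)` turns an unbound method into a callable already bound to this layer instance, so the attention backend is selected once at startup (here via the `enable_flashinfer_prefill` / `enable_flashinfer_decode` start args) and the forward path never branches. A minimal, self-contained sketch of the same pattern; the class and flag below are illustrative, not lightllm's:

```python
from functools import partial


class _TinyLayer:
    """Toy layer showing how kernels are rebound via functools.partial."""

    def __init__(self, use_flashinfer: bool):
        # Pick the backend once at construction time, mirroring _bind_attention.
        if use_flashinfer:
            self._attn = partial(_TinyLayer._attn_flashinfer, self)
        else:
            self._attn = partial(_TinyLayer._attn_triton, self)

    def _attn_flashinfer(self, q):
        return f"flashinfer({q})"

    def _attn_triton(self, q):
        return f"triton({q})"

    def forward(self, q):
        # Callers never branch; they just call the bound kernel.
        return self._attn(q)


print(_TinyLayer(True).forward("q0"))   # flashinfer(q0)
print(_TinyLayer(False).forward("q0"))  # triton(q0)
```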
@@ -178,6 +188,28 @@ def _tpsp_get_qkv(
         )
         return q, cache_kv
 
+    def _context_attention_flashinfer_kernel(
+        self, q, kv, infer_state: LlamaFlashInferStateInfo, layer_weight, out=None
+    ) -> torch.Tensor:
+        o_tensor = self.alloc_tensor(q.shape, q.dtype) if out is None else out
+        if infer_state.use_dynamic_prompt_cache:
+            kv = infer_state.mem_manager.kv_buffer[self.layer_num_]
+            kv = kv.unsqueeze(1)
+            infer_state.prefill_wrapper.run(
+                q.view(q.shape[0], -1, self.head_dim_),
+                (kv[:, :, : self.tp_k_head_num_, :], kv[:, :, self.tp_k_head_num_ :, :]),
+                out=o_tensor.view(q.shape[0], -1, self.head_dim_),
+            )
+        else:
+            infer_state.prefill_wrapper.run(
+                q.view(q.shape[0], -1, self.head_dim_),
+                kv[:, : self.tp_k_head_num_, :],
+                kv[:, self.tp_k_head_num_ :, :],
+                out=o_tensor.view(q.shape[0], -1, self.head_dim_),
+            )
+
+        return o_tensor
+
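The new prefill kernel relies on the layout of `mem_manager.kv_buffer[self.layer_num_]`: as the slicing above implies, the K heads and (presumably) the V heads are packed along the head axis of a single tensor, and `unsqueeze(1)` exposes a page axis of size 1 so the buffer can be consumed as a paged cache at token granularity. A small sketch of that reshaping under those assumed shapes (the sizes here are made up for illustration):

```python
import torch

# Assumed layout, inferred from the slicing in the kernel above:
# kv_buffer[layer] packs K and V along the head axis.
total_tokens, k_heads, v_heads, head_dim = 10, 8, 8, 128
kv_layer = torch.randn(total_tokens, k_heads + v_heads, head_dim)

# unsqueeze(1) adds a page axis of size 1 -> [num_pages, page_size=1, heads, dim],
# i.e. every cached token becomes its own "page" for the paged prefill wrapper.
kv_paged = kv_layer.unsqueeze(1)
k_cache = kv_paged[:, :, :k_heads, :]   # [total_tokens, 1, k_heads, head_dim]
v_cache = kv_paged[:, :, k_heads:, :]   # [total_tokens, 1, v_heads, head_dim]

assert k_cache.shape == (total_tokens, 1, k_heads, head_dim)
assert v_cache.shape == (total_tokens, 1, v_heads, head_dim)
```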
     def _context_attention_kernel(
         self, q, kv, infer_state: LlamaInferStateInfo, layer_weight, out=None
     ) -> torch.Tensor:
@@ -254,7 +286,6 @@ def _context_attention_kernel_ppl_int8kv(
         return o_tensor
 
     def _context_attention_flashattention(self, q, kv, infer_state: LlamaInferStateInfo, layer_weight, out=None):
-
         cache_k = infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :].reshape(
             -1, 1, self.tp_k_head_num_, self.head_dim_
         )
@@ -264,7 +295,7 @@ def _context_attention_flashattention(self, q, kv, infer_state: LlamaInferStateI
         q = q.reshape(-1, self.tp_q_head_num_, self.head_dim_)
         k_descale, v_descale = None, None  # disable quantization
         Lq = q.shape[-1]
-        sm_scale = 1.0 / (Lq ** 0.5)
+        sm_scale = 1.0 / (Lq ** 0.5)
         o = flash_attn_with_kvcache(
             q=q,
             k_cache=cache_k,
@@ -392,6 +423,19 @@ def _copy_kv_to_mem_cache_ppl_int4kv(self, buffer, mem_index, mem_manager):
         )
         return
 
+    def _token_decode_attention_flashinfer(self, q, infer_state: LlamaFlashInferStateInfo, layer_weight, out=None):
+        batch_size = infer_state.batch_size
+        calcu_shape1 = (batch_size, self.tp_q_head_num_, self.head_dim_)
+
+        o_tensor = self.alloc_tensor(q.shape, q.dtype) if out is None else out
+        kv = infer_state.mem_manager.kv_buffer[self.layer_num_].unsqueeze(1)
+        infer_state.decode_wrapper.run(
+            q.view(calcu_shape1),
+            (kv[:, :, : self.tp_k_head_num_, :], kv[:, :, self.tp_k_head_num_ :, :]),
+            out=o_tensor.view(calcu_shape1),
+        )
+        return o_tensor
+
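The `decode_wrapper` (and the prefill wrappers above) are built and planned in `LlamaFlashInferStateInfo`, which this diff imports but does not show. For orientation, here is a hedged sketch of how flashinfer's public `BatchDecodeWithPagedKVCacheWrapper` is typically set up for the page_size=1 layout used above; the workspace size, paging metadata, and dtypes are illustrative assumptions, not the values lightllm actually passes:

```python
import torch
import flashinfer

num_qo_heads, num_kv_heads, head_dim, page_size = 32, 8, 128, 1
batch_size, total_tokens = 2, 10

# Illustrative setup; lightllm's real wrapper construction lives in LlamaFlashInferStateInfo.
workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(workspace, "NHD")

# With page_size=1 every cached token is one page: request 0 owns tokens 0..5,
# request 1 owns tokens 6..9 (made-up paging metadata).
kv_indptr = torch.tensor([0, 6, 10], dtype=torch.int32, device="cuda")
kv_indices = torch.arange(total_tokens, dtype=torch.int32, device="cuda")
kv_last_page_len = torch.ones(batch_size, dtype=torch.int32, device="cuda")

wrapper.plan(
    kv_indptr, kv_indices, kv_last_page_len,
    num_qo_heads, num_kv_heads, head_dim, page_size,
    q_data_type=torch.float16,
)

q = torch.randn(batch_size, num_qo_heads, head_dim, dtype=torch.float16, device="cuda")
k_cache = torch.randn(total_tokens, page_size, num_kv_heads, head_dim, dtype=torch.float16, device="cuda")
v_cache = torch.randn_like(k_cache)

# Same call shape as the kernel above: q is [B, H, D], (k_cache, v_cache) is the paged NHD cache.
out = wrapper.run(q, (k_cache, v_cache))
```

The plan()/run() split lets the index preprocessing be amortized across all layers of one forward pass, which is presumably why the wrapper lives on the infer-state object rather than on the layer.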
     def _token_decode_attention_normal(self, q, infer_state: LlamaInferStateInfo, layer_weight, out=None):
         total_token_num = infer_state.total_token_num
         batch_size = infer_state.batch_size
@@ -565,7 +609,7 @@ def _token_decode_attention_ppl_fp16(self, q, infer_state: LlamaInferStateInfo,
         # at::Tensor v, at::Tensor v_s, at::Tensor b_loc, at::Tensor b_seq_len, int max_len_in_batch)
         fp16_decode_attention(
             o_tensor.view(calcu_shape1),
-            1.0 / (self.head_dim_ ** 0.5),
+            1.0 / (self.head_dim_ ** 0.5),
             q.view(calcu_shape1),
             infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :],
             infer_state.mem_manager.kv_buffer[self.layer_num_][
@@ -673,7 +717,6 @@ def _token_decode_attention_gqa_flashdecoding_vsm(
         )
 
     def _token_decode_attention_flashattention(self, q, infer_state: LlamaInferStateInfo, layer_weight, out=None):
-
         cache_k = infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :].reshape(
             -1, 1, self.tp_k_head_num_, self.head_dim_
         )
@@ -683,7 +726,7 @@ def _token_decode_attention_flashattention(self, q, infer_state: LlamaInferState
         q = q.reshape(-1, self.tp_q_head_num_, self.head_dim_)
         k_descale, v_descale = None, None  # disable quantization
         Lq = q.shape[-1]
-        sm_scale = 1.0 / (Lq ** 0.5)
+        sm_scale = 1.0 / (Lq ** 0.5)
         o = flash_attn_with_kvcache(
             q=q,
             k_cache=cache_k,
@@ -711,7 +754,6 @@ def overlap_tpsp_token_forward(
         infer_state1: LlamaInferStateInfo,
         layer_weight: LlamaTransformerLayerWeight,
     ):
-
         input_embdings = self.tpsp_token_forward(input_embdings, infer_state, layer_weight=layer_weight)
         input_embdings1 = self.tpsp_token_forward(input_embdings1, infer_state1, layer_weight=layer_weight)
         return input_embdings, input_embdings1