
Commit d72f284: support fa3
1 parent 7e2c8b8 commit d72f284

File tree

13 files changed (+156, -17 lines)

lmdeploy/pytorch/backends/attention.py

Lines changed: 3 additions & 0 deletions
@@ -36,6 +36,7 @@ def __init__(
         logit_softcapping: float = None,
         causal: bool = True,
         use_flash_mla: bool = False,
+        use_flash_attn3: bool = False,
         **kwargs,
     ) -> None:
         if scale is None:
@@ -57,6 +58,7 @@ def __init__(
         self.logit_softcapping = logit_softcapping
         self.causal = causal
         self.use_flash_mla = use_flash_mla
+        self.use_flash_attn3 = use_flash_attn3

     @abstractmethod
     def forward(
@@ -92,6 +94,7 @@ def build(
         logical_softcapping: float = None,
         causal: bool = True,
         use_flash_mla: bool = False,
+        use_flash_attn3: bool = False,
         learnable_sink: bool = False,
         **kwargs,
     ) -> AttentionImpl[T]:
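
The change above threads a new use_flash_attn3 switch through both the implementation constructor and the layer-level build factory, mirroring the existing use_flash_mla flag. A minimal sketch of that dispatch pattern with hypothetical class names (not the lmdeploy API), assuming the flag simply selects which backend class the factory returns:

# Hedged sketch: hypothetical names, illustrating how a boolean capability
# flag is carried from a factory into the chosen attention implementation.
from dataclasses import dataclass


@dataclass
class AttnArgs:
    num_heads: int
    head_size: int
    use_flash_mla: bool = False
    use_flash_attn3: bool = False


class TritonAttention:
    def __init__(self, args: AttnArgs):
        self.args = args


class FlashAttn3Attention:
    def __init__(self, args: AttnArgs):
        self.args = args


def build_attention(args: AttnArgs):
    """Pick FA3 when requested, otherwise fall back to the default backend."""
    if args.use_flash_attn3:
        return FlashAttn3Attention(args)
    return TritonAttention(args)


impl = build_attention(AttnArgs(num_heads=32, head_size=128, use_flash_attn3=True))
print(type(impl).__name__)  # FlashAttn3Attention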

lmdeploy/pytorch/backends/cuda/attention.py

Lines changed: 25 additions & 6 deletions
@@ -407,8 +407,9 @@ def __init__(
             causal=causal,
             **kwargs,
         )
-        from lmdeploy.pytorch.third_party.flash_attn_interface import flash_attn_varlen_func
+        from lmdeploy.pytorch.third_party.flash_attn_interface import flash_attn_varlen_func, flash_attn_with_kvcache
         self.flash_attn_varlen_func_v3 = flash_attn_varlen_func
+        self.flash_attn_with_kvcache_v3 = flash_attn_with_kvcache

     def forward(
         self,
@@ -460,11 +461,10 @@ def forward(
             quant_policy=quant_policy,
         )

-        q_shape = query.shape
-        o_shape = q_shape[:-1] + (self.v_head_size, )
-        attn_output = query.new_empty(o_shape)
-
         if is_decoding:
+            q_shape = query.shape
+            o_shape = q_shape[:-1] + (self.v_head_size, )
+            attn_output = query.new_empty(o_shape)
             self.paged_attention_fwd(
                 query,
                 k_cache,
@@ -480,6 +480,24 @@ def forward(
                 logit_softcapping=self.logit_softcapping,
             )
         else:
+            sliding_window = (-1, -1) if self.sliding_window is None else self.sliding_window
+            if isinstance(sliding_window, int):
+                sliding_window = (sliding_window, sliding_window)
+            attn_output = self.flash_attn_with_kvcache_v3(
+                query,
+                k_cache,
+                v_cache,
+                cache_seqlens=attn_metadata.kv_seqlens.to(torch.int32),
+                cu_seqlens_q=attn_metadata.cu_seqlens_q,
+                cu_seqlens_k_new=attn_metadata.cu_seqlens_k,
+                max_seqlen_q=max_q_seqlen,
+                page_table=block_offsets,
+                softmax_scale=self.scale,
+                causal=self.causal,
+                window_size=sliding_window,
+                softcap=-1.0 if self.logit_softcapping is None else self.logit_softcapping,
+            )
+            return attn_output
             flatten_k, flatten_v = self.flatten_kv_cache(
                 k_cache,
                 v_cache,
@@ -527,6 +545,7 @@ def build(
         logical_softcapping: float = None,
         causal: bool = True,
         use_flash_mla: bool = False,
+        use_flash_attn3: bool = False,
        learnable_sink: bool = False,
        **kwargs,
    ) -> TritonAttentionImpl:
@@ -542,7 +561,7 @@ def build(
                           logical_softcapping=logical_softcapping,
                           causal=causal,
                           **kwargs)
-        elif use_fa3 and not alibi and not learnable_sink:
+        elif use_flash_attn3 and not alibi and not learnable_sink:
            return FA3Impl(num_heads,
                           head_size,
                           scale=scale,
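
Two details of the new prefill path are easy to miss: the sliding window is normalized into the (left, right) tuple form passed to flash_attn_with_kvcache, with (-1, -1) meaning no window, and the softcap argument falls back to -1.0 when logit softcapping is unset. A small, self-contained sketch of that normalization (plain Python, not tied to the lmdeploy classes):

from typing import Optional, Tuple, Union

WindowSpec = Union[int, Tuple[int, int], None]


def normalize_window(sliding_window: WindowSpec) -> Tuple[int, int]:
    """Coerce a sliding-window setting into the (left, right) tuple form."""
    if sliding_window is None:
        return (-1, -1)          # no sliding window
    if isinstance(sliding_window, int):
        return (sliding_window, sliding_window)
    return tuple(sliding_window)


def normalize_softcap(logit_softcapping: Optional[float]) -> float:
    """Map 'no softcapping' (None) onto the sentinel used in the call above."""
    return -1.0 if logit_softcapping is None else logit_softcapping


assert normalize_window(None) == (-1, -1)
assert normalize_window(4096) == (4096, 4096)
assert normalize_window((1024, 0)) == (1024, 0)
assert normalize_softcap(None) == -1.0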

lmdeploy/pytorch/backends/cuda/op_backend.py

Lines changed: 11 additions & 4 deletions
@@ -130,9 +130,16 @@ def update_step_context(cls, step_context):
         kv_seqlens = step_context.kv_seqlens
         kv_start_loc = None
         kv_flatten_size = None
-        cu_seqlens_q = torch.nn.functional.pad(torch.cumsum(q_seqlens, dim=0, dtype=torch.int32), (1, 0))
-        cu_seqlens_k = torch.nn.functional.pad(torch.cumsum(kv_seqlens, dim=0, dtype=torch.int32), (1, 0))
-        if not step_context.is_decoding:
+        use_flash_mla = getattr(step_context.model_config, 'use_flash_mla', False)
+        use_flash_attn3 = getattr(step_context.model_config, 'use_flash_attn3', False)
+        cu_seqlens_q = None
+        cu_seqlens_k = None
+        if use_flash_mla or use_flash_attn3:
+            cu_seqlens_q = torch.nn.functional.pad(torch.cumsum(q_seqlens, dim=0, dtype=torch.int32), (1, 0))
+            cu_seqlens_k = torch.nn.functional.pad(torch.cumsum(kv_seqlens, dim=0, dtype=torch.int32), (1, 0))
+            step_context.block_offsets = step_context.block_offsets.to(torch.int32)
+
+        if (not step_context.is_decoding) and not use_flash_attn3:
             kv_start_loc = kv_seqlens.cumsum(0) - kv_seqlens
             kv_flatten_size = step_context.sum_kv_seqlen
         attn_metadata = attn_meta_cls(
@@ -147,7 +154,7 @@ def update_step_context(cls, step_context):
             cu_seqlens_q=cu_seqlens_q,
             cu_seqlens_k=cu_seqlens_k,
         )
-        if getattr(step_context.model_config, 'use_flash_mla', False) is True:
+        if use_flash_mla:
             if step_context.is_decoding is True:
                 cls.update_meta_flashmla(attn_metadata, step_context.model_config.num_attention_heads)

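
Both the FlashMLA and FA3 paths consume cumulative sequence lengths rather than flattened start locations, which is why cu_seqlens_q and cu_seqlens_k are only built when one of the flags is set. The pad-then-cumsum idiom produces an int32 exclusive prefix sum with a leading zero; a small CPU-only illustration:

import torch

q_seqlens = torch.tensor([3, 5, 2])
cu_seqlens_q = torch.nn.functional.pad(torch.cumsum(q_seqlens, dim=0, dtype=torch.int32), (1, 0))
print(cu_seqlens_q)  # tensor([ 0,  3,  8, 10], dtype=torch.int32)
# Token j of sequence b sits at flattened offset cu_seqlens_q[b] + j, and
# cu_seqlens_q[-1] is the total number of query tokens in the batch.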

lmdeploy/pytorch/config.py

Lines changed: 1 addition & 0 deletions
@@ -201,6 +201,7 @@ class ModelConfig:
     cogvlm_style: bool = False
     custom_module_map: Dict[str, setattr] = None
     use_flash_mla: bool = False
+    use_flash_attn3: bool = False

     def get_head_size(self):
         """Get head size."""

lmdeploy/pytorch/configurations/llama.py

Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .builder import AutoModelConfigBuilder
 from .default import DefaultModelConfigBuilder
+from .utils import flash_attn_v3_available


 class LlamaModelConfigBuilder(AutoModelConfigBuilder):
@@ -26,5 +27,6 @@ def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False,
             num_layers = cfg.num_layers
             hf_config.aux_hidden_state_layers = (2, num_layers // 2, num_layers - 3)
         cfg.hf_config = hf_config
-
+        cfg.use_flash_attn3 = flash_attn_v3_available()
+        cfg.hf_config.use_flash_attn3 = cfg.use_flash_attn3
         return cfg

lmdeploy/pytorch/configurations/utils.py

Lines changed: 15 additions & 0 deletions
@@ -19,3 +19,18 @@ def flash_mla_available():
     except ImportError:
         logger.warning('For higher performance, please install flash_mla https://github.com/deepseek-ai/FlashMLA')
     return use_flash_mla
+
+
+def flash_attn_v3_available():
+    """Check if flash attn v3 is available."""
+    use_fa3 = False
+    try:
+        # currently flash-attention only supports FA3 on sm90a with CUDA >= 12.3
+        if (torch.cuda.get_device_capability()[0] == 9) and (torch.version.cuda >= '12.3'):
+            import flash_attn_interface  # noqa: F401
+            assert torch.ops.flash_attn_3 is not None
+            use_fa3 = True
+    except Exception:
+        logger.warning('For higher performance, please install FlashAttention-3 '
+                       'https://github.com/Dao-AILab/flash-attention')
+    return use_fa3
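
The probe requires an sm90 (Hopper-class) GPU, CUDA 12.3 or newer, and an importable flash_attn_interface module, and it degrades to a warning otherwise. A standalone sketch for checking a machine by hand, mirroring (as an assumption) the same conditions:

import torch

if not torch.cuda.is_available():
    print('No CUDA device visible; FA3 cannot be used.')
else:
    major, minor = torch.cuda.get_device_capability()
    print(f'compute capability {major}.{minor}, CUDA {torch.version.cuda}')
    # The string comparison mirrors the committed check; it is fine for 12.x versions.
    if major == 9 and torch.version.cuda >= '12.3':
        try:
            import flash_attn_interface  # noqa: F401
            print('flash_attn_interface found: the FA3 path can be selected.')
        except ImportError:
            print('Install FlashAttention-3: https://github.com/Dao-AILab/flash-attention')
    else:
        print('FA3 needs an sm90 GPU and CUDA >= 12.3.')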

lmdeploy/pytorch/engine/engine.py

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,7 @@

 import numpy as np
 import torch
+from torch.profiler import record_function

 from lmdeploy.messages import PytorchEngineConfig, RequestMetrics, ResponseType, SpeculativeConfig
 from lmdeploy.pytorch.disagg.config import EngineRole
@@ -747,6 +748,7 @@ def __has_values(input_multimodals):

     @torch.inference_mode()
     @logging_timer('create_spec_inputs', logger)
+    @record_function('create_spec_inputs')
     def _create_spec_inputs(self, messages: SeqList, token_ids: List[List[int]]):
         """Create spec inputs from messages."""

@@ -782,6 +784,7 @@ def _create_spec_inputs(self, messages: SeqList, token_ids: List[List[int]]):

     @torch.inference_mode()
     @logging_timer('CreateModelInputs', logger)
+    @record_function('CreateModelInputs')
     def create_model_inputs(self, messages: SeqList, is_prefill: bool):
         """Create model inputs from messages.

@@ -933,6 +936,7 @@ def _make_spec_stats(self, seqs: SeqList, next_token_ids: torch.LongTensor):

         return all_stats

+    @record_function('make_infer_outputs')
     def _make_infer_outputs(
         self,
         batched_outputs: BatchedOutputs,
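
The record_function decorators are unrelated to FA3 itself; they label the input-building and output-packing stages so they show up as named ranges in torch.profiler traces. A minimal sketch of how such a label surfaces (hypothetical function, standard PyTorch profiler APIs):

import torch
from torch.profiler import ProfilerActivity, profile, record_function


@record_function('CreateModelInputs')   # same decorator style as in the diff above
def create_model_inputs(n: int) -> torch.Tensor:
    return torch.randn(n, n) @ torch.randn(n, n)


with profile(activities=[ProfilerActivity.CPU]) as prof:
    create_model_inputs(256)

# The 'CreateModelInputs' range appears in the summary table and in a Chrome
# trace exported with prof.export_chrome_trace('trace.json').
print(prof.key_averages().table(sort_by='cpu_time_total', row_limit=5))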

lmdeploy/pytorch/engine/model_agent.py

Lines changed: 7 additions & 5 deletions
@@ -1262,11 +1262,13 @@ def build_cache_engine(self):

     def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
         cache_swapping(self.cache_engine, swap_in_map=swap_in_map, swap_out_map=swap_out_map)
-        output = model_forward(self.patched_model,
-                               inputs,
-                               self.cache_engine,
-                               stream=self.stream,
-                               output_position_ids=False)
+        output = model_forward(
+            self.patched_model,
+            inputs,
+            self.cache_engine,
+            stream=self.stream,
+            output_position_ids=self.spec_agent is not None,
+        )
         return output

     async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):

lmdeploy/pytorch/kernels/cuda/flatten_kv_cache.py

Lines changed: 76 additions & 0 deletions
@@ -333,3 +333,79 @@ def flatten_kv_cache(k_caches: Tensor,
     )

     return k_states, v_states
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=['max_seq_len'],
+        x_vals=[128 * i for i in range(1, 33)],
+        line_arg='provider',
+        line_vals=['hsd', 'shd'],
+        line_names=['hsd', 'shd'],
+        styles=[('blue', '-'), ('red', '-')],
+        ylabel='time/ms',
+        plot_name='bench-flatten-kvcache-performance',
+        args={},
+    ))
+def bench_flatten_kv_cache(max_seq_len: int,
+                           batch_size: int = 128,
+                           num_blocks: int = 6400,
+                           block_size: int = 64,
+                           dtype: torch.dtype = torch.float16,
+                           provider='hsd'):
+    """Benchmark."""
+    head_dim = 128
+    num_head = 8
+    seqlens = torch.tensor([max_seq_len] * batch_size, dtype=torch.long, device='cuda')
+    block_offsets = torch.arange(batch_size * ((max_seq_len + block_size) // block_size),
+                                 dtype=torch.int32,
+                                 device='cuda').reshape(batch_size, -1)
+    out_size = batch_size * ((max_seq_len + block_size) // block_size) * block_size
+    start_loc = seqlens.cumsum(0) - seqlens
+
+    k_caches = torch.randn((num_blocks, block_size, num_head, head_dim),
+                           dtype=dtype,
+                           device='cuda',
+                           requires_grad=False)
+    v_caches = torch.randn((num_blocks, block_size, num_head, head_dim),
+                           dtype=dtype,
+                           device='cuda',
+                           requires_grad=False)
+
+    def flatten_hsd():
+        return flatten_kv_cache(k_caches,
+                                v_caches,
+                                seqlens,
+                                block_offsets,
+                                start_loc,
+                                out_size,
+                                flatten_kv_layout='hsd',
+                                kv_layout='bshd',
+                                out_dtype=dtype)
+
+    def flatten_shd():
+        return flatten_kv_cache(k_caches,
+                                v_caches,
+                                seqlens,
+                                block_offsets,
+                                start_loc,
+                                out_size,
+                                flatten_kv_layout='shd',
+                                kv_layout='bshd',
+                                out_dtype=dtype)
+
+    if provider == 'hsd':
+        flatten_op = flatten_hsd
+    else:
+        flatten_op = flatten_shd
+    quantiles = [0.5, 0.2, 0.8]
+    ms, min_ms, max_ms = triton.testing.do_bench(flatten_op, quantiles=quantiles, rep=500)
+
+    def perf(ms):
+        return ms
+
+    return perf(ms), perf(max_ms), perf(min_ms)
+
+
+if __name__ == '__main__':
+    bench_flatten_kv_cache.run(print_data=True, show_plots=True, save_path='perf_flatten_kv_cache')
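
The benchmark compares the 'hsd' and 'shd' flattening layouts across sequence lengths; running the file directly (via the __main__ block above) prints a table and writes a plot under perf_flatten_kv_cache. The harness itself is generic: with quantiles=[0.5, 0.2, 0.8], triton.testing.do_bench returns the median, 20th and 80th percentile times in milliseconds. A tiny standalone sketch of that pattern on an unrelated op:

import torch
import triton

x = torch.randn(4096, 4096, device='cuda', dtype=torch.float16)

# do_bench repeatedly times the callable; the three returned values follow
# the order of the requested quantiles (median, p20, p80 here).
ms, p20, p80 = triton.testing.do_bench(lambda: x @ x, quantiles=[0.5, 0.2, 0.8], rep=500)
print(f'fp16 matmul: median {ms:.3f} ms (p20 {p20:.3f}, p80 {p80:.3f})')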

lmdeploy/pytorch/models/llama.py

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ def __init__(self, config: LlamaConfig, dtype: torch.dtype = None, device: torch
             head_dim,
             num_kv_heads=num_key_value_heads,
             v_head_size=head_dim,
+            use_flash_attn3=getattr(config, 'use_flash_attn3', False),
         )

         # o_proj
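
getattr with a False default keeps older configs working: an hf_config produced before this commit has no use_flash_attn3 attribute, so the attention layer silently falls back instead of raising AttributeError. A two-line illustration with stand-in objects:

from types import SimpleNamespace

old_cfg = SimpleNamespace()                       # config without the new attribute
new_cfg = SimpleNamespace(use_flash_attn3=True)   # config built by the updated builder

print(getattr(old_cfg, 'use_flash_attn3', False))  # False
print(getattr(new_cfg, 'use_flash_attn3', False))  # True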
