File tree Expand file tree Collapse file tree 2 files changed +21
-1
lines changed
Expand file tree Collapse file tree 2 files changed +21
-1
lines changed Original file line number Diff line number Diff line change @@ -61,6 +61,16 @@ class BackendConfig:
6161 "cudagraph_mode" : "FULL_AND_PIECEWISE" ,
6262 },
6363 specific_gpu_arch = (9 , 0 )),
64+ # FlashAttention MLA on Hopper
65+ "FlashAttentionMLA" :
66+ BackendConfig (name = "FlashAttentionMLA" ,
67+ env_vars = {
68+ "VLLM_ATTENTION_BACKEND" : "FLASH_ATTN_MLA" ,
69+ },
70+ comp_config = {
71+ "cudagraph_mode" : "FULL_DECODE_ONLY" ,
72+ },
73+ specific_gpu_arch = (9 , 0 )),
6474 # Cutlass MLA on Blackwell
6575 "CutlassMLA" :
6676 BackendConfig (
@@ -102,7 +112,7 @@ class BackendConfig:
102112test_params_full_cudagraph = []
103113
104114# deepseek-ai/DeepSeek-V2-Lite with MLA
105- MLA_backends = ["FlashMLA" , "CutlassMLA" ]
115+ MLA_backends = ["FlashMLA" , "FlashAttentionMLA" , "CutlassMLA" ]
106116for mla_backend in MLA_backends :
107117 test_params_full_cudagraph .append (
108118 pytest .param (
Original file line number Diff line number Diff line change @@ -62,6 +62,16 @@ class BackendConfig:
6262 "cudagraph_mode" : "FULL_AND_PIECEWISE" ,
6363 },
6464 specific_gpu_arch = (9 , 0 )),
65+ # FlashAttention MLA on Hopper
66+ "FlashAttentionMLA" :
67+ BackendConfig (name = "FlashAttentionMLA" ,
68+ env_vars = {
69+ "VLLM_ATTENTION_BACKEND" : "FLASH_ATTN_MLA" ,
70+ },
71+ comp_config = {
72+ "cudagraph_mode" : "FULL_DECODE_ONLY" ,
73+ },
74+ specific_gpu_arch = (9 , 0 )),
6575 # FA2
6676 "FA2" :
6777 BackendConfig (name = "FA2" ,
You can’t perform that action at this time.
0 commit comments