 
 from tensorrt_llm.mapping import Mapping
 
-from ..attention_backend import (AttentionInputType, AttentionMetadata,
-                                 TrtllmAttention)
+from ..attention_backend import AttentionInputType, AttentionMetadata
 from ..attention_backend.interface import (PositionalEmbeddingParams,
                                            PredefinedAttentionMask)
 from ..attention_backend.utils import create_attention
@@ -104,19 +103,6 @@ def __init__(
         self.attn_backend = config.attn_backend
         self.pos_embd_params = pos_embd_params
 
-        self.enable_rope_fusion = self.attn_backend == "TRTLLM"
-        self.support_fused_qkv = self.attn_backend == "TRTLLM"
-        self.support_unfused_qkv = self.attn_backend != "TRTLLM"
-        self.rotary_emb = None
-        self.apply_rotary_emb = (not self.enable_rope_fusion
-                                 and pos_embd_params is not None)
-        if self.apply_rotary_emb:
-            self.rotary_emb = RotaryEmbedding(
-                pos_embd_params.rope,
-                head_dim=self.head_dim,
-                is_neox=pos_embd_params.is_neox,
-            )
-
         # These two modules are mutually exclusive - either splitted_qkv_lora or fused_qkv_lora will be used,
         # but never both at the same time. splitted_qkv_lora handles Q,K,V separately while fused_qkv_lora
         # handles them as a single fused operation.
@@ -132,8 +118,23 @@ def __init__(
 
         if not config.skip_create_weights:
             self.create_weights()
+        else:
+            self.create_backend()
 
-    def create_weights(self):
+        self.enable_rope_fusion = self.attn.support_fused_rope()
+        self.support_fused_qkv = self.attn.support_fused_qkv()
+
+        self.rotary_emb = None
+        self.apply_rotary_emb = (not self.enable_rope_fusion
+                                 and pos_embd_params is not None)
+        if self.apply_rotary_emb:
+            self.rotary_emb = RotaryEmbedding(
+                pos_embd_params.rope,
+                head_dim=self.head_dim,
+                is_neox=pos_embd_params.is_neox,
+            )
+
+    def create_backend(self):
         self.attn = create_attention(
             self.attn_backend,
             self.layer_idx,
@@ -144,10 +145,14 @@ def create_weights(self):
             quant_config=self.quant_config,
         )
 
+    def create_weights(self):
+        # recreate the backend when quant_config changes
+        self.create_backend()
+
     def convert_qkv(self, q, k, v):
         if k is None and v is None and not self.support_fused_qkv:
             q, k, v = q.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        elif k is not None and v is not None and not self.support_unfused_qkv:
+        elif k is not None and v is not None and self.support_fused_qkv:
             qkv = torch.concat([q, k, v], dim=-1)
             q, k, v = qkv, None, None
         return q, k, v
@@ -459,9 +464,8 @@ def yarn_get_mscale(scale=1, mscale=1):
         self.aux_stream = aux_stream
         self.ln_events = [torch.cuda.Event(), torch.cuda.Event()]
 
-        self.enable_rope_fusion = isinstance(self.mha, TrtllmAttention)
-        self.support_fused_qkv = isinstance(self.mha, TrtllmAttention)
-        self.support_unfused_qkv = not isinstance(self.mha, TrtllmAttention)
+        self.enable_rope_fusion = self.mha.support_fused_rope()
+        self.support_fused_qkv = self.mha.support_fused_qkv()
         self.rotary_emb = None
         self.apply_rotary_emb = not self.enable_rope_fusion
         if self.apply_rotary_emb:
@@ -575,7 +579,7 @@ def forward(
         return attn_output
 
     def _maybe_concat_qkv(self, q, k, v):
-        if k is not None and v is not None and not self.support_unfused_qkv:
+        if k is not None and v is not None and self.support_fused_qkv:
             qkv = torch.concat([q, k, v], dim=-1)
             q, k, v = qkv, None, None
         return q, k, v
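
The diff replaces hard-coded `attn_backend == "TRTLLM"` and `isinstance(self.mha, TrtllmAttention)` checks with capability queries on the backend object (`support_fused_rope()`, `support_fused_qkv()`), so callers branch on what a backend reports rather than on its concrete type. Below is a minimal sketch of that pattern; only the two `support_*` method names come from the diff, while the class names, defaults, and helper here are hypothetical stand-ins rather than the actual TensorRT-LLM classes.

```python
import torch


class AttentionBackendSketch:
    """Hypothetical stand-in for an attention backend base class."""

    def support_fused_rope(self) -> bool:
        # Default: RoPE is applied outside the attention kernel.
        return False

    def support_fused_qkv(self) -> bool:
        # Default: the kernel expects separate q, k, v tensors.
        return False


class FusedKernelBackendSketch(AttentionBackendSketch):
    """Stand-in for a backend whose kernel fuses RoPE and consumes packed QKV."""

    def support_fused_rope(self) -> bool:
        return True

    def support_fused_qkv(self) -> bool:
        return True


def maybe_concat_qkv(backend, q, k, v):
    # Mirrors the updated _maybe_concat_qkv/convert_qkv logic: pack q, k, v
    # into one tensor only when the backend advertises fused-QKV support.
    if k is not None and v is not None and backend.support_fused_qkv():
        return torch.concat([q, k, v], dim=-1), None, None
    return q, k, v
```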