@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Type, TypeVar
+from typing import (TYPE_CHECKING, ClassVar, NamedTuple, Optional, Tuple, Type,
+                    TypeVar)
 
 import torch
 import torch_npu
@@ -12,6 +13,7 @@
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -77,9 +79,9 @@ class ChunkedContextMetadata:
     block_table: torch.Tensor
     max_query_len: int
     max_seq_lens: int
+    sin: torch.Tensor
+    cos: torch.Tensor
     chunked_context: Optional[ChunkedContextMetadata] = None
-    sin: torch.Tensor = None
-    cos: torch.Tensor = None
 
 
 @dataclass
@@ -91,10 +93,10 @@ class AscendSFADecodeMetadata:
     seq_lens: torch.Tensor
     max_seq_lens: int
     seq_lens_list: list[int]
-    actual_seq_lengths_q: Optional[torch.Tensor] = None
+    actual_seq_lengths_q: torch.Tensor
+    sin: torch.Tensor
+    cos: torch.Tensor
     attn_mask: Optional[torch.Tensor] = None
-    sin: torch.Tensor = None
-    cos: torch.Tensor = None
 
 
 @dataclass
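These two hunks promote `sin`, `cos`, and `actual_seq_lengths_q` from optional fields defaulting to `None` to required tensors. Since both classes are dataclasses, the required fields also have to sit ahead of every field that still carries a default, which is why they move above `chunked_context` and `attn_mask`. A minimal sketch of that ordering rule, using illustrative names rather than the real vllm-ascend metadata classes:

```python
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class DecodeMetaSketch:
    # Required fields (no default) must be declared first ...
    seq_lens: torch.Tensor
    sin: torch.Tensor
    cos: torch.Tensor
    # ... ahead of any defaulted field; placing a required field after this
    # one would raise "TypeError: non-default argument follows default
    # argument" when the class is created.
    attn_mask: Optional[torch.Tensor] = None
```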
@@ -163,6 +165,9 @@ def split_metadata_for_multistream(
 
 
 class AscendSFAMetadataBuilder:
+    # Does this backend/builder support ACL Graphs for attention (default: no).
+    aclgraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.NEVER
     """
     NOTE: Please read the comment at the top of the file before trying to
     understand this class
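The builder now declares, as class-level state, that it does not support ACL graph capture for attention. `ClassVar` marks `aclgraph_support` as an attribute of the class itself rather than a per-instance field, so callers can query the capability without constructing a builder. A small sketch of the pattern; the enum here is a stand-in, since only the `NEVER` member of `AttentionCGSupport` appears in this diff:

```python
import enum
from typing import ClassVar


class CGSupportSketch(enum.Enum):
    # Stand-in for AttentionCGSupport; member names beyond NEVER are assumed.
    NEVER = enum.auto()
    ALWAYS = enum.auto()


class MetadataBuilderSketch:
    # Class-level capability flag: ClassVar tells type checkers (and any
    # dataclass machinery) that this is shared class state, not an
    # instance field.
    aclgraph_support: ClassVar[CGSupportSketch] = CGSupportSketch.NEVER


# The capability can be read straight off the class, no instance needed.
assert MetadataBuilderSketch.aclgraph_support is CGSupportSketch.NEVER
```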
@@ -292,11 +297,10 @@ def build(
         device = self.device
 
         block_table = (common_attn_metadata.block_table_tensor[:num_reqs])
-        slot_mapping = common_attn_metadata.slot_mapping_cpu[:
-                                                             num_actual_tokens].to(
-                                                                 device,
-                                                                 non_blocking=
-                                                                 True)
+        slot_mapping = common_attn_metadata.slot_mapping[:
+                                                         num_actual_tokens].to(
+                                                             device,
+                                                             non_blocking=True)
         input_positions = common_attn_metadata.positions[:
                                                           num_actual_tokens].long(
                                                           )
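`build()` now takes the slot mapping from `common_attn_metadata.slot_mapping` rather than the `_cpu` copy, slices it to the tokens actually scheduled this step, and moves it to the device with `non_blocking=True`. A hedged sketch of that slice-then-copy pattern with placeholder sizes and device selection (not the real builder code):

```python
import torch

# Illustrative stand-ins; the real tensors come from CommonAttentionMetadata.
num_actual_tokens = 8
slot_mapping_full = torch.arange(16, dtype=torch.int64)
device = "npu:0" if hasattr(torch, "npu") else "cpu"  # assumes torch_npu is imported on NPU hosts

# Slice first so only the scheduled tokens are transferred; non_blocking=True
# lets the copy overlap with host work (it is truly asynchronous only when the
# source lives in pinned host memory).
slot_mapping = slot_mapping_full[:num_actual_tokens].to(device, non_blocking=True)
```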
@@ -686,8 +690,7 @@ def _sfa_preprocess(self, hidden_states, kv_cache, attn_metadata,
             topk_indices = self.indexer_select(hidden_states_decode,
                                                decode_q_c,
                                                attn_metadata=attn_metadata,
-                                               kv_cache=kv_cache,
-                                               is_prefill=False)
+                                               kv_cache=kv_cache)
 
             query_states = (decode_q_nope, decode_q_pe)
             key_states = (decode_k_nope, decode_k_rope)
@@ -775,8 +778,7 @@ def _sfa_preprocess(self, hidden_states, kv_cache, attn_metadata,
             topk_indices = self.indexer_select(x=hidden_states_prefill,
                                                qr=prefill_qr,
                                                kv_cache=kv_cache,
-                                               attn_metadata=attn_metadata,
-                                               is_prefill=True)
+                                               attn_metadata=attn_metadata)
             query_states = (prefill_q_nope, prefill_q_pe)
             key_states = (prefill_k_nope, prefill_k_pe)
             prefill_preprocess_res = PrefillSFAPreprocessResult(
@@ -826,45 +828,27 @@ def forward(
                 query_states=decode_preprocess_res.query_states,
                 key_states=decode_preprocess_res.key_states,
                 attn_metadata=attn_metadata,
-                attention_mask=None,
-                kv_cache=kv_cache,
-                topk_indices=decode_preprocess_res.topk_indices,
-                is_prefill=False,
-                bsz=decode_preprocess_res.bsz)
+                topk_indices=decode_preprocess_res.topk_indices)
             o_proj_input[:num_decode_tokens] = decode_attn_output
 
         if prefill_preprocess_res is not None:
             prefill_attn_output = self.apply_attention_fusion(
                 query_states=prefill_preprocess_res.query_states,
                 key_states=prefill_preprocess_res.key_states,
                 attn_metadata=attn_metadata,
-                attention_mask=None,
-                kv_cache=kv_cache,
-                topk_indices=prefill_preprocess_res.topk_indices,
-                is_prefill=True,
-                bsz=None)
+                topk_indices=prefill_preprocess_res.topk_indices)
             o_proj_input[num_decode_tokens:] = prefill_attn_output
 
         output[...] = self.mla_epilog(o_proj_input, absorb=True)
         return output
 
-    def apply_attention_fusion(
-            self,
-            query_states,
-            key_states,
-            topk_indices,
-            attn_metadata: M,
-            attention_mask: Optional[torch.Tensor] = None,
-            # actual_seq_qlen: torch.Tensor = None,
-            # actual_seq_lengths_kv: torch.Tensor = None,
-            kv_cache: Tuple[torch.Tensor] = None,
-            is_prefill: bool = True,
-            bsz: int = None):
+    def apply_attention_fusion(self, query_states, key_states, topk_indices,
+                               attn_metadata: M):
         # repeat k/v heads if n_kv_heads < n_heads
         q_nope, q_pe = query_states
         k_nope, k_rope = key_states
 
-        if is_prefill:
+        if attn_metadata.prefill is not None:
 
             prefill_metadata = attn_metadata.prefill
 
@@ -885,7 +869,7 @@ def apply_attention_fusion(
                 sparse_mode=3,
             )
 
-        else:
+        elif attn_metadata.decode is not None:
             decode_metadata = attn_metadata.decode
 
             slc_fa_fusion = torch.ops.custom.npu_selected_flash_attention(
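Taken together, the `forward`/`apply_attention_fusion` hunks drop the `is_prefill`, `attention_mask`, `kv_cache`, and `bsz` arguments and instead dispatch on which phase-specific block of the metadata is populated: `attn_metadata.prefill` for prefill requests, `attn_metadata.decode` for decode. A minimal sketch of that dispatch pattern, with illustrative classes rather than the real `AscendSFAMetadata` types:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class PrefillPhaseSketch:
    max_query_len: int


@dataclass
class DecodePhaseSketch:
    max_seq_lens: int


@dataclass
class AttnMetaSketch:
    # Exactly one of the phase-specific blocks is expected to be populated.
    prefill: Optional[PrefillPhaseSketch] = None
    decode: Optional[DecodePhaseSketch] = None


def fused_attention_sketch(meta: AttnMetaSketch) -> str:
    # Branch on the populated sub-metadata instead of a separate is_prefill
    # flag, so the flag can never disagree with the metadata it describes.
    if meta.prefill is not None:
        return f"prefill path, max_query_len={meta.prefill.max_query_len}"
    elif meta.decode is not None:
        return f"decode path, max_seq_lens={meta.decode.max_seq_lens}"
    raise ValueError("metadata has neither prefill nor decode populated")


print(fused_attention_sketch(AttnMetaSketch(decode=DecodePhaseSketch(max_seq_lens=128))))
```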
@@ -937,14 +921,19 @@ def indexer_select(
         qr: torch.Tensor,
         kv_cache: Tuple[torch.Tensor],
         attn_metadata: M,
-        is_prefill: bool = True,
     ):
-        if is_prefill:
+        if attn_metadata.prefill is not None:
             cos = attn_metadata.prefill.cos
             sin = attn_metadata.prefill.sin
-        else:
+            actual_seq_lengths_query = attn_metadata.prefill.query_lens
+            actual_seq_lengths_key = attn_metadata.prefill.seq_lens
+            block_table = attn_metadata.prefill.block_table
+        elif attn_metadata.decode is not None:
             cos = attn_metadata.decode.cos
             sin = attn_metadata.decode.sin
+            actual_seq_lengths_query = attn_metadata.decode.actual_seq_lengths_q
+            actual_seq_lengths_key = attn_metadata.decode.seq_lens
+            block_table = attn_metadata.decode.block_table
 
         cos_q, sin_q = cos, sin
         cos = cos.view(-1, 1, 1, self.qk_rope_head_dim)
@@ -982,17 +971,6 @@ def indexer_select(
                                 k.shape[-1]))  # b, s, n, d
 
         weights = self.weights_proj(x)
-        actual_seq_lengths_query = None
-        actual_seq_lengths_key = None
-        block_table = None
-        if is_prefill:
-            actual_seq_lengths_query = attn_metadata.prefill.query_lens
-            actual_seq_lengths_key = attn_metadata.prefill.seq_lens
-            block_table = attn_metadata.prefill.block_table
-        else:
-            actual_seq_lengths_query = attn_metadata.decode.actual_seq_lengths_q
-            actual_seq_lengths_key = attn_metadata.decode.seq_lens
-            block_table = attn_metadata.decode.block_table
 
         topk_indices = torch.ops.custom.npu_lightning_indexer(
             query=q,