
Commit 8e9abf7

linfeng-yuan authored and wangxiyuan committed
fix ci
Signed-off-by: linfeng-yuan <[email protected]>
1 parent 6f149db commit 8e9abf7

File tree

5 files changed: +17 -19 lines changed


vllm_ascend/attention/sfa_v1.py

Lines changed: 2 additions & 2 deletions
@@ -796,7 +796,7 @@ def _sfa_preprocess(self, hidden_states, kv_cache, attn_metadata,
     def forward(
         self,
         hidden_states: torch.Tensor,  # query in unified attn
-        kv_cache: Tuple[torch.Tensor],
+        kv_cache: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
         attn_metadata: M,
         need_gather_q_kv: bool = False,
         output: Optional[torch.Tensor] = None,
@@ -919,7 +919,7 @@ def indexer_select(
         self,
         x: torch.Tensor,
         qr: torch.Tensor,
-        kv_cache: Tuple[torch.Tensor],
+        kv_cache: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
         attn_metadata: M,
     ):
         if attn_metadata.prefill is not None:
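Note on the annotation change above: to a type checker, Tuple[torch.Tensor] denotes a tuple of exactly one tensor, so unpacking a 3-tuple cache fails mypy under the old hint. A minimal sketch of the distinction (the tensor names are hypothetical; the diff does not say what each cache tensor holds):

    from typing import Tuple

    import torch

    def unpack_cache(
            kv_cache: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
        # Well-typed: the annotation promises exactly three tensors.
        # Under Tuple[torch.Tensor] (a 1-tuple), mypy rejects this unpack.
        cache_a, cache_b, cache_c = kv_cache  # hypothetical names
        return cache_a.shape, cache_b.shape, cache_c.shape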

vllm_ascend/patch/worker/patch_common/patch_attention_selector.py

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ def _cached_get_attn_backend(
         return resolve_obj_by_qualname(attention_cls)
 else:

-    def get_attn_backend(
+    def get_attn_backend(  # type: ignore[misc]
         head_size: int,
         dtype: torch.dtype,
         kv_cache_dtype: Optional[str],
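The added # type: ignore[misc] suppresses a mypy error on this redefinition; a common trigger for the [misc] code in this pattern is "All conditional function variants must have identical signatures", raised when the two branches of a gate define the same name with different signatures. A hedged, self-contained illustration (the condition and names are invented for the sketch):

    import os

    if os.environ.get("USE_SHORT_SIGNATURE"):
        def resolve(name: str) -> str:
            return name
    else:
        # The extra parameter makes the two variants differ, tripping
        # mypy's conditional-variant check; the ignore silences it.
        def resolve(name: str, strict: bool = False) -> str:  # type: ignore[misc]
            return name.strip() if strict else name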

vllm_ascend/torchair/models/torchair_deepseek_v2.py

Lines changed: 1 addition & 1 deletion
@@ -890,7 +890,7 @@ def __init__(
                 attn_cls = TorchairDeepseekV2SFAAttention
                 self.use_sfa = True
             else:
-                attn_cls = TorchairDeepseekV2MLAAttention
+                attn_cls = TorchairDeepseekV2MLAAttention  # type: ignore[assignment]
         else:
             attn_cls = DeepseekV2Attention
         self.self_attn = attn_cls(
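The ignore here targets mypy's [assignment] code: once attn_cls is inferred from the first branch as type[TorchairDeepseekV2SFAAttention], assigning an unrelated class in the else branch is an incompatible assignment. A small sketch of the same error shape (classes invented for illustration):

    class FastAttention:
        def __init__(self, dim: int) -> None:
            self.dim = dim

    class FallbackAttention:
        def __init__(self, dim: int, window: int) -> None:
            self.dim, self.window = dim, window

    def pick(use_fast: bool):
        cls = FastAttention
        if not use_fast:
            # mypy: expression has type "type[FallbackAttention]",
            # variable has type "type[FastAttention]"  [assignment]
            cls = FallbackAttention  # type: ignore[assignment]
        return cls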

vllm_ascend/torchair/torchair_sfa.py

Lines changed: 4 additions & 13 deletions
@@ -971,7 +971,7 @@ def _sfa_decode_preprocess(self, hidden_states, kv_cache, attn_metadata,
     def forward(
         self,
         hidden_states: torch.Tensor,  # query in unified attn
-        kv_cache: Tuple[torch.Tensor],
+        kv_cache: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
         attn_metadata: M,
         need_gather_q_kv: bool = False,
         output: Optional[torch.Tensor] = None,
@@ -981,21 +981,12 @@ def forward(
             # Profiling run.
             return output

-        has_prefill = attn_metadata.is_prefill
-        has_decode = attn_metadata.is_decode
+
         if attn_metadata.prefill is not None:
-            # num_actual_tokens = attn_metadata.num_actual_tokens
             assert attn_metadata.num_decodes is not None and \
                 attn_metadata.num_prefills is not None and \
                 attn_metadata.num_decode_tokens is not None
-            # num_decode_tokens = attn_metadata.num_decode_tokens
-            # Inputs and outputs may be padded for CUDA graphs
-            # has_decode = attn_metadata.num_decodes > 0
-            has_prefill = attn_metadata.num_prefills > 0
-            # num_decode_tokens = attn_metadata.num_decode_tokens
-            # num_actual_tokens = attn_metadata.num_actual_tokens
-
-            # output_padded = output
+
             bsz = 1

             hidden_states_prefill = hidden_states
@@ -1222,7 +1213,7 @@ def indexer_select(
         self,
         x: torch.Tensor,
         qr: torch.Tensor,
-        kv_cache: Tuple[torch.Tensor],
+        kv_cache: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
         attn_metadata: M,
         is_prefill: bool = True,
     ):
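Besides the annotation fix, the middle hunk drops the stale has_prefill/has_decode locals and a block of commented-out bookkeeping; the live branch condition is attn_metadata.prefill is not None. A stripped-down sketch of the resulting control flow (attribute layout assumed from the diff, not the full class):

    def dispatch(attn_metadata) -> str:
        # Branch directly on the metadata's prefill section instead of
        # precomputing boolean flags that nothing else reads.
        if attn_metadata.prefill is not None:
            return "prefill path"
        return "decode path"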

vllm_ascend/worker/worker_v1.py

Lines changed: 9 additions & 2 deletions
@@ -43,7 +43,7 @@
 from vllm.v1.worker.worker_base import WorkerBase

 import vllm_ascend.envs as envs_ascend
-from vllm_ascend.ascend_config import init_ascend_config
+from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config
 from vllm_ascend.device_allocator.camem import CaMemAllocator
 from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel
 from vllm_ascend.platform import NPUPlatform
@@ -88,7 +88,14 @@ def __init__(
         # init ascend config and soc version
         init_ascend_config(vllm_config)
         init_ascend_soc_version()
-        import custom_ops  # noqa
+        if get_ascend_config().use_sfa:
+            # Direct import instead of using try_register_lib to ensure proper error handling when
+            # custom_ops is necessary but not available (e.g., in DeepSeek v3.2 deployments)
+            import custom_ops  # type: ignore[import-untyped] # noqa
+            logger.info(
+                "custom_ops module loaded successfully. Custom operators like "
+                "torch.ops.custom.npu_selected_flash_attention are now available."
+            )

         super().__init__(vllm_config=vllm_config,
                          local_rank=local_rank,
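The worker change gates the custom_ops import on the SFA config flag and, per the in-diff comment, deliberately lets an ImportError propagate so a missing kernel package fails loudly when it is actually required. A minimal sketch of this guarded-import pattern (the function name is hypothetical):

    def load_custom_ops_if_needed(use_sfa: bool) -> None:
        if not use_sfa:
            return
        # No try/except on purpose: if the optional package is required
        # but absent, the ImportError should surface to the user.
        import custom_ops  # noqa: F401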
