Skip to content

Commit a99579e

Browse files
linfeng-yuan authored and wangxiyuan committed
adapt to new custom_ops interface
Signed-off-by: linfeng-yuan <[email protected]>
1 parent 8e9abf7 commit a99579e

File tree

3 files changed

+15
-15
lines changed

3 files changed

+15
-15
lines changed

vllm_ascend/attention/sfa_v1.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -852,13 +852,13 @@ def apply_attention_fusion(self, query_states, key_states, topk_indices,
852852

853853
prefill_metadata = attn_metadata.prefill
854854

855-
slc_fa_fusion = torch.ops.custom.npu_selected_flash_attention(
855+
slc_fa_fusion = torch.ops.custom.npu_sparse_flash_attention(
856856
query=q_nope,
857857
key=k_nope,
858858
value=k_nope,
859-
selected_indices=topk_indices,
859+
sparse_indices=topk_indices,
860860
scale_value=self.scale,
861-
selected_block_size=1,
861+
sparse_block_size=1,
862862
block_table=prefill_metadata.block_table,
863863
actual_seq_lengths_query=prefill_metadata.query_lens,
864864
actual_seq_lengths_kv=prefill_metadata.seq_lens,
@@ -872,13 +872,13 @@ def apply_attention_fusion(self, query_states, key_states, topk_indices,
872872
elif attn_metadata.decode is not None:
873873
decode_metadata = attn_metadata.decode
874874

875-
slc_fa_fusion = torch.ops.custom.npu_selected_flash_attention(
875+
slc_fa_fusion = torch.ops.custom.npu_sparse_flash_attention(
876876
query=q_nope,
877877
key=k_nope,
878878
value=k_nope,
879-
selected_indices=topk_indices,
879+
sparse_indices=topk_indices,
880880
scale_value=self.scale,
881-
selected_block_size=1,
881+
sparse_block_size=1,
882882
block_table=attn_metadata.decode.block_table,
883883
actual_seq_lengths_query=decode_metadata.actual_seq_lengths_q,
884884
actual_seq_lengths_kv=decode_metadata.seq_lens,
@@ -981,6 +981,6 @@ def indexer_select(
981981
block_table=block_table,
982982
layout_query="TND",
983983
layout_key="PA_BSND",
984-
selected_count=2048,
984+
sparse_count=2048,
985985
sparse_mode=3)
986986
return topk_indices

vllm_ascend/torchair/torchair_sfa.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,13 +1070,13 @@ def forward(
10701070
k_nope, k_rope = key_states
10711071
prefill_metadata = attn_metadata.prefill
10721072

1073-
slc_fa_fusion = torch.ops.custom.npu_selected_flash_attention(
1073+
slc_fa_fusion = torch.ops.custom.npu_sparse_flash_attention(
10741074
query=q_nope,
10751075
key=k_nope,
10761076
value=k_nope,
1077-
selected_indices=topk_indices,
1077+
sparse_indices=topk_indices,
10781078
scale_value=self.scale,
1079-
selected_block_size=1,
1079+
sparse_block_size=1,
10801080
block_table=prefill_metadata.block_table,
10811081
actual_seq_lengths_query=prefill_metadata.query_lens,
10821082
actual_seq_lengths_kv=prefill_metadata.seq_lens,
@@ -1175,13 +1175,13 @@ def forward(
11751175
k_nope, k_rope = key_states
11761176

11771177
decode_metadata = attn_metadata.decode
1178-
slc_fa_fusion = torch.ops.custom.npu_selected_flash_attention(
1178+
slc_fa_fusion = torch.ops.custom.npu_sparse_flash_attention(
11791179
query=q_nope,
11801180
key=k_nope,
11811181
value=k_nope,
1182-
selected_indices=topk_indices,
1182+
sparse_indices=topk_indices,
11831183
scale_value=self.scale,
1184-
selected_block_size=1,
1184+
sparse_block_size=1,
11851185
block_table=attn_metadata.decode.block_table,
11861186
actual_seq_lengths_query=decode_metadata.actual_seq_lengths_q,
11871187
actual_seq_lengths_kv=decode_metadata.seq_lens,
@@ -1292,7 +1292,7 @@ def indexer_select(
12921292
block_table=block_table,
12931293
layout_query="TND",
12941294
layout_key="PA_BSND",
1295-
selected_count=2048,
1295+
sparse_count=2048,
12961296
sparse_mode=3)
12971297
return topk_indices
12981298

vllm_ascend/worker/worker_v1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def __init__(
9494
import custom_ops # type: ignore[import-untyped] # noqa
9595
logger.info(
9696
"custom_ops module loaded successfully. Custom operators like "
97-
"torch.ops.custom.npu_selected_flash_attention are now available."
97+
"torch.ops.custom.npu_sparse_flash_attention are now available."
9898
)
9999

100100
super().__init__(vllm_config=vllm_config,

0 commit comments

Comments (0)