format

1024daniel · 1024daniel · commit 8cac14839887 · 2026-03-18T17:29:46.000+08:00
diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
@@ -28,15 +28,15 @@
 from vllm_ascend.attention.mla_v1 import MAX_O_PROJ_PREFETCH_SIZE, MLAPO_MAX_SUPPORTED_TOKENS
 from vllm_ascend.attention.utils import (
     AscendCommonAttentionMetadata,
+    AscendLightningIndexerMetadata,
     ascend_chunked_prefill_workspace_size,
     enable_cp,
+    get_index_of_skipped_queries_numpy,
+    get_sfa_skip_indices,
     maybe_save_kv_layer_to_connector,
     trans_rope_weight,
     transdata,
     wait_for_kv_layer_from_connector,
-    get_sfa_skip_indices,
-    get_index_of_skipped_queries_numpy,
-    AscendLightningIndexerMetadata
 )
 from vllm_ascend.device.device_op import DeviceOperator
 from vllm_ascend.distributed.utils import all_gather_async
@@ -55,8 +55,8 @@
     dispose_layer,
     enable_dsa_cp,
     enable_dsa_cp_with_layer_shard,
-    enable_lightning_indexer_skip,
     enable_dsa_cp_with_o_proj_tp,
+    enable_lightning_indexer_skip,
     get_weight_prefetch_method,
     maybe_trans_nz,
 )
@@ -244,7 +244,7 @@ def build(
 
         cum_query_lens = common_attn_metadata.query_start_loc[1 : num_reqs + 1]
         seq_lens = common_attn_metadata.seq_lens[:num_reqs]
-        
+
         my_query_start_loc = common_attn_metadata.query_start_loc[: num_reqs + 1]
         tokens = my_query_start_loc[1:] - my_query_start_loc[:-1]
 
@@ -332,9 +332,8 @@ def build(
         top_k_indices_skip_li_query = None
         skip = False
         if enable_lightning_indexer_skip():
-
-            li_reorder_indices, li_cum_query_lens, li_seq_lens, li_skiped_query_mask, num_of_non_skip_tokens = get_sfa_skip_indices(
-            seq_lens-tokens, tokens
+            li_reorder_indices, li_cum_query_lens, li_seq_lens, li_skiped_query_mask, num_of_non_skip_tokens = (
+                get_sfa_skip_indices(seq_lens - tokens, tokens)
             )
             skip = num_of_non_skip_tokens is not None
 
@@ -343,7 +342,7 @@ def build(
                     li_cum_query_lens, li_seq_lens, num_reqs, 2048
                 )
                 common_attn_metadata.lightning_indexer_metadata = AscendLightningIndexerMetadata(
-                                                li_reorder_indices=torch.from_numpy(li_reorder_indices)
+                    li_reorder_indices=torch.from_numpy(li_reorder_indices)
                     .pin_memory()
                     .to(dtype=torch.int32, device=self.device, non_blocking=True),
                     li_cum_query_lens=torch.from_numpy(li_cum_query_lens)
@@ -358,7 +357,7 @@ def build(
                     top_k_indices_of_skipped_queries=torch.from_numpy(top_k_indices_of_skipped_queries_numpy)
                     .pin_memory()
                     .to(dtype=torch.int32, device=self.device, non_blocking=True),
-                    num_of_non_skip_tokens = num_of_non_skip_tokens
+                    num_of_non_skip_tokens=num_of_non_skip_tokens,
                 )
                 li_reorder_indices = common_attn_metadata.lightning_indexer_metadata.li_reorder_indices
                 input_positions_pad = torch.zeros_like(input_positions)
@@ -374,7 +373,9 @@ def build(
                 slot_mapping = slot_mapping_pad
                 input_positions = input_positions_pad
                 cos, sin = get_cos_and_sin_mla(input_positions, True)
-                top_k_indices_skip_li_query = common_attn_metadata.lightning_indexer_metadata.top_k_indices_of_skipped_queries
+                top_k_indices_skip_li_query = (
+                    common_attn_metadata.lightning_indexer_metadata.top_k_indices_of_skipped_queries
+                )
 
         return self.metadata_cls(  # type: ignore
             num_input_tokens=common_attn_metadata.num_input_tokens,
@@ -389,10 +390,10 @@ def build(
             sin=sin[:num_input_tokens],
             cos=cos[:num_input_tokens],
             dsa_cp_context=dsa_cp_context,
-            num_actual_seqs = num_reqs,
-            top_k_indices_skip_li_query = top_k_indices_skip_li_query,
-            non_skip_num_actual_tokens = num_of_non_skip_tokens,
-            skip = skip
+            num_actual_seqs=num_reqs,
+            top_k_indices_skip_li_query=top_k_indices_skip_li_query,
+            non_skip_num_actual_tokens=num_of_non_skip_tokens,
+            skip=skip,
         )
 
     def build_for_graph_capture(
@@ -997,8 +998,8 @@ def indexer_select_post_process(
         if num_tokens > 0:
             weights, _ = self.weights_proj(x)
 
-            q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128]
-            q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128]
+            q_li, _ = self.wq_b(q_c)  # [b,s,1536] @ [1536,64*128] = [b,s,64*128]
+            q_li = q_li.view(-1, self.n_head, self.head_dim)  # [n_toks,64,128]
 
             # rope
             if HAS_TRITON:
@@ -1014,16 +1015,15 @@ def indexer_select_post_process(
                     q_li,
                     [self.qk_rope_head_dim, self.head_dim - self.qk_rope_head_dim],
                     dim=-1,
-                ) # [b,s,64,64+64]
+                )  # [b,s,64,64+64]
 
                 q_li_pe = torch_npu.npu_rotary_mul(
                     q_li_pe.unsqueeze(2),
                     cos,
                     sin,
                 ).squeeze(2)
 
-                q_li = torch.cat([q_li_pe, q_li_nope], dim=-1) # [b*s,64,128]
-                
+                q_li = torch.cat([q_li_pe, q_li_nope], dim=-1)  # [b*s,64,128]
 
         # =========================
         # step3: run lightning indexer
@@ -1043,9 +1043,9 @@ def indexer_select_post_process(
                     query=q_li,
                     key=kv_cache[2],
                     weights=weights,
-                    actual_seq_lengths_query=actual_seq_lengths_query[:attn_metadata.num_actual_seqs],
-                    actual_seq_lengths_key=actual_seq_lengths_key[:attn_metadata.num_actual_seqs],
-                    block_table=attn_metadata.block_table[:attn_metadata.num_actual_seqs],
+                    actual_seq_lengths_query=actual_seq_lengths_query[: attn_metadata.num_actual_seqs],
+                    actual_seq_lengths_key=actual_seq_lengths_key[: attn_metadata.num_actual_seqs],
+                    block_table=attn_metadata.block_table[: attn_metadata.num_actual_seqs],
                     layout_query="TND",
                     layout_key="PA_BSND",
                     sparse_count=sparse_count,
@@ -1289,7 +1289,7 @@ def forward(
 
             k_li = self._get_full_kv(k_li, attn_metadata)
 
-        if kv_cache is not None and  (not attn_metadata.skip or attn_metadata.non_skip_num_actual_tokens > 0):
+        if kv_cache is not None and (not attn_metadata.skip or attn_metadata.non_skip_num_actual_tokens > 0):
             if self.is_kv_producer:
                 attn_metadata.reshape_cache_event = torch.npu.Event()
             torch_npu.npu_scatter_nd_update_(
diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py
@@ -1,8 +1,8 @@
 from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import Any
-import numpy as np
 
+import numpy as np
 import torch
 import torch.nn.functional as F
 from vllm.config import VllmConfig, get_current_vllm_config
@@ -121,6 +121,7 @@ class AscendPrefillContextParallelMetadata:
     # the number of tokens padded in linear-attn per rank
     pcp_padded_tokens_fla: int = 0
 
+
 @dataclass
 class AscendLightningIndexerMetadata:
     li_reorder_indices: torch.Tensor = None
@@ -130,6 +131,7 @@ class AscendLightningIndexerMetadata:
     top_k_indices_of_skipped_queries: torch.Tensor = None
     num_of_non_skip_tokens: int = 0
 
+
 @dataclass
 class AscendCommonAttentionMetadata(CommonAttentionMetadata):
     """
@@ -342,11 +344,13 @@ def enabling_mlapo(vllm_config: VllmConfig) -> bool:
     )
     return bool(envs.VLLM_ASCEND_ENABLE_MLAPO and is_decode_instance)
 
+
 def to_numpy(x):
     if isinstance(x, torch.Tensor):
         return x.cpu().numpy()
     return x
 
+
 def get_sfa_skip_indices(num_comptuted_tokens, query_lens):
     num_comptuted_tokens = to_numpy(num_comptuted_tokens)
     query_lens = to_numpy(query_lens)
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
@@ -1255,4 +1255,4 @@ def enable_lightning_indexer_skip() -> bool:
     has_indexer_topk = hasattr(vllm_config.model_config, "hf_text_config") and hasattr(
         vllm_config.model_config.hf_text_config, "index_topk"
     )
-    return bool(has_indexer_topk and vllm_config.additional_config.get("enable_lightning_indexer_skip", False))
+    return bool(has_indexer_topk and vllm_config.additional_config.get("enable_lightning_indexer_skip", False))
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
@@ -94,8 +94,6 @@
 from vllm_ascend.attention.utils import (
     AscendCommonAttentionMetadata,
     AscendLightningIndexerMetadata,
-    get_index_of_skipped_queries_numpy,
-    get_sfa_skip_indices,
     hidden_states_reorder,
     maybe_pad_and_reorder_inputs,
     using_paged_attention,
@@ -127,6 +125,7 @@
 from vllm_ascend.utils import (
     calc_split_factor,
     check_gdn_layer,
+    enable_lightning_indexer_skip,
     enable_sp,
     enable_sp_by_pass,
     global_stream,
@@ -135,7 +134,6 @@
     lmhead_tp_enable,
     set_weight_prefetch_method,
     vllm_version_is,
-    enable_lightning_indexer_skip
 )
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager

Original file line number	Diff line number	Diff line change
`@@ -1255,4 +1255,4 @@ def enable_lightning_indexer_skip() -> bool:`
`1255`	`1255`	`has_indexer_topk = hasattr(vllm_config.model_config, "hf_text_config") and hasattr(`
`1256`	`1256`	`vllm_config.model_config.hf_text_config, "index_topk"`
`1257`	`1257`	`)`
`1258`		`- return bool(has_indexer_topk and vllm_config.additional_config.get("enable_lightning_indexer_skip", False))`
	`1258`	`+ return bool(has_indexer_topk and vllm_config.additional_config.get("enable_lightning_indexer_skip", False))`