@@ -185,7 +185,8 @@ def __init__(self,
         self.device = device
         scheduler_config = vllm_config.scheduler_config
         self.block_size = vllm_config.cache_config.block_size
-        self.max_blocks = (vllm_config.model_config.max_model_len + self.block_size - 1) // self.block_size
+        self.max_blocks = (vllm_config.model_config.max_model_len +
+                           self.block_size - 1) // self.block_size
         self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
         if self.chunked_prefill_enabled:
             self.chunked_prefill_workspace_size = min(
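A minimal standalone sketch of the ceiling division behind self.max_blocks in the hunk above; the concrete max_model_len and block_size values are made up for illustration, not taken from any real config:

    # Illustrative values only -- not from the patch or a real vLLM config.
    max_model_len = 4096
    block_size = 128

    # (a + b - 1) // b is integer ceiling division: the number of fixed-size
    # KV-cache blocks needed to cover max_model_len tokens.
    max_blocks = (max_model_len + block_size - 1) // block_size
    assert max_blocks == 32

    # A length that is not an exact multiple of block_size still rounds up.
    assert (4097 + block_size - 1) // block_size == 33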
@@ -216,7 +217,10 @@ def __init__(self,
         self.cos_cache = None
         self.sin_cache = None
         self.prefill_attn_mask = torch.triu(
-            torch.ones(512, 512, device=self.device, dtype=self.model_config.dtype),
+            torch.ones(512,
+                       512,
+                       device=self.device,
+                       dtype=self.model_config.dtype),
             1)  # 512: mask only support 512

     def reorder_batch(self, input_batch: "InputBatch",
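For reference, a small self-contained sketch of the mask construction above: torch.triu(..., 1) keeps only the strictly upper triangle, i.e. it marks the future positions a causal prefill mask is meant to hide. The 4x4 size here is just to keep the output readable; the patch itself uses 512x512 on the NPU device:

    import torch

    # Same construction as prefill_attn_mask above, at a small size and on CPU.
    # diagonal=1 zeroes the main diagonal and below, so entry (i, j) is 1 only
    # for j > i -- the positions after the current token.
    size = 4
    mask = torch.triu(torch.ones(size, size, dtype=torch.float16), 1)
    print(mask)
    # tensor([[0., 1., 1., 1.],
    #         [0., 0., 1., 1.],
    #         [0., 0., 0., 1.],
    #         [0., 0., 0., 0.]], dtype=torch.float16)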
@@ -384,13 +388,13 @@ def build(

         num_computed_tokens_cpu = (common_attn_metadata.seq_lens_cpu -
                                    query_seq_lens_cpu)
-
+
         use_torchair_graph = num_token_pad_size != -1
         if use_torchair_graph and self.runner.attn_state in [
-            AscendAttentionState.DecodeOnly,
-            AscendAttentionState.SpecDecoding
-        ]:
-            decode_threshold = self.runner.decode_token_per_req
+                AscendAttentionState.DecodeOnly,
+                AscendAttentionState.SpecDecoding
+        ]:
+            decode_threshold = self.runner.decode_token_per_req
         else:
             # TODO(xyx): remove the if condition after mla supports torch mode speculative decoding
             decode_threshold = 1
@@ -420,16 +424,16 @@ def build(
         prefill_metadata = None
         if num_prefills > 0:
             reqs_start = num_decodes  # prefill_start
-
+
             context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs]
             max_context_len_cpu = context_lens_cpu.max().item()
             num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item()
             prefill_query_start_loc = query_start_loc[
                 reqs_start:] - query_start_loc[reqs_start]
-
+
             tokens_start = num_decode_tokens
             chunked_context_metadata = None
-
+
             if self.chunked_prefill_enabled and max_context_len_cpu > 0:
                 # currently we allocate an equal amount of workspace for each
                 # prefill in the batch, we could probably use a more advanced
@@ -504,7 +508,7 @@ def build(
         decode_metadata = None
         use_torchair_graph = num_token_pad_size != -1
         if num_decodes > 0:
-            actual_seq_lengths_q = query_start_loc[1:num_decodes + 1].tolist()
+            actual_seq_lengths_q = query_start_loc[1:num_decodes + 1].tolist()
             max_seq_lens = seq_lens[:num_decodes].max().item()
             seq_lens = seq_lens[:num_decodes]
             input_positions = input_positions[:num_decode_tokens]
@@ -953,7 +957,10 @@ def _forward_decode(
                              self.qk_rope_head_dim)
         input_layout = "BNSD"

-        if attn_metadata.attn_state in [AscendAttentionState.SpecDecoding, AscendAttentionState.ChunkedPrefill]:
+        if attn_metadata.attn_state in [
+                AscendAttentionState.SpecDecoding,
+                AscendAttentionState.ChunkedPrefill
+        ]:
             assert num_tokens % (1 + self.spec_token_num) == 0
             input_layout = "TND"
             # [bs * q_seq_len, num_heads_per_rank, dim]
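Finally, a tiny sketch of the divisibility check asserted in _forward_decode above, assuming (as the surrounding code suggests) that under speculative decoding each request contributes one regular token plus spec_token_num draft tokens; the concrete numbers below are hypothetical:

    # Hypothetical batch shape, for illustration only.
    spec_token_num = 3                        # draft tokens proposed per request
    num_requests = 5
    tokens_per_request = 1 + spec_token_num   # 1 regular + 3 speculative tokens

    num_tokens = num_requests * tokens_per_request   # 20 flattened tokens
    assert num_tokens % (1 + spec_token_num) == 0    # holds for any num_requests

    # A batch that did not flatten to a whole number of requests would fail:
    assert (num_tokens + 1) % (1 + spec_token_num) != 0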