@@ -528,17 +528,15 @@ def __init__(
                 bias=False,
                 quant_config=quant_config,
                 prefix=f"{prefix}.o_proj",
-                return_bias=False
-            )
+                return_bias=False)
         else:
             self.o_proj = TorchairDeepseekV2RowParallelLinear(
                 self.num_heads * self.v_head_dim,
                 self.hidden_size,
                 bias=False,
                 quant_config=quant_config,
                 prefix=f"{prefix}.o_proj",
-                return_bias=False
-            )
+                return_bias=False)
 
         if rope_scaling:
             rope_scaling["rope_type"] = 'deepseek_yarn'
@@ -738,10 +736,10 @@ def __init__(
             return_bias=False,
         )
         if (config.n_routed_experts is not None
-            and self.debug_layer_idx >= config.first_k_dense_replace
-            and self.debug_layer_idx % config.moe_layer_freq == 0
-            and (ascend_config.multistream_overlap_shared_expert
-                 or self.enable_shared_expert_dp)):
+                and self.debug_layer_idx >= config.first_k_dense_replace
+                and self.debug_layer_idx % config.moe_layer_freq == 0
+                and (ascend_config.multistream_overlap_shared_expert
+                     or self.enable_shared_expert_dp)):
             self.o_proj = TorchairDeepseekV2RowParallelLinearReplaceAllreduce(
                 self.num_heads * self.v_head_dim,
                 self.hidden_size,
@@ -827,8 +825,10 @@ def forward(
                 attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
         forward_context = get_forward_context()
         if not self.torchair_graph_enabled:
-            if forward_context.attn_metadata is not None and isinstance(forward_context.attn_metadata, dict):
-                attn_metadata = next(iter(forward_context.attn_metadata.values()), None)
+            if forward_context.attn_metadata is not None and isinstance(
+                    forward_context.attn_metadata, dict):
+                attn_metadata = next(
+                    iter(forward_context.attn_metadata.values()), None)
         else:
             attn_metadata = forward_context.attn_metadata
         if kv_cache is None:
@@ -843,7 +843,9 @@ def forward(
         # need_gather_q_kv = True
         if not self.enable_shared_expert_dp or self.debug_layer_idx != self.first_k_dense_replace:
             output_shape = hidden_states.shape
-        if self.enable_shared_expert_dp and (self.debug_layer_idx == self.first_k_dense_replace or self.debug_layer_idx == self.layers):
+        if self.enable_shared_expert_dp and (
+                self.debug_layer_idx == self.first_k_dense_replace
+                or self.debug_layer_idx == self.layers):
             rows = num_tokens // self.tp_size
             if num_tokens % self.tp_size:
                 rows += 1