
Commit c3120ba

remove enable_force_load_balance

Signed-off-by: wangxiyuan <[email protected]>

Parent: c32f2c8

3 files changed: +2 additions, -12 deletions

vllm_ascend/ops/common_fused_moe.py

Lines changed: 0 additions & 4 deletions
@@ -216,9 +216,6 @@ def forward_impl(self, hidden_states: torch.Tensor,
             router_logits=router_logits,
             replace_allreduce=forward_context.sp_enabled)

-        # Load balancing for token distribution among experts in dummy_run
-        enable_force_load_balance = forward_context.in_profile_run
-
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
@@ -240,7 +237,6 @@ def forward_impl(self, hidden_states: torch.Tensor,
             expert_load_view=self.expert_load_view,
             logical_to_physical_map=self.logical_to_physical_map,
             logical_replica_count=self.logical_replica_count,
-            enable_force_load_balance=enable_force_load_balance,
         )
         if isinstance(final_hidden_states, tuple):
             final_hidden_states, group_list_type, expert_tokens = final_hidden_states
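For context, a minimal sketch of the call-site change in forward_impl, assuming a simplified stand-in for vLLM's forward context (FakeForwardContext and both helper functions below are illustrative, not vLLM-Ascend APIs): before this commit the flag just mirrored forward_context.in_profile_run and was threaded into self.quant_method.apply(); after it, the keyword is no longer computed or passed.

```python
from dataclasses import dataclass


# Illustrative stand-in for the forward context; only the fields referenced
# in the hunk above are modeled.
@dataclass
class FakeForwardContext:
    in_profile_run: bool = False
    sp_enabled: bool = False


def extra_apply_kwargs_before(ctx: FakeForwardContext) -> dict:
    # Pre-commit behavior: derive the flag from the profile-run state and
    # forward it to quant_method.apply() with the other MoE arguments.
    enable_force_load_balance = ctx.in_profile_run
    return {"enable_force_load_balance": enable_force_load_balance}


def extra_apply_kwargs_after(ctx: FakeForwardContext) -> dict:
    # Post-commit behavior: the keyword is gone; nothing extra is passed.
    return {}


print(extra_apply_kwargs_before(FakeForwardContext(in_profile_run=True)))
# {'enable_force_load_balance': True}
```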

vllm_ascend/patch/worker/patch_common/patch_attention_selector.py

Lines changed: 2 additions & 2 deletions
@@ -73,8 +73,8 @@ def _cached_get_attn_backend(
     # use the placeholder NO_ATTENTION
     if is_attention_free:
         from vllm.attention.backends.placeholder_attn import \
-            PlaceholderAttentionBackend
-        return PlaceholderAttentionBackend
+            PlaceholderAttentionBackend  # type: ignore[import-untyped]
+        return PlaceholderAttentionBackend  # type: ignore[import-untyped]

     # Check whether a particular choice of backend was
     # previously forced.
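The two added comments above are per-line mypy suppressions. As a hedged aside (the module name below is a stand-in, not a real dependency), this is the usual pattern for silencing mypy's import-untyped error, which fires when an installed package ships without type stubs or a py.typed marker; the import is guarded so the snippet stays runnable without that package.

```python
try:
    # The comment silences only the "import-untyped" diagnostic on this line;
    # the rest of the file is still fully type-checked.
    from legacy_untyped_pkg import LegacyBackend  # type: ignore[import-untyped]
except ImportError:
    LegacyBackend = None  # fallback: the stand-in package does not exist
```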

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 0 additions & 6 deletions
@@ -217,12 +217,6 @@ def apply(
             e_score_correction_bias=e_score_correction_bias,
             global_num_experts=global_num_experts)

-        # this is a naive implementation for experts load balance so as
-        # to avoid accumulating too much tokens on a single rank.
-        # currently it is only activated when doing profile runs.
-        if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
-
         if self.use_aclgraph:
             moe_comm_method = get_forward_context().moe_comm_method
             return moe_comm_method.fused_experts(
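Since the deleted branch is the heart of this commit, here is a minimal, self-contained sketch of what it did, based only on the removed lines above (the standalone helper and toy tensors are illustrative, not vLLM-Ascend API): when the flag was set, previously only during profile runs, the routed top-k expert IDs were replaced with uniform random IDs so tokens spread roughly evenly across experts and no single rank accumulated a disproportionate share while memory was being profiled.

```python
import torch


def force_balanced_topk_ids(topk_ids: torch.Tensor,
                            global_num_experts: int,
                            enable_force_load_balance: bool) -> torch.Tensor:
    """Illustrative helper mirroring the branch removed from apply()."""
    if enable_force_load_balance:
        # Same shape/dtype/device as topk_ids, values drawn uniformly from
        # [0, global_num_experts), ignoring the router's actual choices.
        return torch.randint_like(topk_ids, 0, global_num_experts)
    return topk_ids


# Toy usage: 4 tokens, top-2 routing, 8 global experts. A skewed router
# output (everything on experts 0 and 1) becomes a roughly uniform assignment.
topk_ids = torch.tensor([[0, 1], [0, 1], [0, 1], [0, 1]])
balanced = force_balanced_topk_ids(topk_ids, global_num_experts=8,
                                   enable_force_load_balance=True)
assert balanced.shape == topk_ids.shape
```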
