
Commit c3120ba

remove enable_force_load_balance

Signed-off-by: wangxiyuan <[email protected]>

Parent: c32f2c8

3 files changed: +2 additions, -12 deletions

vllm_ascend/ops/common_fused_moe.py

Lines changed: 0 additions & 4 deletions
@@ -216,9 +216,6 @@ def forward_impl(self, hidden_states: torch.Tensor,
             router_logits=router_logits,
             replace_allreduce=forward_context.sp_enabled)

-        # Load balancing for token distribution among experts in dummy_run
-        enable_force_load_balance = forward_context.in_profile_run
-
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
@@ -240,7 +237,6 @@ def forward_impl(self, hidden_states: torch.Tensor,
             expert_load_view=self.expert_load_view,
             logical_to_physical_map=self.logical_to_physical_map,
             logical_replica_count=self.logical_replica_count,
-            enable_force_load_balance=enable_force_load_balance,
         )
         if isinstance(final_hidden_states, tuple):
             final_hidden_states, group_list_type, expert_tokens = final_hidden_states
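For context, a minimal sketch of the call-site change in forward_impl, assuming a simplified stand-in for vLLM's forward context (FakeForwardContext and both helper functions below are illustrative, not vLLM-Ascend APIs): before this commit the flag just mirrored forward_context.in_profile_run and was threaded into self.quant_method.apply(); after it, the keyword is no longer computed or passed.

```python
from dataclasses import dataclass


# Illustrative stand-in for the forward context; only the fields referenced
# in the hunk above are modeled.
@dataclass
class FakeForwardContext:
    in_profile_run: bool = False
    sp_enabled: bool = False


def extra_apply_kwargs_before(ctx: FakeForwardContext) -> dict:
    # Pre-commit behavior: derive the flag from the profile-run state and
    # forward it to quant_method.apply() with the other MoE arguments.
    enable_force_load_balance = ctx.in_profile_run
    return {"enable_force_load_balance": enable_force_load_balance}


def extra_apply_kwargs_after(ctx: FakeForwardContext) -> dict:
    # Post-commit behavior: the keyword is gone; nothing extra is passed.
    return {}


print(extra_apply_kwargs_before(FakeForwardContext(in_profile_run=True)))
# {'enable_force_load_balance': True}
```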

vllm_ascend/patch/worker/patch_common/patch_attention_selector.py

Lines changed: 2 additions & 2 deletions
@@ -73,8 +73,8 @@ def _cached_get_attn_backend(
     # use the placeholder NO_ATTENTION
     if is_attention_free:
         from vllm.attention.backends.placeholder_attn import \
-            PlaceholderAttentionBackend
-        return PlaceholderAttentionBackend
+            PlaceholderAttentionBackend  # type: ignore[import-untyped]
+        return PlaceholderAttentionBackend  # type: ignore[import-untyped]

     # Check whether a particular choice of backend was
     # previously forced.
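The two added comments above are per-line mypy suppressions. As a hedged aside (the module name below is a stand-in, not a real dependency), this is the usual pattern for silencing mypy's import-untyped error, which fires when an installed package ships without type stubs or a py.typed marker; the import is guarded so the snippet stays runnable without that package.

```python
try:
    # The comment silences only the "import-untyped" diagnostic on this line;
    # the rest of the file is still fully type-checked.
    from legacy_untyped_pkg import LegacyBackend  # type: ignore[import-untyped]
except ImportError:
    LegacyBackend = None  # fallback: the stand-in package does not exist
```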

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 0 additions & 6 deletions
@@ -217,12 +217,6 @@ def apply(
             e_score_correction_bias=e_score_correction_bias,
             global_num_experts=global_num_experts)

-        # this is a naive implementation for experts load balance so as
-        # to avoid accumulating too much tokens on a single rank.
-        # currently it is only activated when doing profile runs.
-        if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
-
         if self.use_aclgraph:
             moe_comm_method = get_forward_context().moe_comm_method
             return moe_comm_method.fused_experts(
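Since the deleted branch is the heart of this commit, here is a minimal, self-contained sketch of what it did, based only on the removed lines above (the standalone helper and toy tensors are illustrative, not vLLM-Ascend API): when the flag was set, previously only during profile runs, the routed top-k expert IDs were replaced with uniform random IDs so tokens spread roughly evenly across experts and no single rank accumulated a disproportionate share while memory was being profiled.

```python
import torch


def force_balanced_topk_ids(topk_ids: torch.Tensor,
                            global_num_experts: int,
                            enable_force_load_balance: bool) -> torch.Tensor:
    """Illustrative helper mirroring the branch removed from apply()."""
    if enable_force_load_balance:
        # Same shape/dtype/device as topk_ids, values drawn uniformly from
        # [0, global_num_experts), ignoring the router's actual choices.
        return torch.randint_like(topk_ids, 0, global_num_experts)
    return topk_ids


# Toy usage: 4 tokens, top-2 routing, 8 global experts. A skewed router
# output (everything on experts 0 and 1) becomes a roughly uniform assignment.
topk_ids = torch.tensor([[0, 1], [0, 1], [0, 1], [0, 1]])
balanced = force_balanced_topk_ids(topk_ids, global_num_experts=8,
                                   enable_force_load_balance=True)
assert balanced.shape == topk_ids.shape
```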
