diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index 582e4957e6b..d013be34db2 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -69,9 +69,7 @@ def set_ascend_forward_context( from vllm_ascend.ops.fused_moe.moe_comm_method import get_moe_comm_method - max_num_tokens = int(num_tokens_across_dp.max().item()) if num_tokens_across_dp is not None else num_tokens - moe_comm_type = select_moe_comm_method(max_num_tokens, vllm_config, is_draft_model) - + moe_comm_type = select_moe_comm_method(num_tokens, vllm_config, is_draft_model) forward_context.moe_comm_type = moe_comm_type forward_context.moe_comm_method = get_moe_comm_method(moe_comm_type) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 677e9925a82..89af64bc731 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1235,7 +1235,6 @@ def execute_model( num_scheduled_tokens_np=num_scheduled_tokens_np, max_num_scheduled_tokens=max_num_scheduled_tokens, use_cascade_attn=cascade_attn_prefix_lens is not None, - force_eager=self.model_config.enforce_eager, num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs), ) @@ -1854,7 +1853,6 @@ def _sync_batch_across_dp( self, num_tokens_padded: int | None = None, cudagraph_mode: int = 0, - allow_dp_padding: bool = False, ) -> tuple[bool, torch.Tensor | None, int]: """ Coordinates amongst all DP ranks to determine if and how the full batch @@ -1898,16 +1896,11 @@ def _sync_batch_across_dp( num_tokens_across_dp = tensor[0, :] max_num_tokens = int(num_tokens_across_dp.max().item()) - - if allow_dp_padding: - num_tokens_after_padding = torch.tensor( - [max_num_tokens] * len(num_tokens_across_dp), - device="cpu", - dtype=torch.int32, - ) - else: - num_tokens_after_padding = num_tokens_across_dp.cpu() - + num_tokens_after_padding = torch.tensor( + [max_num_tokens] * len(num_tokens_across_dp), + device="cpu", + dtype=torch.int32, + ) # Synchronize cudagraph_mode across ranks (take min) synced_cudagraph_mode = _post_process_cudagraph_mode(tensor) return False, num_tokens_after_padding, synced_cudagraph_mode @@ -1976,7 +1969,6 @@ def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None): _, num_tokens_across_dp, synced_cudagraph_mode = self._sync_batch_across_dp( num_tokens_padded=num_tokens_padded, cudagraph_mode=cudagraph_mode.value, - allow_dp_padding=cudagraph_mode != CUDAGraphMode.NONE, ) # Extract DP padding if there is any