Skip to content
Draft

Pr/3234 #3242

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ jobs:
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py

pytest -sv tests/e2e/singlecard/ops/
# pytest -sv tests/e2e/singlecard/ops/

e2e-2-cards:
name: multicard
Expand Down
6 changes: 4 additions & 2 deletions tests/ut/ops/test_fused_moe_prepare_and_finalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
FusedMoEPrepareAndFinalizeWithAll2All,
FusedMoEPrepareAndFinalizeWithAllGather, FusedMoEPrepareAndFinalizeWithMC2,
FusedMoEPrepareAndFinalizeWithNaiveMulticast)
from vllm_ascend.utils import vllm_version_is


class TestFusedMoEPrepareAndFinalize(unittest.TestCase):
Expand Down Expand Up @@ -230,8 +231,9 @@ def test_naive_multicast_prepare_finalize(self, mock_get_forward_context,
mock_get_dp_group):
# Mock forward context with DP metadata
mock_context = MagicMock()
mock_context.dp_metadata.cu_tokens_across_dp_cpu = torch.tensor(
[2, 5, 7])
if vllm_version_is("0.10.2"):
mock_context.dp_metadata.cu_tokens_across_dp_cpu = torch.tensor(
[2, 5, 7])
mock_get_forward_context.return_value = mock_context

# Setup DP group mock
Expand Down
20 changes: 11 additions & 9 deletions tests/ut/ops/test_fused_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
AscendUnquantizedFusedMoEMethod)
from vllm_ascend.ops.moe.experts_selector import select_experts
from vllm_ascend.ops.moe.moe_mlp import cumsum_group_list, unified_apply_mlp
from vllm_ascend.utils import AscendSocVersion, adapt_patch
from vllm_ascend.utils import AscendSocVersion, adapt_patch, vllm_version_is

adapt_patch(True)

Expand Down Expand Up @@ -93,14 +93,16 @@ def mock_finalize(hidden_states, **kwargs):

mock_moe_comm_method.finalize.side_effect = mock_finalize

mock_forward_context_obj = MagicMock(
moe_comm_method=mock_moe_comm_method,
moe_comm_type=MoECommType.MC2,
max_tokens_across_dp=10,
dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10]),
mc2_mask=torch.zeros(16, dtype=torch.bool),
padded_num_tokens=16,
with_quant=False)
if vllm_version_is("0.10.2"):
dp_metadata = MagicMock(cu_tokens_across_dp_cpu=[5, 10])
mock_forward_context_obj = MagicMock(moe_comm_method=mock_moe_comm_method,
moe_comm_type=MoECommType.MC2,
max_tokens_across_dp=10,
dp_metadata=dp_metadata,
mc2_mask=torch.zeros(
16, dtype=torch.bool),
padded_num_tokens=16,
with_quant=False)

with patch('torch.distributed.get_rank', return_value=0), \
patch('torch.distributed.get_world_size', return_value=4), \
Expand Down
8 changes: 6 additions & 2 deletions tests/ut/torchair/ops/test_torchair_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
from vllm_ascend.torchair.ops.torchair_fused_moe import (
TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
from vllm_ascend.utils import AscendSocVersion, adapt_patch # noqa E402
from vllm_ascend.utils import adapt_patch # noqa E402
from vllm_ascend.utils import AscendSocVersion, vllm_version_is

adapt_patch(True)

Expand All @@ -53,6 +54,9 @@ def mock_dp_and_tp_group(mocker):
@pytest.fixture
def mock_dist_env(mocker: MockerFixture):
# init dist env patch
if vllm_version_is("0.10.2"):
dp_metadata = MagicMock(cu_tokens_across_dp_cpu=[5, 10])


with patch('torch.distributed.get_rank', return_value=0), \
patch('torch.distributed.get_world_size', return_value=4), \
Expand Down Expand Up @@ -80,7 +84,7 @@ def mock_dist_env(mocker: MockerFixture):
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context',
return_value=MagicMock(
max_tokens_across_dp=10,
dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10])
dp_metadata=dp_metadata,
)), \
patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_current_vllm_config',
return_value=MagicMock(
Expand Down
12 changes: 9 additions & 3 deletions vllm_ascend/models/qwen3_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)

from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.utils import vllm_version_is


class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
Expand Down Expand Up @@ -169,9 +170,14 @@ def __init__(
quant_config=quant_config,
prefix=f"{prefix}.mlp")
else:
self.mlp = Qwen3MoeSparseMoeBlock(config=config,
quant_config=quant_config,
prefix=f"{prefix}.mlp")
if vllm_version_is("0.10.2"):
self.mlp = Qwen3MoeSparseMoeBlock(
config=config,
quant_config=quant_config,
prefix=f"{prefix}.mlp")
else:
self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
prefix=f"{prefix}.mlp")
else:
self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
Expand Down
10 changes: 8 additions & 2 deletions vllm_ascend/ops/moe/fused_moe_prepare_and_finalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.fused_moe import FusedMoEConfig

from vllm_ascend.utils import vllm_version_is


class FusedMoEPrepareAndFinalize(ABC):
"""
Expand Down Expand Up @@ -414,8 +416,12 @@ def prepare(self,
self.enable_shared_expert_dp = enable_shared_expert_dp

if self.moe_config.dp_size > 1:
self.cu_tokens_across_dp_cpu = get_forward_context(
).dp_metadata.cu_tokens_across_dp_cpu
if vllm_version_is("0.10.2"):
self.cu_tokens_across_dp_cpu = get_forward_context(
).dp_metadata.cu_tokens_across_dp_cpu
else:
self.cu_tokens_across_dp_cpu = get_forward_context(
).dp_metadata.cu_tokens_across_sp(1)
hidden_states = self._naive_multicast(hidden_states,
self.cu_tokens_across_dp_cpu)
if rm_router_logits:
Expand Down
12 changes: 9 additions & 3 deletions vllm_ascend/torchair/models/qwen3_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
from vllm_ascend.ops.fused_moe import AscendFusedMoE
from vllm_ascend.torchair.ops.sequence_parallel import (MetadataForPadding,
init_metadata_for_sp)
from vllm_ascend.utils import vllm_version_is


class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
Expand Down Expand Up @@ -311,9 +312,14 @@ def __init__(
quant_config=quant_config,
prefix=f"{prefix}.mlp")
else:
self.mlp = Qwen3MoeSparseMoeBlock(config=config,
quant_config=quant_config,
prefix=f"{prefix}.mlp")
if vllm_version_is("0.10.2"):
self.mlp = Qwen3MoeSparseMoeBlock(
config=config,
quant_config=quant_config,
prefix=f"{prefix}.mlp")
else:
self.mlp = Qwen3MoeSparseMoeBlock(vllm_config=vllm_config,
prefix=f"{prefix}.mlp")
else:
self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
Expand Down
8 changes: 6 additions & 2 deletions vllm_ascend/torchair/ops/torchair_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1242,8 +1242,12 @@ def forward(self,
router_logits = get_dp_group().all_gather(router_logits, 0)

elif fused_moe_state == FusedMoEState.NaiveMulticast:
cu_tokens_across_dp_cpu = get_forward_context(
).dp_metadata.cu_tokens_across_dp_cpu
if vllm_version_is("0.10.2"):
cu_tokens_across_dp_cpu = get_forward_context(
).dp_metadata.cu_tokens_across_dp_cpu
else:
cu_tokens_across_dp_cpu = get_forward_context(
).dp_metadata.cu_tokens_across_sp(1)
hidden_states = self.naive_multicast(hidden_states,
cu_tokens_across_dp_cpu)
if self.rm_router_logits:
Expand Down
Loading