
Commit 2ecc0d6

tjtanaavllm authored and committed
ck moe 2 stage: cherry pick 612c2ed
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
1 parent 021ebeb commit 2ecc0d6

7 files changed, +174 -238 lines changed


tests/kernels/moe/test_moe.py

Lines changed: 7 additions & 0 deletions
@@ -222,9 +222,16 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
     """Make sure our Mixtral MoE implementation agrees with the one from
     huggingface."""

+    # clear the cache before every test
+    from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+        is_rocm_aiter_moe_enabled)
+    is_rocm_aiter_moe_enabled.cache_clear()
     if use_rocm_aiter:
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

+        if dtype == torch.float32:
+            pytest.skip("AITER ROCm test skip for float32")
+
     # Instantiate our and huggingface's MoE blocks
     config = MixtralConfig()
     hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
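
The added lines clear the functools.cache wrapper on is_rocm_aiter_moe_enabled before each parametrization, so a value memoized under an earlier environment does not leak into the next test. A minimal sketch of why the clear is needed (the use_aiter helper below is illustrative, not vLLM code):

# Illustrative sketch: a @cache-decorated env check memoizes its first result,
# so monkeypatched env changes are invisible until cache_clear() is called.
import os
from functools import cache


@cache
def use_aiter() -> bool:  # stand-in for is_rocm_aiter_moe_enabled
    return os.environ.get("VLLM_ROCM_USE_AITER", "0") == "1"


os.environ["VLLM_ROCM_USE_AITER"] = "0"
assert use_aiter() is False      # first call: memoized as False

os.environ["VLLM_ROCM_USE_AITER"] = "1"
assert use_aiter() is False      # stale cached value is still returned

use_aiter.cache_clear()          # what the test does before each run
assert use_aiter() is True       # re-evaluated against the new env value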

tests/model_executor/test_enabled_custom_ops.py

Lines changed: 2 additions & 22 deletions
@@ -8,10 +8,8 @@
 from vllm.model_executor.layers.activation import (GeluAndMul,
                                                    ReLUSquaredActivation,
                                                    SiluAndMul)
-from vllm.model_executor.layers.fused_moe.fused_moe import (
-    dispatch_fused_experts_func, dispatch_topk_func,
-    torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts,
-    vllm_topk_softmax)
+from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func,
+                                                            vllm_topk_softmax)
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     is_rocm_aiter_moe_enabled)
 from vllm.model_executor.layers.layernorm import (
@@ -142,24 +140,6 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
     assert topk_func == vllm_topk_softmax


-@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
-@pytest.mark.parametrize("inplace", [True, False])
-def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool,
-                                monkeypatch):
-
-    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
-    is_rocm_aiter_moe_enabled.cache_clear()
-    fused_experts_func = dispatch_fused_experts_func(inplace)
-    if current_platform.is_rocm() and int(use_rocm_aiter):
-        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            rocm_aiter_fused_experts)
-        assert fused_experts_func == rocm_aiter_fused_experts
-    elif inplace:
-        assert fused_experts_func == torch_vllm_inplace_fused_experts
-    else:
-        assert fused_experts_func == torch_vllm_outplace_fused_experts
-
-
 @pytest.mark.parametrize("add_residual", [True, False])
 @pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
 @pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 0 additions & 3 deletions
@@ -1098,9 +1098,6 @@ def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor:


 def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]:
-    if is_rocm_aiter_moe_enabled():
-        from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
-        return rocm_aiter_fused_experts
     if inplace:
         return torch_vllm_inplace_fused_experts
     return torch_vllm_outplace_fused_experts
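
With the ROCm branch removed, dispatch_fused_experts_func is a pure in-place/out-of-place switch; the AITER kernel is now chosen by the MoE layer itself. A hedged sketch of the behaviour the deleted test used to assert for the non-AITER path:

# Sketch (non-AITER path only): the dispatcher now just picks between the
# in-place and out-of-place Triton implementations.
from vllm.model_executor.layers.fused_moe.fused_moe import (
    dispatch_fused_experts_func, torch_vllm_inplace_fused_experts,
    torch_vllm_outplace_fused_experts)

assert dispatch_fused_experts_func(inplace=True) == torch_vllm_inplace_fused_experts
assert dispatch_fused_experts_func(inplace=False) == torch_vllm_outplace_fused_experts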

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 15 additions & 14 deletions
@@ -86,6 +86,16 @@ def apply(
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""

+    def __init__(self):
+        super().__init__()
+
+        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
+        if self.rocm_aiter_moe_enabled:
+            from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
+            self.rocm_aiter_fused_experts = rocm_aiter_fused_experts
+        else:
+            self.rocm_aiter_fused_experts = None  # type: ignore
+
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
@@ -128,18 +138,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
         # Lazy import to avoid importing triton.
         from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            is_rocm_aiter_2stage_moe_enabled, is_rocm_aiter_moe_enabled,
             shuffle_weights)
-        self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
-        self.rocm_aiter_2stage_moe_enabled = is_rocm_aiter_2stage_moe_enabled()
-        if self.rocm_aiter_moe_enabled:
-            # reshaping weights is required for aiter moe kernel.
-            layout = (32, 32) if self.rocm_aiter_2stage_moe_enabled else (16,
-                                                                          16)

+        if self.rocm_aiter_moe_enabled:
+            # use 2stage ck moe layout
             shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data,
                                                         layer.w2_weight.data,
-                                                        layout=layout)
+                                                        layout=(32, 32))

             layer.w13_weight.data = shuffled_w13
             layer.w2_weight.data = shuffled_w2
@@ -221,18 +226,14 @@ def forward_cuda(
             e_score_correction_bias=e_score_correction_bias)

         if self.rocm_aiter_moe_enabled:
-            return rocm_aiter_fused_experts(
+            return self.rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
-                inplace=True,
                 activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                use_ck_moe_2stages=self.rocm_aiter_2stage_moe_enabled)
+                apply_router_weight_on_input=apply_router_weight_on_input)

         return fused_experts(
             hidden_states=x,
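
The AITER decision now happens once, in UnquantizedFusedMoEMethod.__init__, and forward_cuda simply calls whichever kernel was bound there. A minimal, self-contained sketch of that construction-time binding pattern (class and helper names below are illustrative, not the vLLM implementation):

# Illustrative pattern: resolve the expert kernel once at construction time
# instead of re-dispatching on every forward call.
from typing import Callable

import torch


def _aiter_experts(x: torch.Tensor) -> torch.Tensor:   # stand-in for rocm_aiter_fused_experts
    return x


def _triton_experts(x: torch.Tensor) -> torch.Tensor:  # stand-in for fused_experts
    return x


class MoEMethodSketch:

    def __init__(self, aiter_enabled: bool) -> None:
        self.rocm_aiter_moe_enabled = aiter_enabled
        # Bind the callable once; forward() never re-checks the environment.
        self._experts: Callable[[torch.Tensor], torch.Tensor] = (
            _aiter_experts if aiter_enabled else _triton_experts)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._experts(x)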

vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py

Lines changed: 29 additions & 82 deletions
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 from functools import cache
-from typing import List, Optional, Tuple
+from typing import Optional

 import torch

@@ -16,13 +16,6 @@ def is_rocm_aiter_moe_enabled() -> bool:
         and envs.VLLM_ROCM_USE_AITER


-def is_rocm_aiter_2stage_moe_enabled() -> bool:
-    return current_platform.is_rocm() \
-        and envs.VLLM_ROCM_USE_AITER_2STAGE_MOE \
-        and envs.VLLM_ROCM_USE_AITER_MOE \
-        and envs.VLLM_ROCM_USE_AITER
-
-
 def rocm_aiter_asm_moe_tkw1_impl(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
@@ -76,23 +69,6 @@ def rocm_aiter_asm_moe_tkw1_fake(
     return torch.empty_like(hidden_states)


-def rocm_aiter_ck_moe_impl(hidden_states: torch.Tensor, w1: torch.Tensor,
-                           w2: torch.Tensor, topk_weights: torch.Tensor,
-                           topk_ids: torch.Tensor) -> torch.Tensor:
-    from aiter import ck_moe
-    return ck_moe(hidden_states=hidden_states,
-                  w1=w1,
-                  w2=w2,
-                  topk_weights=topk_weights,
-                  topk_ids=topk_ids)
-
-
-def rocm_aiter_ck_moe_fake(hidden_states: torch.Tensor, w1: torch.Tensor,
-                           w2: torch.Tensor, topk_weights: torch.Tensor,
-                           topk_ids: torch.Tensor) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
 def rocm_aiter_fmoe_fp8_blockscale_g1u1_impl(
     topk_ids: torch.Tensor,
     topk_weights: torch.Tensor,
@@ -215,10 +191,9 @@ def rocm_aiter_ck_moe_2stages_impl(
     fc2_scale: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    block_size: Optional[List[int]] = None,
+    block_size: Optional[list[int]] = None,
     expert_mask: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     return ck_moe_2stages(a1=hidden_states,
                           w1=w1,
@@ -243,7 +218,7 @@ def rocm_aiter_ck_moe_2stages_fake(
     fc2_scale: Optional[torch.Tensor] = None,
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
-    block_size: Optional[List[int]] = None,
+    block_size: Optional[list[int]] = None,
     expert_mask: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)
@@ -308,14 +283,6 @@ def rocm_aiter_biased_grouped_topk_fake(
     dispatch_key=current_platform.dispatch_key,
 )

-direct_register_custom_op(
-    op_name="rocm_aiter_ck_moe",
-    op_func=rocm_aiter_ck_moe_impl,
-    mutates_args=[],
-    fake_impl=rocm_aiter_ck_moe_fake,
-    dispatch_key=current_platform.dispatch_key,
-)
-
 direct_register_custom_op(
     op_name="rocm_aiter_fmoe_fp8_blockscale_g1u1",
     op_func=rocm_aiter_fmoe_fp8_blockscale_g1u1_impl,
@@ -390,31 +357,20 @@ def rocm_aiter_biased_group_topk(


 def rocm_aiter_fused_experts(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    inplace: bool = False,
-    activation: str = "silu",
-    apply_router_weight_on_input: bool = False,
-    use_fp8_w8a8: bool = False,
-    use_int8_w8a8: bool = False,
-    use_int8_w8a16: bool = False,
-    use_int4_w4a16: bool = False,
-    per_channel_quant: bool = False,
-    global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[List[int]] = None,
-    allow_deep_gemm: bool = False,
-    use_ck_moe_2stages: bool = False,
-) -> torch.Tensor:
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        use_fp8_w8a8: bool = False,
+        per_channel_quant: bool = False,
+        w1_scale: Optional[torch.Tensor] = None,
+        w2_scale: Optional[torch.Tensor] = None,
+        a1_scale: Optional[torch.Tensor] = None,
+        a2_scale: Optional[torch.Tensor] = None,
+        block_shape: Optional[list[int]] = None) -> torch.Tensor:

     from vllm.model_executor.layers.quantization.utils.fp8_utils import (
         per_token_group_quant_fp8)
@@ -465,7 +421,7 @@ def rocm_aiter_fused_experts(
             fc2_smooth_scale=None,
             a16=False,
             per_tensor_quant_scale=None,
-            expert_mask=expert_map,
+            expert_mask=None,
             activation_str=activation)

     # w8a8 per-tensor activation per-tensor weight
@@ -475,7 +431,7 @@ def rocm_aiter_fused_experts(

     # - faster static per-tensor-activation static per-tensor-weight
     # fp8 quantization w8a8
-    if use_ck_moe_2stages and a1_scale is not None and a2_scale is not None:
+    if a1_scale is not None and a2_scale is not None:
         return torch.ops.vllm.rocm_aiter_ck_moe_2stages(
             hidden_states=hidden_states,
             w1=w1,
@@ -514,28 +470,19 @@ def rocm_aiter_fused_experts(
         topk_ids = topk_ids.to(torch.int32)
         topk_weights = torch.ones_like(topk_weights, dtype=torch.float32)

-    # faster w16a16
-    if use_ck_moe_2stages:
-        return torch.ops.vllm.rocm_aiter_ck_moe_2stages(
-            hidden_states=hidden_states,
-            w1=w1,
-            w2=w2,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids)
-
-    # w16a16 fallback to rocm_aiter_ck_moe w16a16
-    return torch.ops.vllm.rocm_aiter_ck_moe(hidden_states=hidden_states,
-                                            w1=w1,
-                                            w2=w2,
-                                            topk_weights=topk_weights,
-                                            topk_ids=topk_ids)
+    return torch.ops.vllm.rocm_aiter_ck_moe_2stages(
+        hidden_states=hidden_states,
+        w1=w1,
+        w2=w2,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids)


 def rocm_aiter_topk_softmax(topk_weights: torch.Tensor,
                             topk_indices: torch.Tensor,
                             token_expert_indices: torch.Tensor,
                             gating_output: torch.Tensor,
-                            renormalize: bool) -> Tuple[torch.Tensor, ...]:
+                            renormalize: bool) -> tuple[torch.Tensor, ...]:
     torch.ops.vllm.rocm_aiter_topk_softmax(topk_weights, topk_indices,
                                            token_expert_indices, gating_output,
                                            renormalize)
@@ -560,7 +507,7 @@ def shuffle_weights(*tensors: torch.Tensor,


 def expand_weights(*tensors: torch.Tensor,
-                   expansion_dims: List[int]) -> Tuple[torch.Tensor, ...]:
+                   expansion_dims: list[int]) -> tuple[torch.Tensor, ...]:
     """
     Expands the dimensions of input tensors.

@@ -570,12 +517,12 @@ def expand_weights(*tensors: torch.Tensor,
         corresponding to each tensor.

     Returns:
-        A Tuple of tensors with expanded dimensions.
+        A tuple of tensors with expanded dimensions.
     """

     assert len(tensors) == len(expansion_dims), \
         "Number of tensors must match the number of expansion dimensions."

     return tuple(
         tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1))
-            for tensor, dim in zip(tensors, expansion_dims))
+        for tensor, dim in zip(tensors, expansion_dims))
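
With rocm_aiter_ck_moe removed, both the unquantized and the static per-tensor fp8 paths end in torch.ops.vllm.rocm_aiter_ck_moe_2stages, and the public rocm_aiter_fused_experts signature shrinks to the arguments the remaining kernels use. A hedged usage sketch for the plain w16a16 case (shapes are illustrative; it assumes a ROCm build of vLLM with AITER installed, VLLM_ROCM_USE_AITER=1, and expert weights already shuffled to the (32, 32) layout):

# Hedged sketch, not a reference invocation: calls the trimmed entry point
# with only the arguments kept by this commit.
import torch

from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
    rocm_aiter_fused_experts)

NUM_EXPERTS, HIDDEN, INTER, TOKENS, TOPK = 8, 4096, 14336, 16, 2
hidden_states = torch.randn(TOKENS, HIDDEN, dtype=torch.bfloat16, device="cuda")
w13 = torch.randn(NUM_EXPERTS, 2 * INTER, HIDDEN, dtype=torch.bfloat16, device="cuda")
w2 = torch.randn(NUM_EXPERTS, HIDDEN, INTER, dtype=torch.bfloat16, device="cuda")
topk_weights = torch.rand(TOKENS, TOPK, dtype=torch.float32, device="cuda")
topk_ids = torch.randint(0, NUM_EXPERTS, (TOKENS, TOPK), dtype=torch.int32, device="cuda")

out = rocm_aiter_fused_experts(hidden_states=hidden_states,
                               w1=w13,
                               w2=w2,
                               topk_weights=topk_weights,
                               topk_ids=topk_ids,
                               activation="silu",
                               apply_router_weight_on_input=False)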
