neuralmagic
diff --git a/‎tests/kernels/moe/test_moe.py
Lines changed: 2 additions & 138 deletions b/‎tests/kernels/moe/test_moe.py
Lines changed: 2 additions & 138 deletions
diff --git a/‎tests/kernels/moe/test_triton_moe_ptpc_fp8.py
Lines changed: 20 additions & 14 deletions b/‎tests/kernels/moe/test_triton_moe_ptpc_fp8.py
Lines changed: 20 additions & 14 deletions
diff --git a/‎tests/kernels/quantization/test_block_fp8.py
Lines changed: 7 additions & 25 deletions b/‎tests/kernels/quantization/test_block_fp8.py
Lines changed: 7 additions & 25 deletions
diff --git a/‎vllm/model_executor/layers/fused_moe/fused_batched_moe.py
Lines changed: 3 additions & 14 deletions b/‎vllm/model_executor/layers/fused_moe/fused_batched_moe.py
Lines changed: 3 additions & 14 deletions
@@ -110,143 +110,6 @@ def test_fused_moe(
                                rtol=0)
 
 
-def torch_dispatch(
-    a: torch.Tensor,
-    topk_ids: torch.Tensor,
-    num_experts: int
-) -> torch.Tensor:
-    assert topk_ids.dim() == 2
-    assert topk_ids.shape[0] == a.shape[0]
-
-    num_tokens = a.shape[0]
-    topk = topk_ids.shape[1]
-
-    tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
-
-    max_num_tokens = tokens_per_expert.max()
-    b_a = torch.zeros((num_experts, max_num_tokens, a.shape[1]),
-                      dtype=a.dtype, device=a.device)
-    #print(f"b_a shape {b_a.shape}")
-
-    token_counts = torch.zeros(num_experts, dtype=torch.int, device=a.device)
-
-    for token in range(num_tokens):
-        for j in range(topk):
-            expert_id = topk_ids[token, j]
-            idx = token_counts[expert_id]
-            b_a[expert_id, idx:idx+1, :] = a[token, :]
-            token_counts[expert_id] = token_counts[expert_id] + 1
-
-    return b_a, tokens_per_expert
-
-
-def torch_combine(b_out, topk_weight, topk_ids):
-    num_tokens, topk = topk_ids.shape
-    num_experts = b_out.shape[0]
-    K = b_out.shape[-1]
-    out = torch.zeros((num_tokens, K), dtype=b_out.dtype, device=b_out.device)
-    expert_counts = torch.zeros(num_experts, dtype=torch.int, device=b_out.device)
-    for token in range(num_tokens):
-        expert_ids = topk_ids[token]
-        for i in range(expert_ids.numel()):
-            expert_id = expert_ids[i]
-            idx = expert_counts[expert_id]
-            out[token, :] = out[token, :] + b_out[expert_id, idx:idx+1, :] * topk_weight[token, i]
-            expert_counts[expert_id] = expert_counts[expert_id] + 1
-
-    return out
-
-
-def torch_batched_moe(a, w1, w2, topk_weight, topk_ids):
-    num_experts = w1.shape[0]
-    b_a, tokens_per_expert = torch_dispatch(a, topk_ids, num_experts)
-    assert b_a.dim() == 3
-    num_tokens, topk = topk_ids.shape
-    _, max_num_tokens, K = b_a.shape
-    assert num_experts == b_a.shape[0] and K == w2.shape[1]
-    out = torch.zeros((num_experts, max_num_tokens, K), dtype=b_a.dtype, device=b_a.device)
-    tmp = torch.empty((max_num_tokens, w1.shape[1] // 2), dtype=b_a.dtype, device=b_a.device)
-    for expert in range(num_experts):
-        num = tokens_per_expert[expert]
-        if num > 0:
-            torch.ops._C.silu_and_mul(tmp[:num], b_a[expert,:num,:] @ w1[expert].transpose(0, 1))
-            out[expert, :num, :] = tmp[:num] @ w2[expert].transpose(0, 1)
-
-    return torch_combine(out, topk_weight, topk_ids)
-
-
-# TODO: same as torch_moe but with fused_topk factored out.
-def torch_moe2(a, w1, w2, topk_weight, topk_ids):
-    M, K = a.shape
-    topk = topk_ids.shape[1]
-    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
-    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
-    num_experts = w1.shape[0]
-    for i in range(num_experts):
-        mask = (topk_ids == i).view(-1)
-        if mask.sum():
-            out[mask] = SiluAndMul()(
-                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
-
-    return (out.view(M, -1, w2.shape[1]) *
-            topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
-
-
-@pytest.mark.parametrize("m", [1, 33, 64, 222]) #, 1024 * 128])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 511, 1024])
-@pytest.mark.parametrize("e", NUM_EXPERTS)
-@pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-def test_fused_moe_batched_experts(
-    m: int,
-    n: int,
-    k: int,
-    e: int,
-    topk: int,
-    dtype: torch.dtype,
-):
-    current_platform.seed_everything(7)
-
-    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
-    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
-    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
-
-    score = torch.randn((m, e), device="cuda", dtype=dtype)
-
-    vllm_config = VllmConfig()
-    with set_current_vllm_config(vllm_config):
-        topk_weight, topk_ids = fused_topk(a, score, topk, False)
-
-        torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
-
-        if True:
-            triton_output = torch_batched_moe(a,
-                                              w1,
-                                              w2,
-                                              topk_weight,
-                                              topk_ids)
-        else:
-            b_a, tokens_per_expert = batch_by_experts(a, topk_ids, e)
-            triton_output = fused_batched_experts(
-                b_a,
-                w1,
-                w2,
-                topk_weight,
-                topk_ids,
-                global_num_experts=e
-            )
-
-    if False:
-        torch.set_printoptions(profile="full")
-        print("BASELINE")
-        print(torch_output)
-        print("OUTPUT")
-        print(triton_output)
-
-    torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
-
-
 @pytest.mark.parametrize("m", [1, 32, 222])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("k", [128, 1024])
@@ -587,7 +450,8 @@ def test_fused_marlin_moe(
     topk_weights, topk_ids, token_expert_indices = fused_topk(
         a, score, topk, False)
 
-    torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map)
+    with set_current_vllm_config(vllm_config):
+        torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map)
 
     marlin_output = torch.ops.vllm.fused_marlin_moe(
         a,
 
@@ -7,6 +7,7 @@
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.platforms import current_platform
@@ -15,6 +16,10 @@
     pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
                 allow_module_level=True)
 
+vllm_config = VllmConfig()
+vllm_config.scheduler_config.max_num_seqs = 128
+vllm_config.scheduler_config.max_model_len = 8192
+
 
 def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
     """Matrix multiplication function that supports per-token input
@@ -137,20 +142,21 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
     w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale
     score = torch.randn((M, E), dtype=dtype)
 
-    ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk)
-    out = fused_moe(
-        a,
-        w1,
-        w2,
-        score,
-        topk,
-        renormalize=False,
-        use_fp8_w8a8=True,  # using fp8
-        per_channel_quant=True,
-        w1_scale=w1_s,
-        w2_scale=w2_s,
-        block_shape=None,  # Not using block quantization
-    )
+    with set_current_vllm_config(vllm_config):
+        ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk)
+        out = fused_moe(
+            a,
+            w1,
+            w2,
+            score,
+            topk,
+            renormalize=False,
+            use_fp8_w8a8=True,  # using fp8
+            per_channel_quant=True,
+            w1_scale=w1_s,
+            w2_scale=w2_s,
+            block_shape=None,  # Not using block quantization
+        )
 
     # Check results
     rel_diff = (torch.mean(
 
@@ -30,6 +30,10 @@
     pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
                 allow_module_level=True)
 
+vllm_config = VllmConfig()
+vllm_config.scheduler_config.max_num_seqs = 128
+vllm_config.scheduler_config.max_model_len = 8192
+
 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 NUM_TOKENS = [7, 83, 2048]
@@ -210,10 +214,6 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
     score = torch.randn((M, E), dtype=dtype)
 
     # Set the context to avoid lots of warning spam.
-    vllm_config = VllmConfig()
-    vllm_config.scheduler_config.max_num_seqs = 128
-    vllm_config.scheduler_config.max_model_len = 8192
-
     with set_current_vllm_config(vllm_config):
         out = fused_moe(
             a,
@@ -261,6 +261,7 @@ def per_block_cast_to_fp8(
 @pytest.mark.parametrize(
     "M,N,K,block_size,out_dtype,seed",
     itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
+@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
 @torch.inference_mode()
 def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
     # only aligned sizes
@@ -426,26 +427,7 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed):
         w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
         w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])
 
-    if True:
-        dgm = modular_deep_gemm_fused_moe_fp8()
-
-        def deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
-                                 topk_ids):
-            return dgm(a,
-                       w1,
-                       w2,
-                       topk_weights,
-                       topk_ids,
-                       w1_scale=w1_s,
-                       w2_scale=w2_s)
-    else:
-        deep_gemm_moe_fp8_fn = deep_gemm_moe_fp8
-
     # Set the context to avoid lots of warning spam.
-    vllm_config = VllmConfig()
-    vllm_config.scheduler_config.max_num_seqs = 128
-    vllm_config.scheduler_config.max_model_len = 8192
-
     with set_current_vllm_config(vllm_config):
         if M >= 128:
             ref_out = deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s,
@@ -457,8 +439,8 @@ def deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
         topk_weights, topk_ids, token_expert_indices = fused_topk(
             a, score.float(), topk, False)
 
-        out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
-                                   topk_ids)
+        out = deep_gemm_moe_fp8(a, w1, w2, w1_s, w2_s, topk_weights,
+                                topk_ids)
 
     #print(f"{out.sum()=}")
     #print(f"{ref_out.sum()=}")
 
@@ -24,7 +24,7 @@ def dispatch(
         topk_ids: torch.Tensor,
         num_experts: int,
         expert_map: Optional[torch.Tensor],
-        apply_router_weight_on_input: bool = False,
+        apply_router_weight_on_input: bool,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         assert topk_ids.dim() == 2
         assert topk_ids.shape[0] == a1.shape[0]
@@ -99,8 +99,6 @@ class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def __init__(
         self,
-        rank: int = 0,
-        world_size: int = 1,
         max_num_tokens: Optional[int] = None,
         use_fp8_w8a8: bool = False,
         use_int8_w8a8: bool = False,
@@ -116,8 +114,6 @@ def __init__(
         assert block_shape is None
         assert block_m is None
         self.max_num_tokens = max_num_tokens
-        self.rank = rank
-        self.world_size = world_size
         assert not use_fp8_w8a8, "NYI"
         assert not use_int8_w8a8, "NYI"
         assert not use_int8_w8a16, "NYI"
@@ -171,21 +167,14 @@ def apply(
                             (num_experts, max_num_tokens, w2.shape[1]))
         num_local_experts = expert_num_tokens.numel()
 
-        # TODO: don't need world_size or rank if expert_base always == 0
-        #assert w1.shape[0] == num_experts, f"{w1.shape} == {num_experts}"
-        #expert_base = rank_chunk(w1.shape[0], self.rank,
-        #                         self.world_size) * self.rank
-        expert_base = 0
-
         for expert in range(num_local_experts):
             num = expert_num_tokens[expert]
             assert num <= max_num_tokens, f"{num}, {max_num_tokens}"
             if num > 0:
                 tmp = _resize_cache(workspace2, (num, w1.shape[1] // 2))
                 self.activation(
                     activation, tmp, hidden_states[expert, :num, :]
-                    @ w1[expert_base + expert].transpose(0, 1))
-                out[expert, :num, :] = tmp @ w2[expert_base +
-                                                expert].transpose(0, 1)
+                    @ w1[expert].transpose(0, 1))
+                out[expert, :num, :] = tmp @ w2[expert].transpose(0, 1)
 
         return out