Commit f6acee6

test pplx w/naive implementation
Signed-off-by: Bill Nell <[email protected]>
Parent: 0dfd27e

4 files changed: 99 additions & 26 deletions


tests/kernels/moe/test_moe.py

Lines changed: 9 additions & 12 deletions
@@ -118,11 +118,7 @@ def batch_by_experts(
     num_tokens = a.shape[0]
     topk = topk_ids.shape[1]
 
-    tokens_per_expert = torch.zeros(num_experts, dtype=torch.int, device=a.device)
-    for i in range(num_tokens):
-        for j in range(topk):
-            expert_id = topk_ids[i, j]
-            tokens_per_expert[expert_id] = tokens_per_expert[expert_id] + 1
+    tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
 
     max_num_tokens = tokens_per_expert.max()
     b_a = torch.zeros((num_experts, max_num_tokens, a.shape[1]),
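
Note: the removed counting loop and the added torch.bincount call produce the same per-expert token histogram. A minimal standalone check of that equivalence (the sizes below are made up for illustration and are not taken from the test):

import torch

num_tokens, topk, num_experts = 16, 2, 8
topk_ids = torch.randint(0, num_experts, (num_tokens, topk))

# Naive counting loop, as removed above.
counts_loop = torch.zeros(num_experts, dtype=torch.int)
for i in range(num_tokens):
    for j in range(topk):
        counts_loop[topk_ids[i, j]] += 1

# Vectorized version, as added above.
counts_vec = torch.bincount(topk_ids.view(-1), minlength=num_experts)

assert torch.equal(counts_loop, counts_vec.to(counts_loop.dtype))
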
@@ -170,7 +166,6 @@ def torch_batched_moe(a, w1, w2, tokens_per_expert, topk_weight, topk_ids):
         num = tokens_per_expert[expert]
         if num > 0:
             out[expert, :num, :] = SiluAndMul()(a[expert,:num,:] @ w1[expert].transpose(0, 1)) @ w2[expert].transpose(0, 1)
-            #out[expert, :, :] = SiluAndMul()(a[expert,:,:] @ w1[expert].transpose(0, 1)) @ w2[expert].transpose(0, 1)
 
     out = unbatch_output(out, topk_weight, topk_ids, K)

@@ -231,12 +226,14 @@ def test_fused_moe_batched_experts(
                                             topk_weight,
                                             topk_ids)
     else:
-        triton_output = fused_experts(b_a,
-                                      w1,
-                                      w2,
-                                      topk_weight,
-                                      topk_ids,
-                                      global_num_experts=e)
+        triton_output = fused_batched_experts(
+            b_a,
+            w1,
+            w2,
+            topk_weight,
+            topk_ids,
+            global_num_experts=e
+        )
 
     if False:
         torch.set_printoptions(profile="full")

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 66 additions & 0 deletions
@@ -1754,6 +1754,72 @@ def apply(
         return intermediate_cache3
 
 
+class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
+
+    def __init__(
+        self,
+        use_fp8_w8a8: bool = False,
+        use_int8_w8a16: bool = False,
+        use_int4_w4a16: bool = False,
+        block_shape: Optional[List[int]] = None,
+        block_m: Optional[int] = None,
+    ):
+        super().__init__()
+        assert not use_fp8_w8a8
+        assert not use_int4_w4a16
+        assert not use_int8_w8a16
+        assert block_shape is None
+        assert block_m is None
+
+    def workspace_shapes(
+        self,
+        a_dtype: torch.dtype,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+        a: torch.Tensor,
+    ) -> Tuple[int, int, torch.dtype]:
+        max_num_tokens = a.shape[1]
+        workspace13 = num_experts * max_num_tokens * K
+        workspace2 = M * topk * N * num_experts
+        return (workspace13, workspace2, a_dtype)
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+    ) -> torch.Tensor:
+        from vllm.model_executor.layers.activation import SiluAndMul
+        assert hidden_states.dim() == 3
+        num_tokens, topk = topk_ids.shape
+        _, max_num_tokens, K = hidden_states.shape
+        num_experts = w1.shape[0]
+        out = _resize_cache(workspace13, (num_experts, max_num_tokens, w2.shape[1]))
+        #tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
+        for expert in range(num_experts):
+            num = 1 #tokens_per_expert[expert]
+            if num > 0:
+                #out[expert, :num, :] = SiluAndMul(hidden_states[expert,:num,:] @ w1[expert].transpose(0, 1)) @ w2[expert].transpose(0, 1)
+                out[expert, :, :] = SiluAndMul()(hidden_states[expert,:,:] @ w1[expert].transpose(0, 1)) @ w2[expert].transpose(0, 1)
+
+        return out
+
+
 def modular_triton_fused_moe(
     use_fp8_w8a8: bool,
     use_int8_w8a8: bool,
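
Note: BatchedExperts.apply above is a deliberately naive reference: the tokens_per_expert count stays commented out, so every expert runs over its full padded max_num_tokens slot. The per-expert math can be sketched standalone, outside the modular-kernel plumbing; the function name below is made up, silu(gate) * up stands in for SiluAndMul, and the usual w1: [E, 2N, K] / w2: [E, K, N] weight layout is assumed:

import torch


def naive_batched_experts(hidden_states: torch.Tensor,
                          w1: torch.Tensor,
                          w2: torch.Tensor) -> torch.Tensor:
    # hidden_states: [num_experts, max_num_tokens, K], already grouped per expert.
    num_experts, max_num_tokens, K = hidden_states.shape
    out = torch.zeros((num_experts, max_num_tokens, w2.shape[1]),
                      dtype=hidden_states.dtype)
    for expert in range(num_experts):
        gate_up = hidden_states[expert] @ w1[expert].transpose(0, 1)  # [T, 2N]
        gate, up = gate_up.chunk(2, dim=-1)
        act = torch.nn.functional.silu(gate) * up                     # SiluAndMul
        out[expert] = act @ w2[expert].transpose(0, 1)                # [T, K]
    return out

Restoring the commented-out bincount and slicing each expert with :num would skip the padded rows, matching torch_batched_moe in the test above.
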

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 12 additions & 9 deletions
@@ -29,7 +29,7 @@
 
 if current_platform.is_cuda_alike():
     from .dispatch_combine import StandardDispatchCombine
-    from .fused_moe import TritonExperts, fused_experts
+    from .fused_moe import TritonExperts, BatchedExperts, fused_experts
     from .modular_kernel import FusedMoEModularKernel, FusedMoEQuantizeDispatchCombine
     from .pplx_dispatch_combine import PplxDispatchCombine
 else:
@@ -243,13 +243,16 @@ def set_dispatch_combine(self, dispatch_combine: FusedMoEQuantizeDispatchCombine
         block_m = MOE_DP_CHUNK_SIZE * (self.moe.ep_size // self.moe.dp_size)
         #print(f"block_m = {block_m}")
 
-        experts = TritonExperts(
-            use_fp8_w8a8 = False,
-            use_int8_w8a16 = False,
-            use_int4_w4a16 = False,
-            block_shape = None,
-            block_m = None, #block_m,
-        )
+        if False:
+            experts = TritonExperts(
+                use_fp8_w8a8 = False,
+                use_int8_w8a16 = False,
+                use_int4_w4a16 = False,
+                block_shape = None,
+                block_m = None, #block_m,
+            )
+        else:
+            experts = BatchedExperts()
 
         self.fused_experts = FusedMoEModularKernel(
             dispatch_combine,
@@ -1029,7 +1032,7 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
             hidden_states = full_hidden_states[chunk_start:chunk_end, :]
             router_logits = full_router_logits[chunk_start:chunk_end, :]
 
-            print(f"loop {chunk_start}:{chunk_end}")
+            #print(f"loop {chunk_start}:{chunk_end}")
 
             cu_tokens_across_dp_this_iter = torch.cumsum(
                 num_tokens_remaining_across_dp.clamp(

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 12 additions & 5 deletions
@@ -35,16 +35,23 @@ def __init__(
         self.allow_deep_gemm = allow_deep_gemm
         self.use_fp8_w8a8 = use_fp8_w8a8
 
-    def workspace_shapes(self, a_dtype: torch.dtype, M: int, N: int, K: int,
-                         topk: int,
-                         num_experts: int) -> Tuple[int, int, torch.dtype]:
+    def workspace_shapes(
+        self,
+        a_dtype: torch.dtype,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        num_experts: int,
+        a: torch.Tensor,
+    ) -> Tuple[int, int, torch.dtype]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K):
-            return self.deep_gemm_expert.workspace_shapes(a_dtype, M, N, K, topk, num_experts)
+            return self.deep_gemm_expert.workspace_shapes(a_dtype, M, N, K, topk, num_experts, a)
         else:
-            return self.triton_expert.workspace_shapes(a_dtype, M, N, K, topk, num_experts)
+            return self.triton_expert.workspace_shapes(a_dtype, M, N, K, topk, num_experts, a)
 
     def apply(
         self,
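
Note: the only functional change in this file is threading the activation tensor a through workspace_shapes so that batched implementations can size their buffers from the batched shape rather than from M alone. With the BatchedExperts.workspace_shapes added in fused_moe.py above, the sizing works out as in this small illustration (all concrete numbers are made up):

import torch

num_experts, max_num_tokens, K = 8, 64, 1024
M, topk, N = 16, 2, 2048

# Batched activation as seen by BatchedExperts: one padded slot per expert.
a = torch.empty((num_experts, max_num_tokens, K))

# Mirrors BatchedExperts.workspace_shapes in this commit.
workspace13 = num_experts * a.shape[1] * K  # per-expert padded output buffer
workspace2 = M * topk * N * num_experts     # intermediate buffer
print(workspace13, workspace2)
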
