
Commit a003bd8

improve ref impl
Signed-off-by: Bill Nell <[email protected]>
1 parent 0f2e37a commit a003bd8

File tree (4 files changed: +46 −37 lines)

tests/kernels/moe/test_pplx_moe.py
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py

tests/kernels/moe/test_pplx_moe.py
Lines changed: 1 addition & 1 deletion

@@ -276,7 +276,7 @@ def torch_moe2(a, w1, w2, topk_weight, topk_ids):
 
 @pytest.mark.parametrize("m", [1, 33, 64, 222])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("k", [128, 512, 1024])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16])

vllm/model_executor/layers/fused_moe/fused_batched_moe.py
Lines changed: 41 additions & 33 deletions

@@ -491,6 +491,7 @@ def dispatch(
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        assert a1.dim() == 2
         assert topk_ids.dim() == 2
         assert topk_ids.shape[0] == a1.shape[0]

@@ -504,11 +505,13 @@ def dispatch(
         num_tokens, hidden_dim = a1.shape
         topk = topk_ids.shape[1]
 
-        tokens_per_expert = torch.bincount(topk_ids.view(-1),
-                                           minlength=num_experts)
-
         if self.max_num_tokens is None:
+            tokens_per_expert = torch.bincount(topk_ids.view(-1),
+                                               minlength=num_experts)
             self.max_num_tokens = int(tokens_per_expert.max().item())
+        else:
+            tokens_per_expert = torch.zeros(num_experts, dtype=torch.int,
+                                            device=a1.device)
 
         rem_experts = num_experts % self.world_size
         num_local_experts = ((num_experts // self.world_size) +
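
For reference, a tiny illustration (toy values, not from the commit) of what the torch.bincount call above computes when max_num_tokens has to be derived from the routing:

    import torch

    topk_ids = torch.tensor([[0, 2], [1, 2], [2, 3]])  # 3 tokens, top-2 routing
    num_experts = 4
    tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
    # tensor([1, 1, 3, 1]): expert 2 sees 3 tokens, so max_num_tokens becomes 3.
    max_num_tokens = int(tokens_per_expert.max().item())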
@@ -518,23 +521,27 @@ def dispatch(
             dtype=a1.dtype,
             device=a1.device)
 
-        token_counts = torch.zeros(num_local_experts,
-                                   dtype=torch.int,
-                                   device=a1.device)
-
         first_expert = (((num_experts // self.world_size) * self.rank) +
                         rem_experts - self.rank)
         last_expert = first_expert + num_local_experts
-        #expert_id_range = range(first_expert, last_expert)
 
-        for token in range(num_tokens):
-            for j in range(topk):
-                expert_id = topk_ids[token, j]
-                if expert_id >= first_expert and expert_id < last_expert:
-                    rel_index = expert_id - first_expert
-                    idx = token_counts[rel_index]
-                    b_a1[rel_index, idx:idx + 1, :] = a1[token, :]
-                    token_counts[rel_index] = token_counts[rel_index] + 1
+        # rhs = torch.empty((self.max_num_tokens, hidden_dim),
+        #                   dtype=a1.dtype, device=a1.device)
+
+        # for expert_id in range(first_expert, last_expert):
+        #     topks = torch.any(topk_ids == expert_id, dim=1).flatten()
+        #     rows = torch.count_nonzero(topks.flatten())
+        #     #rhs[:rows] = a1[:topks.numel()][topks]
+        #     topks_idx = topks.nonzero()
+        #     torch.index_select(a1, dim=0, index=topks_idx.flatten(), out=rhs[:rows])
+        #     b_a1[expert_id - first_expert, :rows, :] = rhs[:rows]
+        #     tokens_per_expert[expert_id - first_expert] = rows
+
+        for expert_id in range(first_expert, last_expert):
+            topks = torch.any(topk_ids == expert_id, dim=1).flatten()
+            rows = torch.count_nonzero(topks.flatten())
+            b_a1[expert_id - first_expert, :rows, :] = a1[:topks.numel()][topks]
+            tokens_per_expert[expert_id - first_expert] = rows
 
         return b_a1, a1_scale, tokens_per_expert

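A minimal standalone sketch of the mask-based grouping that the new per-expert dispatch loop above relies on (toy shapes and random routing chosen for illustration; this is not code from the commit):

    import torch

    # Toy sizes, chosen only for illustration.
    num_tokens, hidden_dim, num_experts, topk = 6, 4, 4, 2
    max_num_tokens = num_tokens  # upper bound on rows per expert

    a1 = torch.randn(num_tokens, hidden_dim)
    scores = torch.rand(num_tokens, num_experts)
    _, topk_ids = torch.topk(scores, topk, dim=1)  # distinct experts per token

    b_a1 = torch.zeros(num_experts, max_num_tokens, hidden_dim)
    tokens_per_expert = torch.zeros(num_experts, dtype=torch.int)

    for expert_id in range(num_experts):
        # True for every token whose top-k list contains this expert.
        topks = torch.any(topk_ids == expert_id, dim=1)
        rows = int(torch.count_nonzero(topks))
        # Pack the selected tokens densely at the front of this expert's batch.
        b_a1[expert_id, :rows, :] = a1[topks]
        tokens_per_expert[expert_id] = rows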
@@ -548,31 +555,32 @@ def combine(
     ) -> None:
         num_tokens = topk_ids.shape[0]
         num_local_experts = fused_expert_output.shape[0]
-        num_experts = num_local_experts * self.world_size # NOT QUITE RIGHT
+        topk = topk_weights.shape[1]
         K = fused_expert_output.shape[-1]
         assert output.shape[0] == num_tokens and output.shape[1] == K
-        expert_counts = torch.zeros(
-            num_experts,
-            dtype=torch.int,
-            device=fused_expert_output.device)
 
         output.fill_(0)
 
         first_expert = num_local_experts * self.rank # NOT QUITE RIGHT
         last_expert = first_expert + num_local_experts
 
-        for token in range(num_tokens):
-            expert_ids = topk_ids[token]
-            for i in range(expert_ids.numel()):
-                expert_id = expert_ids[i]
-                if expert_id >= first_expert and expert_id < last_expert:
-                    assert expert_id < num_experts
-                    idx = expert_counts[expert_id]
-                    accum = fused_expert_output[expert_id - first_expert, idx:idx + 1, :]
-                    if not apply_router_weight_on_input:
-                        accum = accum * topk_weights[token, i]
-                    output[token, :] = output[token, :] + accum
-                    expert_counts[expert_id] = expert_counts[expert_id] + 1
+        # for expert_id in range(first_expert, last_expert):
+        #     topkws = topk_ids == expert_id
+        #     topks = torch.any(topkws, dim=1).flatten()
+        #     outrhs = output[topks]
+        #     rhs = fused_expert_output[expert_id - first_expert, :outrhs.shape[0], :]
+        #     if not apply_router_weight_on_input:
+        #         rhs.mul_(topk_weights[topkws].view(rhs.shape[0], 1))
+        #     output[topks] = outrhs + rhs
+
+        for expert_id in range(first_expert, last_expert):
+            topkws = topk_ids == expert_id
+            topks = torch.any(topkws, dim=1).flatten()
+            rows = torch.count_nonzero(topks)
+            rhs = fused_expert_output[expert_id - first_expert, :rows, :]
+            if not apply_router_weight_on_input:
+                rhs.mul_(topk_weights[topkws].view(rhs.shape[0], 1))
+            output[topks] = output[topks] + rhs
 
 
 class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
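
And the matching combine step, again as a self-contained toy sketch rather than the commit's code: each expert's packed output rows are scaled by the corresponding router weights and added back into the per-token output.

    import torch

    num_tokens, hidden_dim, num_experts, topk = 6, 4, 4, 2
    scores = torch.rand(num_tokens, num_experts)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=1)

    # Pretend per-expert outputs, packed in token order as dispatch left them.
    fused_expert_output = torch.randn(num_experts, num_tokens, hidden_dim)

    output = torch.zeros(num_tokens, hidden_dim)
    for expert_id in range(num_experts):
        topkws = topk_ids == expert_id        # (num_tokens, topk) slot mask
        topks = torch.any(topkws, dim=1)      # tokens routed to this expert
        rows = int(torch.count_nonzero(topks))
        rhs = fused_expert_output[expert_id, :rows, :].clone()
        # Scale each row by the router weight of the matching (token, slot) pair.
        rhs.mul_(topk_weights[topkws].view(rows, 1))
        output[topks] = output[topks] + rhs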

vllm/model_executor/layers/fused_moe/layer.py
Lines changed: 3 additions & 3 deletions

@@ -262,7 +262,7 @@ def set_dispatch_combine(
         if isinstance(dispatch_combine,
                       (BatchedDispatchCombine, PplxDispatchCombine)):
             logger.debug("BatchedTritonExperts %s", self.moe)
-            experts = BatchedTritonExperts(
+            experts = BatchedExperts(
                 max_num_tokens=MOE_DP_CHUNK_SIZE,
                 use_fp8_w8a8=False,
                 use_int8_w8a8=False,

@@ -695,8 +695,6 @@ def _construct_dispatch_combine(
                 rank,
                 moe.in_dtype,
             )
-        elif False:
-            return None
         elif self.dp_size > 1:
             logger.debug("using batched dispatch")
             dp_size = moe.ep_size // moe.dp_size  # dp_size actually means TP.

@@ -707,6 +705,8 @@ def _construct_dispatch_combine(
                 dp_size=dp_size,
                 rank=rank,
             )
+        elif False:
+            return None
         else:
             logger.debug("using standard dispatch")
             return StandardDispatchCombine(

vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py
Lines changed: 1 addition & 0 deletions

@@ -72,6 +72,7 @@ def dispatch(
             per_act_token,
             self.block_shape)
 
+        # TODO: does rem_experts need to be 0 for pplx to work properly?
         rem_experts = num_experts % self.world_size
         num_local_experts = ((num_experts // self.world_size) +
                              (1 if self.rank < rem_experts else 0))
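
For context on the TODO above, the remainder split gives the first rem_experts ranks one extra expert. A quick worked example with toy numbers (not from the commit):

    num_experts, world_size = 10, 4
    rem_experts = num_experts % world_size  # 2
    for rank in range(world_size):
        num_local_experts = ((num_experts // world_size) +
                             (1 if rank < rem_experts else 0))
        print(rank, num_local_experts)  # ranks 0,1 hold 3 experts; ranks 2,3 hold 2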
