
Commit e0560d5

wip
Signed-off-by: Bill Nell <[email protected]>
1 parent 4fb31ef commit e0560d5

File tree: 3 files changed, +21 −7 lines


tests/kernels/test_pplx_moe.py

Lines changed: 4 additions & 2 deletions
@@ -535,14 +535,14 @@ def torch_pplx_moe(pgi, dp_size, a, w1, w2, scores, topk):
 
     dispatch_combine = PplxDispatchCombine(
         ata,
-        max_num_tokens, # // world_size?
+        max_num_tokens,
         pgi.world_size,
         dp_size,
         rank,
         a.dtype,
     )
 
-    experts = BatchedExperts(max_num_tokens, rank)
+    experts = BatchedExperts(rank, pgi.world_size, max_num_tokens)
 
     fused_experts = FusedMoEModularKernel(
         dispatch_combine,
@@ -560,6 +560,8 @@ def torch_pplx_moe(pgi, dp_size, a, w1, w2, scores, topk):
         # Chunking weights like this only works for batched format
         chunk_by_rank(w1, rank, world_size),
         chunk_by_rank(w2, rank, world_size),
+        #w1,
+        #w2,
         chunk_topk_weight,
         chunk_topk_ids,
         global_num_experts=num_experts #? num_local_experts?
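Note: chunk_by_rank is assumed here to slice the expert dimension of a weight tensor so each rank keeps only its own contiguous block of experts, consistent with the rank_chunk helper added in fused_moe.py below. A minimal sketch (the actual helper in the test file may differ):

    import torch

    def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
        # Slice dim 0 (experts) into w near-equal chunks; the first
        # (t.shape[0] % w) ranks each take one extra expert.
        num = t.shape[0]
        rem = num % w
        chunk = num // w + (1 if r < rem else 0)
        start = r * (num // w) + min(r, rem)
        return t[start:start + chunk]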

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 15 additions & 3 deletions
@@ -1827,12 +1827,18 @@ def combine(
         #print(f"END COMBINE {hex(id(self))}")
 
 
+def rank_chunk(num, r, w):
+    rem = num % w
+    return (num // w) + (1 if r < rem else 0)
+
+
 class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def __init__(
         self,
-        max_num_tokens: Optional[int] = None,
         rank: int = 0,
+        world_size: int = 1,
+        max_num_tokens: Optional[int] = None,
         use_fp8_w8a8: bool = False,
         use_int8_w8a16: bool = False,
         use_int4_w4a16: bool = False,
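The new rank_chunk helper computes how many of num items rank r owns when splitting across w ranks: every rank gets num // w, and the first num % w ranks absorb the remainder. For example:

    def rank_chunk(num, r, w):
        rem = num % w
        return (num // w) + (1 if r < rem else 0)

    # 10 experts over 4 ranks: ranks 0-1 own 3 experts each, ranks 2-3 own 2.
    assert [rank_chunk(10, r, 4) for r in range(4)] == [3, 3, 2, 2]
    assert sum(rank_chunk(10, r, 4) for r in range(4)) == 10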
@@ -1847,6 +1853,7 @@ def __init__(
         assert block_m is None
         self.max_num_tokens = max_num_tokens
         self.rank = rank
+        self.world_size = world_size
 
     def workspace_shapes(
         self,
@@ -1895,14 +1902,19 @@ def apply(
         num_local_experts = expert_num_tokens.numel()
         #print(f"shapes = {hidden_states.shape}, {w1.shape}, {w2.shape}, {out.shape} {expert_num_tokens.shape} {workspace2.shape} {num_experts}")
 
+        # TODO: don't need world_size or rank if expert_base always == 0
+        #assert w1.shape[0] == num_experts, f"{w1.shape} == {num_experts}"
+        #expert_base = rank_chunk(w1.shape[0], self.rank, self.world_size) * self.rank
+        expert_base = 0
+
         for expert in range(num_local_experts): # num_experts
             num = expert_num_tokens[expert]
             assert num <= max_num_tokens, f"{num}, {max_num_tokens}"
             #print(f"{type(num)}, {num}, {max_num_tokens}")
             if num > 0:
                 tmp = _resize_cache(workspace2, (num, w1.shape[1] // 2))
-                self.activation(activation, tmp, hidden_states[expert,:num,:] @ w1[expert].transpose(0, 1))
-                out[expert, :num, :] = tmp @ w2[expert].transpose(0, 1)
+                self.activation(activation, tmp, hidden_states[expert,:num,:] @ w1[expert_base + expert].transpose(0, 1))
+                out[expert, :num, :] = tmp @ w2[expert_base + expert].transpose(0, 1)
 
         return out
 
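A standalone sketch of the batched per-expert loop above, assuming the activation is SiLU-and-mul (which the (num, w1.shape[1] // 2) workspace shape suggests) and that weights are pre-chunked per rank so expert_base stays 0; with full, unchunked weight tensors, expert_base would instead be the index of the first global expert owned by this rank:

    import torch
    import torch.nn.functional as F

    E_local, M, K, N = 2, 4, 8, 16              # hypothetical sizes
    hidden_states = torch.randn(E_local, M, K)  # [experts, max tokens, hidden]
    w1 = torch.randn(E_local, 2 * N, K)         # fused gate/up projection
    w2 = torch.randn(E_local, K, N)             # down projection
    expert_num_tokens = torch.tensor([3, 1])    # valid rows per local expert
    out = torch.zeros(E_local, M, K)
    expert_base = 0

    for expert in range(E_local):
        num = int(expert_num_tokens[expert])
        if num > 0:
            h = hidden_states[expert, :num, :] @ w1[expert_base + expert].t()
            gate, up = h.chunk(2, dim=-1)       # split the 2N activation halves
            tmp = F.silu(gate) * up             # SiLU-and-mul into the workspace
            out[expert, :num, :] = tmp @ w2[expert_base + expert].t()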
vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 2 additions & 2 deletions
@@ -246,8 +246,8 @@ def set_dispatch_combine(self, dispatch_combine: FusedMoEQuantizeDispatchCombine
         #print(f"block_m = {block_m}")
 
         if isinstance(dispatch_combine, (BatchedDispatchCombine, PplxDispatchCombine)):
-            logger.info("BatchedExperts")
-            experts = BatchedExperts()
+            logger.info(f"BatchedExperts {self.moe}")
+            experts = BatchedExperts() #rank=self.moe.ep_rank, world_size=self.moe.ep_size)
         else:
             experts = TritonExperts(
                 use_fp8_w8a8 = False,
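Hypothetical: if the commented-out arguments were enabled, the construction would presumably read as below, with self.moe.ep_rank and self.moe.ep_size taken from the comment rather than verified against this revision:

    experts = BatchedExperts(
        rank=self.moe.ep_rank,
        world_size=self.moe.ep_size,
    )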
