@@ -299,10 +299,13 @@ def test_fused_moe_batched_experts(
    torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)


+def rank_chunk(num, r, w):
+    rem = num % w
+    return (num // w) + (1 if r < rem else 0)
+
+
 def chunk_by_rank(t, r, w):
-    num = t.shape[0]
-    assert num % w == 0, f"{num}, {w}"  # for now
-    chunk = num // w
+    chunk = rank_chunk(t.shape[0], r, w)
    #print(f"chunk {t.shape}, {w}, {r}, {chunk}, {r*chunk}:{(r + 1)*chunk}")
    return t[(r * chunk):(r + 1)*chunk]

@@ -312,12 +315,11 @@ def torch_pplx_dispatch_combine(pgi, dp_size, a, w1, w2, scores, topk):

    num_tokens, hidden_dim = a.shape
    num_experts = w1.shape[0]
-    num_local_experts = w1.shape[0] // pgi.world_size
    block_size = 128
    device = pgi.device
-    rank_num_tokens = num_tokens // pgi.world_size
    rank = pgi.rank
    world_size = pgi.world_size
+    rank_num_tokens = rank_chunk(num_tokens, rank, world_size)
    max_num_tokens = num_tokens
    #print(f"device = {device}, max_num_tokens = {max_num_tokens}, topk = {topk}, num_ex = {num_experts}, dp_size = {dp_size}")

@@ -354,7 +356,7 @@ def torch_pplx_dispatch_combine(pgi, dp_size, a, w1, w2, scores, topk):
    score_chunk = chunk_by_rank(scores, rank, world_size).to(device)
    chunk_topk_weight, chunk_topk_ids = fused_topk(a_chunk, score_chunk, topk, False)

-    print(f"chunk_topk_ids = {chunk_topk_ids.view(-1)}")
+    # print(f"chunk_topk_ids = {chunk_topk_ids.view(-1)}")

    b_a, b_a_scale, expert_num_tokens = dispatch_combine.dispatch(
        a_chunk,
@@ -372,8 +374,8 @@ def torch_pplx_dispatch_combine(pgi, dp_size, a, w1, w2, scores, topk):
    #max_num = tokens_per_expert.max()
    tokens_per_expert = chunk_by_rank(tokens_per_expert, rank, world_size).to(dtype=torch.int32)

-    print(f"tpe {tokens_per_expert}")
-    print(f"ent {expert_num_tokens}")
+    # print(f"tpe {tokens_per_expert}")
+    # print(f"ent {expert_num_tokens}")

    #torch.set_printoptions(profile="full")
    #torch.distributed.all_reduce(naive_b_a, op=torch.distributed.ReduceOp.MAX)
@@ -501,15 +503,12 @@ def torch_pplx_moe(pgi, dp_size, a, w1, w2, scores, topk):

    num_tokens, hidden_dim = a.shape
    num_experts = w1.shape[0]
-    num_local_experts = num_experts // pgi.world_size
    block_size = 128
    device = pgi.device
-    rank_num_tokens = num_tokens // pgi.world_size  # TODO even divide
-
-    max_num_tokens = num_tokens
-    #print(f"device = {device}, max_num_tokens = {max_num_tokens}, topk = {topk}, num_ex = {num_experts}, dp_size = {dp_size}")
    rank = pgi.rank
    world_size = pgi.world_size
+    rank_num_tokens = rank_chunk(num_tokens, rank, world_size)
+    max_num_tokens = num_tokens

    ata = AllToAll(
        max_num_tokens=max_num_tokens,
@@ -558,6 +557,7 @@ def torch_pplx_moe(pgi, dp_size, a, w1, w2, scores, topk):

    out = fused_experts(
        a_chunk,
+        # Chunking weights like this only works for batched format
        chunk_by_rank(w1, rank, world_size),
        chunk_by_rank(w2, rank, world_size),
        chunk_topk_weight,
@@ -571,7 +571,7 @@ def torch_pplx_moe(pgi, dp_size, a, w1, w2, scores, topk):

    #print(f"OUT {rank}: {out.shape} {out}")

-    return out[:rank_num_tokens]  # chunk_by_rank?
+    return out[:rank_num_tokens]


def _pplx_moe(
@@ -624,18 +624,13 @@ def _pplx_moe(
    nvshmem_finalize()


-@pytest.mark.parametrize("m", [2, 32, 64, 222])  #, 1024 * 128])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024])
+# TODO: M == 1 doesn't work
+@pytest.mark.parametrize("m", [2, 3, 32, 45, 64, 222])  #, 1024 * 128])
+@pytest.mark.parametrize("n", [128, 1024])  # , 2048])
+@pytest.mark.parametrize("k", [128, 512])  # , 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
-# @pytest.mark.parametrize("m", [64]) ##, 32]) #, 1024 * 128])
-# @pytest.mark.parametrize("n", [128])
-# @pytest.mark.parametrize("k", [128])
-# @pytest.mark.parametrize("e", [8]) #NUM_EXPERTS)
-# @pytest.mark.parametrize("topk", [2]) #TOP_KS)
-# @pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("world_dp_size", [[2, 1]])  #, [4, 2]])
def test_pplx_moe(
    m: int,
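For reference, a minimal standalone sketch (not part of the diff) of how the new rank_chunk helper divides an uneven token count across ranks: each of the first num % world_size ranks receives one extra token, so the per-rank counts always sum to the total and no rank needs an exact divisor. The 45-token / 2-rank numbers below are illustrative values, not taken from the test.

    def rank_chunk(num, r, w):
        # Ranks with index below the remainder get one extra item.
        rem = num % w
        return (num // w) + (1 if r < rem else 0)

    # 45 tokens over 2 ranks: rank 0 gets 23, rank 1 gets 22.
    counts = [rank_chunk(45, r, 2) for r in range(2)]
    assert counts == [23, 22] and sum(counts) == 45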