hack fix for chunking loop

bnellnm · bnellnm · commit 4971b43bb85d · 2025-04-30T16:53:32.000Z
Signed-off-by: Bill Nell <bnell@redhat.com>
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
@@ -108,7 +108,7 @@ def test_fused_moe(
                                rtol=0)
 
 
-def batch_by_experts(
+def torch_dispatch(
     a: torch.Tensor,
     topk_ids: torch.Tensor,
     num_experts: int
@@ -138,14 +138,14 @@ def batch_by_experts(
     return b_a, tokens_per_expert
 
 
-def unbatch_output(b_out, topk_weight, topk_ids, K):
+def torch_combine(b_out, topk_weight, topk_ids):
     num_tokens, topk = topk_ids.shape
 
     num_experts = b_out.shape[0]
     topk = topk_ids.shape[1]
+    K = b_out.shape[-1]
     out = torch.zeros((num_tokens, K), dtype=b_out.dtype, device=b_out.device)
     expert_counts = torch.zeros(num_experts, dtype=torch.int, device=b_out.device)
-    experts = torch.arange(0, num_experts, dtype=torch.int, device=b_out.device)
     for token in range(num_tokens):
         expert_ids = topk_ids[token]
         for i in range(expert_ids.numel()):
@@ -157,22 +157,25 @@ def unbatch_output(b_out, topk_weight, topk_ids, K):
     return out
 
 
-def torch_batched_moe(a, w1, w2, tokens_per_expert, topk_weight, topk_ids):
-    assert a.dim() == 3
-    num_tokens, topk = topk_ids.shape
-    _, max_num_tokens, K = a.shape
+def torch_batched_moe(a, w1, w2, topk_weight, topk_ids):
     num_experts = w1.shape[0]
-    out = torch.zeros((num_experts, max_num_tokens, w2.shape[1]), dtype=a.dtype, device=a.device)
+    b_a, tokens_per_expert = torch_dispatch(a, topk_ids, num_experts)
+    assert b_a.dim() == 3
+    num_tokens, topk = topk_ids.shape
+    _, max_num_tokens, K = b_a.shape
+    assert num_experts == b_a.shape[0] and K == w2.shape[1]
+    out = torch.zeros((num_experts, max_num_tokens, K), dtype=b_a.dtype, device=b_a.device)
+    tmp = torch.empty((max_num_tokens, w1.shape[1] // 2), dtype=b_a.dtype, device=b_a.device)
     for expert in range(num_experts):
         num = tokens_per_expert[expert]
         if num > 0:
-            out[expert, :num, :] = SiluAndMul()(a[expert,:num,:] @ w1[expert].transpose(0, 1)) @ w2[expert].transpose(0, 1)
+            torch.ops._C.silu_and_mul(tmp[:num], b_a[expert,:num,:] @ w1[expert].transpose(0, 1))
+            out[expert, :num, :] = tmp[:num] @ w2[expert].transpose(0, 1)
 
-    out = unbatch_output(out, topk_weight, topk_ids, K)
-
-    return out
+    return torch_combine(out, topk_weight, topk_ids)
 
 
+# TODO: same as torch_moe but with fused_topk factored out.
 def torch_moe2(a, w1, w2, topk_weight, topk_ids):
     M, K = a.shape
     topk = topk_ids.shape[1]
@@ -217,16 +220,14 @@ def test_fused_moe_batched_experts(
 
         torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
 
-        b_a, tokens_per_expert = batch_by_experts(a, topk_ids, e)
-
         if True:
-            triton_output = torch_batched_moe(b_a,
+            triton_output = torch_batched_moe(a,
                                               w1,
                                               w2,
-                                              tokens_per_expert,
                                               topk_weight,
                                               topk_ids)
         else:
+            b_a, tokens_per_expert = batch_by_experts(a, topk_ids, e)
             triton_output = fused_batched_experts(
                 b_a,
                 w1,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1783,7 +1783,7 @@ def workspace_shapes(
     ) -> Tuple[int, int, torch.dtype]:
         max_num_tokens = a.shape[1]
         workspace13 = num_experts * max_num_tokens * K
-        workspace2 = M * topk * N * num_experts
+        workspace2 = max_num_tokens * (N // 2)
         return (workspace13, workspace2, a_dtype)
 
     def apply(
@@ -1810,12 +1810,14 @@ def apply(
         _, max_num_tokens, K = hidden_states.shape
         num_experts = w1.shape[0]
         out = _resize_cache(workspace13, (num_experts, max_num_tokens, w2.shape[1]))
+        # causes deadlock
         #tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
         for expert in range(num_experts):
-            num = 1 #tokens_per_expert[expert]
+            num = max_num_tokens #tokens_per_expert[expert]
             if num > 0:
-                #out[expert, :num, :] = SiluAndMul(hidden_states[expert,:num,:] @ w1[expert].transpose(0, 1)) @ w2[expert].transpose(0, 1)
-                out[expert, :, :] = SiluAndMul()(hidden_states[expert,:,:] @ w1[expert].transpose(0, 1)) @ w2[expert].transpose(0, 1)
+                tmp = _resize_cache(workspace2, (num, w1.shape[1] // 2))
+                torch.ops._C.silu_and_mul(tmp, hidden_states[expert,:num,:] @ w1[expert].transpose(0, 1))
+                out[expert, :num, :] = tmp @ w2[expert].transpose(0, 1)
 
         return out
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1028,11 +1028,15 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                         full_hidden_states.shape[0])
         full_final_hidden_states = torch.empty_like(full_hidden_states)
 
-        for _ in range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank):
+        #print(f"ORIGINAL SHAPE {full_hidden_states.shape}")
+
+        #print(f"moe_dp_chunk_size_per_rank = {moe_dp_chunk_size_per_rank}")
+
+        for iter in range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank):
             hidden_states = full_hidden_states[chunk_start:chunk_end, :]
             router_logits = full_router_logits[chunk_start:chunk_end, :]
 
-            #print(f"loop {chunk_start}:{chunk_end}")
+            #print(f"loop {iter}: {chunk_start}:{chunk_end}, {hidden_states.shape}")
 
             cu_tokens_across_dp_this_iter = torch.cumsum(
                 num_tokens_remaining_across_dp.clamp(
@@ -1062,6 +1066,8 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                 activation=self.activation,
             )
 
+            #print(f"final1 = {final_hidden_states.shape}")
+
             if self.dp_size > 1:
                 start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_this_iter[
                     self.dp_rank - 1]
@@ -1071,19 +1077,31 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                     final_hidden_states)
                 final_hidden_states = all_hidden_states[start:end, :]
 
+                #print(f"final2 (AR) = {final_hidden_states.shape}")
+
             if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
                 # Default set to False. (May have to add shared expert outputs.)
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states)
 
+                #print(f"final3 (AR) = {final_hidden_states.shape}")
+
             full_final_hidden_states[chunk_start:chunk_end, :].copy_(
                 final_hidden_states)
 
+            #print(f"full final = {full_final_hidden_states.shape}")
+
             # Update bounds
             num_tokens_remaining_across_dp = torch.clamp(
                 num_tokens_remaining_across_dp - moe_dp_chunk_size_per_rank,
                 min=0)
 
+            #print(f"num remaining = {num_tokens_remaining_across_dp}")
+
+            # HACK FIX
+            if num_tokens_remaining_across_dp.sum() == 0:
+                break
+
             def update_chunk_bound(x: int):
                 return min(x + moe_dp_chunk_size_per_rank,
                            full_hidden_states.shape[0])
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -312,8 +312,8 @@ def forward(
         Returns:
         - torch.Tensor: The output tensor after applying the MoE layer.
         """
-        from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank)
-        print(f"START {hidden_states.shape} {topk_ids.shape} {get_tensor_model_parallel_rank()}/{get_dp_group().rank_in_group}")
+        #from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank)
+        #print(f"START {hidden_states.shape} {topk_ids.shape} {get_tensor_model_parallel_rank()}/{get_dp_group().rank_in_group}")
 
         a1 = hidden_states
         E, M, N, K, top_k = _moe_problem_size(a1, w1, w2, topk_ids)
@@ -364,6 +364,6 @@ def forward(
         self.dispatch_combine.combine(output, fused_out, topk_weights,
                                       topk_ids, apply_router_weight_on_input)
 
-        print(f"DONE {hidden_states.shape} {topk_ids.shape} {get_tensor_model_parallel_rank()}/{get_dp_group().rank_in_group}")
+        #print(f"DONE {hidden_states.shape} {topk_ids.shape} {get_tensor_model_parallel_rank()}/{get_dp_group().rank_in_group}")
 
         return output
diff --git a/vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py b/vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py
@@ -133,7 +133,7 @@ def combine(
         #device = get_dp_group().device
         #assert fused_expert_output.device == device
 
-        print(f"COMBINE START {self.rank}")
+        #print(f"COMBINE START {self.rank}")
 
         # This argument is optional
         #bound_m = get_forward_context().dp_metadata.dp_rank_num_tokens
@@ -154,4 +154,4 @@ def combine(
                          expert_y=fused_expert_output,
                          bound_m=bound_m)
 
-        print(f"COMBINE END {self.rank}")
+        #print(f"COMBINE END {self.rank}")