
Commit a876454

wip
Signed-off-by: Bill Nell <[email protected]>
1 parent e0560d5 commit a876454

5 files changed, 32 insertions(+), 34 deletions(-)

tests/kernels/test_pplx_moe.py

Lines changed: 0 additions & 3 deletions
@@ -23,10 +23,7 @@
 import vllm.model_executor.layers.fused_moe # noqa
 from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev,
                                  torch_moe, torch_moe_single)
-#from vllm import _custom_ops as ops
 from vllm.config import VllmConfig, set_current_vllm_config
-#from vllm.model_executor.layers.fused_moe import fused_moe
-#from vllm.model_executor.layers.fused_moe.fused_batched_moe import fused_batched_experts
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_topk, moe_align_block_size)
 from vllm.platforms import current_platform

vllm/forward_context.py

Lines changed: 1 addition & 1 deletion
@@ -93,7 +93,7 @@ def set_forward_context(attn_metadata: Any,
         from vllm.distributed.parallel_state import get_dp_group
         dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
         #TODO device?
-        max_tokens_across_dp = torch.max(num_tokens_tensor).to(device="cuda")
+        max_tokens_across_dp = torch.max(num_tokens_tensor) #.to(device="cuda")
         cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
         dp_rank_num_tokens = torch.tensor(
             [num_tokens],
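For context on the one-line change above: torch.max over a CPU tensor returns a zero-dim CPU tensor, so dropping the .to(device="cuda") keeps the reduction result on the host, alongside cu_tokens_across_dp_cpu. A minimal standalone sketch, with made-up per-rank token counts:

import torch

# Illustrative per-rank token counts, as gathered on the DP CPU group.
num_tokens_tensor = torch.tensor([5, 3, 7, 2])

# Previously the scalar max was moved to the GPU:
#   max_tokens_across_dp = torch.max(num_tokens_tensor).to(device="cuda")
# After this change it stays on the CPU:
max_tokens_across_dp = torch.max(num_tokens_tensor)                # tensor(7)
cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)   # tensor([ 5,  8, 15, 17])
print(max_tokens_across_dp.device)                                 # cpu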

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 11 additions & 12 deletions
@@ -1594,8 +1594,9 @@ def workspace_shapes(
         topk: int,
         num_experts: int,
     ) -> Tuple[int, int, torch.dtype]:
-        workspace1 = M * topk * max(N * 2, K)
-        workspace2 = M * topk * N
+        factor = num_experts if a.dim() == 3 else 1
+        workspace1 = M * topk * max(N * 2, K) * factor
+        workspace2 = M * topk * N * factor
         return (workspace1, workspace2, a.dtype)

     def apply(
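The new factor scales the Triton workspaces when the activations arrive in the batched 3-D (expert-major) layout rather than the flat 2-D one. A rough standalone sketch with toy sizes (the numbers are illustrative only):

import torch
from typing import Tuple

def sketch_workspace_shapes(a: torch.Tensor, M: int, N: int, K: int,
                            topk: int, num_experts: int) -> Tuple[int, int, torch.dtype]:
    # 3-D activations are [num_experts, max_num_tokens, K], so the flat
    # workspaces must hold one block per expert.
    factor = num_experts if a.dim() == 3 else 1
    workspace1 = M * topk * max(N * 2, K) * factor
    workspace2 = M * topk * N * factor
    return (workspace1, workspace2, a.dtype)

print(sketch_workspace_shapes(torch.empty(16, 128), 16, 256, 128, 2, 8))     # unscaled 2-D case
print(sketch_workspace_shapes(torch.empty(8, 16, 128), 16, 256, 128, 2, 8))  # scaled by num_experts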
@@ -1686,16 +1687,15 @@ def apply(
                     global_num_experts, expert_map
                 ))
             else:
-                #stride = hidden_states.shape[1]
-                sorted_token_ids = torch.arange(0, num_tokens*hidden_states.shape[1], device=hidden_states.device, dtype=torch.int)
+                max_num_tokens = hidden_states.shape[1]
+                sorted_token_ids = torch.arange(0, hidden_states.shape[0] * max_num_tokens, device=hidden_states.device, dtype=torch.int)
                 sorted_token_ids = sorted_token_ids.flatten()
-                nans = torch.isnan(hidden_states).sum(dim=(1,2))
-                expert_ids = torch.where((nans > 0).flatten(), -1, torch.arange(0, nans.numel(), device=hidden_states.device, dtype=torch.int32))
-                #expert_ids = torch.repeat_interleave(expert_ids, hidden_states.shape[1], dim=0)
-                #print(f"EXPERT_IDS {nans.shape} {expert_ids}")
+                expert_ids = torch.arange(0, global_num_experts, device=hidden_states.device, dtype=torch.int)
+                expert_ids = torch.repeat_interleave(expert_ids, max_num_tokens, dim=0)
+                print(f"EXPERT_IDS {expert_ids}")
                 #num_tokens_post_padded = torch.tensor([num_tokens], device=hidden_states.device, dtype=torch.int32)
                 num_tokens_post_padded = torch.zeros(1, device=hidden_states.device, dtype=torch.int32)
-                num_tokens_post_padded.fill_(num_tokens)
+                num_tokens_post_padded.fill_(max_num_tokens)
                 hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
                 #print(f"P = {sorted_token_ids}, {hidden_states.shape}")
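A small standalone sketch of the routing metadata built in the else branch above, using toy sizes (num_experts=4, max_num_tokens=3, hidden=8; the sizes are assumptions for illustration only):

import torch

num_experts = 4      # global_num_experts
max_num_tokens = 3   # hidden_states.shape[1] in the batched layout
hidden = 8

# Batched activations: one block of max_num_tokens (padded) rows per expert.
hidden_states = torch.randn(num_experts, max_num_tokens, hidden)

# Every padded slot gets its own id, in order.
sorted_token_ids = torch.arange(0, hidden_states.shape[0] * max_num_tokens, dtype=torch.int)

# Each contiguous block of max_num_tokens slots is assigned to one expert.
expert_ids = torch.arange(0, num_experts, dtype=torch.int)
expert_ids = torch.repeat_interleave(expert_ids, max_num_tokens, dim=0)

print(sorted_token_ids)  # tensor([ 0,  1, ..., 11], dtype=torch.int32)
print(expert_ids)        # tensor([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], dtype=torch.int32)

# The 3-D activations are then flattened to 2-D for the kernel.
hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # shape [12, 8]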
@@ -1857,19 +1857,18 @@ def __init__(

     def workspace_shapes(
         self,
-        a_dtype: torch.dtype,
+        a: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         num_experts: int,
-        a: torch.Tensor,
     ) -> Tuple[int, int, torch.dtype]:
         #assert self.max_num_tokens >= a.shape[1]
         max_num_tokens = a.shape[1] if self.max_num_tokens is None else self.max_num_tokens
         workspace13 = num_experts * max_num_tokens * K * topk * 2 # TODO: *2 is a hack
         workspace2 = max_num_tokens * N
-        return (workspace13, workspace2, a_dtype)
+        return (workspace13, workspace2, a.dtype)

     def apply(
         self,
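workspace_shapes now receives the activation tensor itself (here and in the Triton/DeepGemm variants below), so the dtype and, for the batched path, the padded token count can be read off the tensor instead of being passed separately. A toy evaluation of the sizing above, with made-up dimensions and self.max_num_tokens unset:

import torch

num_experts, max_num_tokens, K, N, topk = 8, 16, 128, 256, 2
a = torch.empty(num_experts, max_num_tokens, K)

max_tokens = a.shape[1]  # fallback when no max_num_tokens cap is configured
workspace13 = num_experts * max_tokens * K * topk * 2  # the diff flags the *2 as a hack
workspace2 = max_tokens * N

print(workspace13, workspace2, a.dtype)  # 65536 4096 torch.float32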

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 17 additions & 14 deletions
@@ -249,6 +249,7 @@ def set_dispatch_combine(self, dispatch_combine: FusedMoEQuantizeDispatchCombine
             logger.info(f"BatchedExperts {self.moe}")
             experts = BatchedExperts() #rank=self.moe.ep_rank, world_size=self.moe.ep_size)
         else:
+            logger.info(f"TritonExperts {self.moe}")
             experts = TritonExperts(
                 use_fp8_w8a8 = False,
                 use_int8_w8a16 = False,
@@ -1011,21 +1012,20 @@ def forward(self, hidden_states: torch.Tensor,
                 router_logits: torch.Tensor):
         if self.use_direct_call:
             return self.forward_impl(hidden_states, router_logits)
-        else:
+        elif True:
             return torch.ops.vllm.moe_forward(hidden_states, router_logits,
                                               self.layer_name)

     def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                              full_router_logits: torch.Tensor):

-        max_tokens_across_dp = get_forward_context(
-        ).dp_metadata.max_tokens_across_dp
-        cu_tokens_across_dp_cpu = get_forward_context(
-        ).dp_metadata.cu_tokens_across_dp_cpu
-        num_tokens_across_dp = get_forward_context(
-        ).dp_metadata.num_tokens_across_dp
+        ctx = get_forward_context()
+
+        max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp
+        #cu_tokens_across_dp_cpu = ctx.dp_metadata.cu_tokens_across_dp_cpu
+        num_tokens_across_dp = ctx.dp_metadata.num_tokens_across_dp

-        #print(f"max/num/rank_num = {max_tokens_across_dp}/{num_tokens_across_dp}/{get_forward_context().dp_metadata.dp_rank_num_tokens}")
+        #print(f"max/num/rank_num = {max_tokens_across_dp}/{num_tokens_across_dp}/{ctx.dp_metadata.dp_rank_num_tokens}")

         #In this function we define two ranges:
         # 1. chunk_range - The current iteration of the loops's range over the DP world tokens
@@ -1042,17 +1042,19 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
         #print(f"ORIGINAL SHAPE {full_hidden_states.shape}")
         #print(f"moe_dp_chunk_size_per_rank = {moe_dp_chunk_size_per_rank}")

+        assert full_hidden_states.shape[0] == full_router_logits.shape[0]
+
         for iter in range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank):
             hidden_states = full_hidden_states[chunk_start:chunk_end, :]
             router_logits = full_router_logits[chunk_start:chunk_end, :]

-            #print(f"loop {iter}: {chunk_start}:{chunk_end}, {hidden_states.shape}")
-
             cu_tokens_across_dp_this_iter = torch.cumsum(
                 num_tokens_remaining_across_dp.clamp(
                     max=moe_dp_chunk_size_per_rank),
                 dim=0)

+            print(f"loop {iter}: {chunk_start}:{chunk_end}, {hidden_states.shape} {cu_tokens_across_dp_this_iter}")
+
             hidden_states = self.naive_multicast(
                 hidden_states, cu_tokens_across_dp_this_iter)
             router_logits = self.naive_multicast(
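For readers following the chunking arithmetic: the loop walks the DP-wide token range in steps of moe_dp_chunk_size_per_rank, and each iteration's per-rank contribution is the remaining count clamped to the chunk size, whose prefix sum feeds the multicast. A hedged standalone sketch; the chunk_start/chunk_end and remaining-count updates are not visible in this hunk, so the bookkeeping below is an assumption:

import torch

moe_dp_chunk_size_per_rank = 4
max_tokens_across_dp = 10
num_tokens_remaining_across_dp = torch.tensor([10, 7, 3])  # illustrative per-rank counts

chunk_start, chunk_end = 0, moe_dp_chunk_size_per_rank
for it in range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank):
    this_iter = num_tokens_remaining_across_dp.clamp(max=moe_dp_chunk_size_per_rank)
    cu_tokens_across_dp_this_iter = torch.cumsum(this_iter, dim=0)
    print(it, chunk_start, chunk_end, cu_tokens_across_dp_this_iter.tolist())

    # Assumed bookkeeping: advance the local window and decrement the remaining counts.
    num_tokens_remaining_across_dp = (num_tokens_remaining_across_dp - this_iter).clamp(min=0)
    chunk_start += moe_dp_chunk_size_per_rank
    chunk_end += moe_dp_chunk_size_per_rank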
@@ -1087,14 +1089,14 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                     final_hidden_states)
                 final_hidden_states = all_hidden_states[start:end, :]

-            #print(f"final2 (AR) = {final_hidden_states.shape}")
+            print(f"final2 (AR) = {final_hidden_states.shape}")

             if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
                 # Default set to False. (May have to add shared expert outputs.)
                 final_hidden_states = tensor_model_parallel_all_reduce(
                     final_hidden_states)

-            #print(f"final3 (AR) = {final_hidden_states.shape}")
+            print(f"final3 (AR) = {final_hidden_states.shape}")

             full_final_hidden_states[chunk_start:chunk_end, :].copy_(
                 final_hidden_states)
@@ -1128,8 +1130,9 @@ def forward_impl(self, hidden_states: torch.Tensor,
         assert self.quant_method is not None

         if self.dp_size > 1:
-            cu_tokens_across_dp_cpu = get_forward_context(
-            ).dp_metadata.cu_tokens_across_dp_cpu
+            print("FORWARD_IMPL")
+            ctx = get_forward_context()
+            cu_tokens_across_dp_cpu = ctx.dp_metadata.cu_tokens_across_dp_cpu

             hidden_states = self.naive_multicast(hidden_states,
                                                  cu_tokens_across_dp_cpu)
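naive_multicast itself is untouched by this commit; the sketch below only illustrates how a cumulative-token tensor like cu_tokens_across_dp_cpu locates one rank's rows inside a buffer concatenated across DP ranks (the rank index and buffer contents are made up):

import torch

cu_tokens_across_dp_cpu = torch.tensor([5, 8, 15])  # cumsum of per-rank token counts
all_tokens = torch.randn(15, 4)                     # tokens concatenated across 3 DP ranks

dp_rank = 1
start = 0 if dp_rank == 0 else int(cu_tokens_across_dp_cpu[dp_rank - 1])
end = int(cu_tokens_across_dp_cpu[dp_rank])
this_rank_tokens = all_tokens[start:end]            # rows 5..7 belong to rank 1
print(this_rank_tokens.shape)                       # torch.Size([3, 4])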

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 3 additions & 4 deletions
@@ -37,21 +37,20 @@ def __init__(

     def workspace_shapes(
         self,
-        a_dtype: torch.dtype,
+        a: torch.Tensor,
         M: int,
         N: int,
         K: int,
         topk: int,
         num_experts: int,
-        a: torch.Tensor,
     ) -> Tuple[int, int, torch.dtype]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K):
-            return self.deep_gemm_expert.workspace_shapes(a_dtype, M, N, K, topk, num_experts, a)
+            return self.deep_gemm_expert.workspace_shapes(a, M, N, K, topk, num_experts)
         else:
-            return self.triton_expert.workspace_shapes(a_dtype, M, N, K, topk, num_experts, a)
+            return self.triton_expert.workspace_shapes(a, M, N, K, topk, num_experts)

     def apply(
         self,
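A minimal sketch of the delegation pattern after the signature change: the wrapper forwards the activation tensor and each backend reads what it needs (dtype, shape) from it. The backend classes and the shape check here are stand-ins, not the vllm implementations:

import torch
from typing import Tuple

def _valid_shape(M: int, N: int, K: int) -> bool:
    # Stand-in for _valid_deep_gemm_shape; the real check lives in vllm.
    return N % 128 == 0 and K % 128 == 0

class _Backend:
    def __init__(self, scale: int):
        self.scale = scale

    def workspace_shapes(self, a: torch.Tensor, M: int, N: int, K: int,
                         topk: int, num_experts: int) -> Tuple[int, int, torch.dtype]:
        # Toy sizing; only the signature mirrors the diff.
        return (M * topk * max(N * 2, K) * self.scale, M * topk * N * self.scale, a.dtype)

deep_gemm_expert, triton_expert = _Backend(2), _Backend(1)

def workspace_shapes(a, M, N, K, topk, num_experts, allow_deep_gemm=True):
    expert = deep_gemm_expert if (allow_deep_gemm and _valid_shape(M, N, K)) else triton_expert
    return expert.workspace_shapes(a, M, N, K, topk, num_experts)

print(workspace_shapes(torch.empty(16, 128), 16, 256, 128, 2, 8))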

0 commit comments
