
Commit 792d751

seems to be working

Signed-off-by: Bill Nell <[email protected]>
1 parent: 3b319a1

File tree

5 files changed: +70 -59 lines


vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 3 additions & 1 deletion
@@ -134,7 +134,9 @@ def apply(
         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
             (a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids)

-        self.activation(activation, workspace2, workspace1.view(-1, N))
+        self.activation(activation,
+                        workspace2,
+                        workspace1.view(-1, N))

         a2q_scale: Optional[torch.Tensor] = None

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 16 additions & 7 deletions
@@ -1678,12 +1678,20 @@ def apply(
         intermediate_cache3 = _resize_cache(workspace13,
                                             (num_tokens, top_k_num, K))

-        sorted_token_ids, expert_ids, num_tokens_post_padded = (
-            moe_align_block_size(
-                topk_ids,
-                config['BLOCK_SIZE_M'] if self.block_m is None else self.block_m,
-                global_num_experts, expert_map
-            ))
+        if hidden_states.dim() == 2: #block_m is None:
+            sorted_token_ids, expert_ids, num_tokens_post_padded = (
+                moe_align_block_size(
+                    topk_ids,
+                    config['BLOCK_SIZE_M'],
+                    global_num_experts, expert_map
+                ))
+        else:
+            stride = hidden_states.shape[1]
+            sorted_token_ids = torch.arange(0, hidden_states.shape[0], device=hidden_states.device, dtype=torch.int)
+            sorted_token_ids = sorted_token_ids * stride
+            expert_ids = torch.logical_not(torch.isnan(hidden_states)).sum(dim=(1,2)).nonzero()
+            num_tokens_post_padded = torch.zeros(1, device=hidden_states.device, dtype=torch.int)
+            hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

         invoke_fused_moe_kernel(hidden_states,
                                 w1,
@@ -1706,7 +1714,8 @@ def apply(
                                 per_channel_quant=self.per_channel_quant,
                                 block_shape=self.block_shape)

-        self.activation(activation, intermediate_cache2,
+        self.activation(activation,
+                        intermediate_cache2,
                         intermediate_cache1.view(-1, N))

         a2q_scale: Optional[torch.Tensor] = None
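A minimal sketch (my own illustration, not part of the commit) of what the new else branch above computes when hidden_states arrives in the batched 3-D layout (num_local_experts, max_tokens_per_expert, hidden_dim), with unused slots pre-filled with NaN, as the dispatch path in pplx_dispatch_combine.py below appears to do. All sizes here are hypothetical:

    # Illustration only: names and sizes are made up, not taken from vLLM.
    import torch

    E, max_tokens, hidden = 4, 8, 16
    hidden_states = torch.full((E, max_tokens, hidden), torch.nan)
    hidden_states[1, :3] = 1.0   # expert 1 received 3 tokens
    hidden_states[3, :1] = 2.0   # expert 3 received 1 token

    stride = hidden_states.shape[1]
    # One entry per expert: the row offset of that expert's slab in the
    # flattened (E * max_tokens, hidden) view.
    sorted_token_ids = torch.arange(0, hidden_states.shape[0], dtype=torch.int) * stride
    # Experts whose slab contains any real (non-NaN) data.
    expert_ids = torch.logical_not(torch.isnan(hidden_states)).sum(dim=(1, 2)).nonzero()

    print(sorted_token_ids.tolist())       # [0, 8, 16, 24]
    print(expert_ids.flatten().tolist())   # [1, 3]

    hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # (32, 16)

This reading suggests the NaN fill added in pplx_dispatch_combine.py doubles as an occupancy marker, which would explain why that fill is tagged "debugging remove" below.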

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 43 additions & 42 deletions
@@ -241,7 +241,7 @@ def apply(
     # Maybe extra args
     def set_dispatch_combine(self, dispatch_combine: FusedMoEQuantizeDispatchCombine) -> bool:
         block_m = MOE_DP_CHUNK_SIZE * (self.moe.ep_size // self.moe.dp_size)
-        print(f"block_m = {block_m}")
+        #print(f"block_m = {block_m}")

         experts = TritonExperts(
             use_fp8_w8a8 = False,
@@ -550,8 +550,8 @@ def __init__(
             self.ep_size = 1
             self.local_num_experts = self.global_num_experts
             self.expert_map = None
+        #self.global_num_experts = num_experts redundant?
         self.top_k = top_k
-        self.global_num_experts = num_experts

         assert intermediate_size % self.tp_size == 0
         self.hidden_size = hidden_size
@@ -571,11 +571,12 @@ def __init__(
         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
                              "non-grouped topk.")
+
         if current_platform.is_hpu():
             from vllm_hpu_extension.ops import DynamicFusedMOE
             self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)

-        print(f"params dtype= {params_dtype}")
+        #print(f"params dtype= {params_dtype}")

         moe = MoEConfig(
             num_experts=self.global_num_experts,
@@ -604,59 +605,59 @@ def __init__(
         self.quant_method = quant_method

         # TODO: move to method?
-        if self.dp_size > 1:
-            if True:
-                max_num_tokens = MOE_DP_CHUNK_SIZE # // moe.dp_size
-                world_size = moe.ep_size
-                dp_size = moe.ep_size // moe.dp_size # dp_size actually means TP.
-                rank = moe.ep_rank
+        if False and self.dp_size > 1:
+            max_num_tokens = MOE_DP_CHUNK_SIZE # // moe.dp_size
+            world_size = moe.ep_size
+            dp_size = moe.ep_size // moe.dp_size # dp_size actually means TP.
+            rank = moe.ep_rank

+            if False:
                 print(f"max num = {max_num_tokens}")
                 print(f"world size = {world_size}")
                 print(f"moe ep size = {moe.ep_size}")
                 print(f"moe dp size = {moe.dp_size}")
                 print(f"dp size = {dp_size}")
                 print(f"rank= {rank}")

-                all_to_all = get_all_to_all(
-                    max_num_tokens=max_num_tokens,
-                    num_experts=moe.num_experts,
-                    experts_per_token=moe.experts_per_token, # topk
-                    rank=rank,
-                    world_size=world_size,
-                    dp_size=dp_size,
-                    hidden_dim=moe.hidden_dim,
-                    hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
-                    # For blocked per token: set to ceil_div(hidden_dim, block_size) * sizeof(float32)
-                    # For per-token: set to sizeof(float32)
-                    hidden_dim_scale_bytes=(
-                        0
-                        if moe.in_dtype.itemsize != 1
-                        else (
-                            (moe.hidden_dim + moe.block_size - 1)
-                            // moe.block_size
-                            * torch.float32.itemsize
-                        )
+            all_to_all = get_all_to_all(
+                max_num_tokens=max_num_tokens,
+                num_experts=moe.num_experts,
+                experts_per_token=moe.experts_per_token, # topk
+                rank=rank,
+                world_size=world_size,
+                dp_size=dp_size,
+                hidden_dim=moe.hidden_dim,
+                hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
+                # For blocked per token: set to ceil_div(hidden_dim, block_size) * sizeof(float32)
+                # For per-token: set to sizeof(float32)
+                hidden_dim_scale_bytes=(
+                    0
+                    if moe.in_dtype.itemsize != 1
+                    else (
+                        (moe.hidden_dim + moe.block_size - 1)
+                        // moe.block_size
+                        * torch.float32.itemsize
                     )
                 )
+            )

-                dispatch_combine = PplxDispatchCombine(
-                    all_to_all,
-                    max_num_tokens,
-                    world_size,
-                    dp_size,
-                    rank, # just for debugging
-                    moe.in_dtype,
-                )
-        else:
-            dispatch_combine = StandardDispatchCombine(
-                moe.in_dtype,
-                quant_config.weight_block_size if quant_config is not None else None,
-            )
+            dispatch_combine = PplxDispatchCombine(
+                all_to_all,
+                max_num_tokens,
+                world_size,
+                dp_size,
+                rank, # just for debugging
+                moe.in_dtype,
+            )

             success = self.quant_method.set_dispatch_combine(dispatch_combine)
             if not success:
                 logger.warning("DP+EP not supported for %s.", type(self.quant_method))
+        else:
+            dispatch_combine = StandardDispatchCombine(
+                moe.in_dtype,
+                quant_config.weight_block_size if quant_config is not None else None,
+            )

         self.apply_router_weight_on_input = apply_router_weight_on_input
         moe_quant_params = {
@@ -1010,7 +1011,7 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
         num_tokens_across_dp = get_forward_context(
         ).dp_metadata.num_tokens_across_dp

-        print(f"max/num/rank_num = {max_tokens_across_dp}/{num_tokens_across_dp}/{get_forward_context().dp_metadata.dp_rank_num_tokens}")
+        #print(f"max/num/rank_num = {max_tokens_across_dp}/{num_tokens_across_dp}/{get_forward_context().dp_metadata.dp_rank_num_tokens}")

         #In this function we define two ranges:
         # 1. chunk_range - The current iteration of the loops's range over the DP world tokens
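The hidden_dim_scale_bytes expression in the get_all_to_all hunk above is easier to read as a worked example. The sizes below are hypothetical, not values taken from this commit:

    import torch

    # fp8 activations => in_dtype.itemsize == 1, so per-block scales are needed.
    hidden_dim = 7168   # hypothetical model hidden size
    block_size = 128    # hypothetical quantization block size

    num_scale_groups = (hidden_dim + block_size - 1) // block_size      # ceil_div -> 56
    hidden_dim_scale_bytes = num_scale_groups * torch.float32.itemsize  # 56 * 4 = 224

    print(hidden_dim_scale_bytes)  # 224 bytes of float32 scales per token
    # For non-fp8 dtypes (itemsize != 1) the expression evaluates to 0 instead.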

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 3 additions & 3 deletions
@@ -60,9 +60,6 @@ def _moe_problem_size(
     E, N, _ = w1.shape
     K = w2.shape[1]

-    assert topk_ids.dim() == 2
-    topk = topk_ids.shape[1]
-
     if a1.dim() == 2:
         # Make sure we are using the correct a1 (pre-permute).
         assert topk_ids.shape[0] == a1.shape[0], \
@@ -73,6 +70,9 @@
         assert E == a1.shape[0]
         M = a1.shape[1] # This is max_num_tokens

+    assert topk_ids.dim() == 2
+    topk = topk_ids.shape[1]
+
     return E, M, N, K, topk
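A tiny illustration (mine, not from the commit) of why the topk lines can move below the a1.dim() branch: topk_ids keeps its 2-D (num_tokens, topk) shape whether a1 is the flat (num_tokens, K) layout or the batched (E, max_num_tokens, K) layout, so topk can be read after either case. Sizes are made up:

    import torch

    E, max_num_tokens, K, topk = 4, 8, 16, 2

    a1_flat = torch.randn(5, K)                     # standard layout: 5 tokens, pre-permute
    a1_batched = torch.randn(E, max_num_tokens, K)  # batched layout: one slab per local expert
    topk_ids = torch.randint(0, E, (5, topk))       # (num_tokens, topk) in both cases

    assert topk_ids.dim() == 2
    print(topk_ids.shape[1])  # 2 == topk, independent of a1's layout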

vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py

Lines changed: 5 additions & 6 deletions
@@ -32,10 +32,6 @@ def __init__(self,
         self.dp_size = dp_size
         self.rank = rank
         self.quant_dtype = quant_dtype
-        print(f"max_num_tokens = {max_num_tokens}")
-        print(f"dp_num_tokens = {self.dp_num_tokens}")
-        print(f"world_size = {world_size}")
-        print(f"dp_size = {dp_size}")

     def dispatch(
         self,
@@ -77,15 +73,15 @@ def dispatch(
             dtype=torch.int32,
             device=a1.device,
         )
-        expert_num_tokens.fill_(-1)
+        expert_num_tokens.fill_(-1) # debugging remove

         num_dp = self.world_size // self.dp_size
         expert_x = torch.empty(
             (num_local_experts, self.max_num_tokens * num_dp, a1q.shape[-1]),
             dtype=a1q.dtype,
             device=a1.device,
         )
-        expert_x.fill_(torch.nan)
+        expert_x.fill_(torch.nan) # debugging remove

         expert_x_scale: Optional[torch.Tensor] = None
         if a1q.dtype.itemsize == 1:
@@ -146,3 +142,6 @@ def combine(
             weights=topk_weights,
             expert_y=fused_expert_output,
             bound_m=bound_m)
+
+        #print("END COMBINE")
+