
Commit be24517

wip
Signed-off-by: Bill Nell <[email protected]>
1 parent 792d751 commit be24517

5 files changed: +37 −13 lines changed

vllm/distributed/parallel_state.py

Lines changed: 1 addition & 0 deletions
@@ -1098,6 +1098,7 @@ def get_tensor_model_parallel_rank():
 def destroy_model_parallel():
     """Set the groups to none and destroy them."""
     global _TP
+
     nvshmem_finalize()
 
     if _TP:

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 11 additions & 5 deletions
@@ -1686,12 +1686,18 @@ def apply(
                 global_num_experts, expert_map
             ))
         else:
-            stride = hidden_states.shape[1]
-            sorted_token_ids = torch.arange(0, hidden_states.shape[0], device=hidden_states.device, dtype=torch.int)
-            sorted_token_ids = sorted_token_ids * stride
-            expert_ids = torch.logical_not(torch.isnan(hidden_states)).sum(dim=(1,2)).nonzero()
-            num_tokens_post_padded = torch.zeros(1, device=hidden_states.device, dtype=torch.int)
+            #stride = hidden_states.shape[1]
+            sorted_token_ids = torch.arange(0, num_tokens*hidden_states.shape[1], device=hidden_states.device, dtype=torch.int)
+            sorted_token_ids = sorted_token_ids.flatten()
+            nans = torch.isnan(hidden_states).sum(dim=(1,2))
+            expert_ids = torch.where((nans > 0).flatten(), -1, torch.arange(0, nans.numel(), device=hidden_states.device, dtype=torch.int32))
+            #expert_ids = torch.repeat_interleave(expert_ids, hidden_states.shape[1], dim=0)
+            #print(f"EXPERT_IDS {nans.shape} {expert_ids}")
+            #num_tokens_post_padded = torch.tensor([num_tokens], device=hidden_states.device, dtype=torch.int32)
+            num_tokens_post_padded = torch.zeros(1, device=hidden_states.device, dtype=torch.int32)
+            num_tokens_post_padded.fill_(num_tokens)
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+        #print(f"P = {sorted_token_ids}, {hidden_states.shape}")
 
         invoke_fused_moe_kernel(hidden_states,
                                 w1,
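
Note on the new fallback path above: the expert buffers arriving from dispatch are pre-filled with NaN, so an expert slot that still contains NaN received no tokens and gets expert id -1. A minimal standalone sketch of that NaN-sentinel idea, with made-up shapes and outside the vLLM code:

    import torch

    # Assumed shapes: E local experts, each with a padded slot of M tokens and K features,
    # pre-filled with NaN the way the pplx dispatch buffers are.
    E, M, K = 4, 8, 16
    expert_x = torch.full((E, M, K), float("nan"))
    expert_x[0] = 1.0   # pretend experts 0 and 2 actually received tokens
    expert_x[2] = 2.0

    # Experts whose buffers were never written still contain NaNs; mask them with -1.
    nans = torch.isnan(expert_x).sum(dim=(1, 2))
    expert_ids = torch.where(nans > 0,
                             torch.full((E,), -1, dtype=torch.int32),
                             torch.arange(E, dtype=torch.int32))
    print(expert_ids)   # tensor([ 0, -1,  2, -1], dtype=torch.int32)

In the diff itself, num_tokens_post_padded is then simply filled with num_tokens for this path.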

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 4 additions & 2 deletions
@@ -116,7 +116,7 @@ def get_or_create(self, **kwargs):
 
         with self._lock:
             instance = self._cache.get(key)
-            if instance is None:
+            if True or instance is None:
                 instance = pplx.AllToAll(**kwargs)
                 self._cache[key] = instance
             return instance
@@ -605,7 +605,7 @@ def __init__(
         self.quant_method = quant_method
 
         # TODO: move to method?
-        if False and self.dp_size > 1:
+        if self.dp_size > 1:
             max_num_tokens = MOE_DP_CHUNK_SIZE # // moe.dp_size
             world_size = moe.ep_size
             dp_size = moe.ep_size // moe.dp_size # dp_size actually means TP.
@@ -1029,6 +1029,8 @@ def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
             hidden_states = full_hidden_states[chunk_start:chunk_end, :]
             router_logits = full_router_logits[chunk_start:chunk_end, :]
 
+            print(f"loop {chunk_start}:{chunk_end}")
+
             cu_tokens_across_dp_this_iter = torch.cumsum(
                 num_tokens_remaining_across_dp.clamp(
                     max=moe_dp_chunk_size_per_rank),
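
For context on the first hunk: get_or_create() normally caches one pplx.AllToAll handle per argument set, and the `if True or instance is None` change forces a fresh handle on every call while debugging. A minimal sketch of that kind of keyed, lock-protected cache; the class name, key derivation, and make_all_to_all stand-in are assumptions, not the vLLM implementation:

    import threading

    def make_all_to_all(**kwargs):
        # Stand-in for pplx.AllToAll(**kwargs); a dummy handle is enough for the sketch.
        return object()

    class AllToAllCache:
        def __init__(self):
            self._cache = {}
            self._lock = threading.Lock()

        def get_or_create(self, **kwargs):
            key = tuple(sorted(kwargs.items()))   # hypothetical key derivation
            with self._lock:
                instance = self._cache.get(key)
                if instance is None:              # the commit short-circuits this check
                    instance = make_all_to_all(**kwargs)
                    self._cache[key] = instance
                return instance

    cache = AllToAllCache()
    a = cache.get_or_create(world_size=4, dp_size=2)
    b = cache.get_or_create(world_size=4, dp_size=2)
    assert a is b   # cached; with `if True or ...` a new handle is built each call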

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 5 additions & 0 deletions
@@ -312,6 +312,9 @@ def forward(
         Returns:
         - torch.Tensor: The output tensor after applying the MoE layer.
         """
+        from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank)
+        print(f"START {hidden_states.shape} {topk_ids.shape} {get_tensor_model_parallel_rank()}/{get_dp_group().rank_in_group}")
+
         a1 = hidden_states
         E, M, N, K, top_k = _moe_problem_size(a1, w1, w2, topk_ids)
 
@@ -361,4 +364,6 @@ def forward(
         self.dispatch_combine.combine(output, fused_out, topk_weights,
                                       topk_ids, apply_router_weight_on_input)
 
+        print(f"DONE {hidden_states.shape} {topk_ids.shape} {get_tensor_model_parallel_rank()}/{get_dp_group().rank_in_group}")
+
         return output
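
The added START/DONE prints bracket the whole modular-kernel forward: dispatch, fused experts, combine. Right after the START print, forward() reads the problem size (E, M, N, K, top_k) off the tensor shapes. A simplified sketch of that bookkeeping, assuming 2-D activations and the usual fused-MoE weight layout; this is not vLLM's actual _moe_problem_size helper:

    import torch

    def moe_problem_size(a1, w1, w2, topk_ids):
        # w1: (E, N, K) expert weights, w2: (E, K, N), a1: (M, K) activations.
        E, N, K = w1.shape
        assert w2.shape == (E, K, N)
        M = a1.shape[0]              # tokens on this rank
        top_k = topk_ids.shape[1]    # experts selected per token
        return E, M, N, K, top_k

    a1 = torch.randn(6, 32)                  # 6 tokens, hidden size 32
    w1 = torch.randn(4, 64, 32)              # 4 experts
    w2 = torch.randn(4, 32, 64)
    topk_ids = torch.randint(0, 4, (6, 2))   # top-2 routing
    print(moe_problem_size(a1, w1, w2, topk_ids))   # (4, 6, 64, 32, 2)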

vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py

Lines changed: 16 additions & 6 deletions
@@ -46,6 +46,8 @@ def dispatch(
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         # Is this always going to be a1.device?
         device = a1.device
+        num_tokens = a1.shape[0] # M
+        hidden_dim = a1.shape[-1] # K
 
         assert expert_map is None, "NYI"
 
@@ -71,15 +73,15 @@
         expert_num_tokens = torch.empty(
             num_local_experts,
             dtype=torch.int32,
-            device=a1.device,
+            device=device,
         )
         expert_num_tokens.fill_(-1) # debugging remove
 
         num_dp = self.world_size // self.dp_size
         expert_x = torch.empty(
             (num_local_experts, self.max_num_tokens * num_dp, a1q.shape[-1]),
             dtype=a1q.dtype,
-            device=a1.device,
+            device=device,
         )
         expert_x.fill_(torch.nan) # debugging remove
 
@@ -95,7 +97,7 @@
                 (expert_x.size(2) + block_size - 1) // block_size,
             ),
             dtype=torch.float32,
-            device=a1.device,
+            device=device,
         )
 
         # This argument is optional, defaults to indices.shape[0]
@@ -105,7 +107,7 @@
         bound_m = None
 
         # TODO: optimize this?
-        indices = rank_topk_ids.to(dtype=torch.uint32)
+        indices = rank_topk_ids.to(dtype=torch.uint32).to(device)
 
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
@@ -126,8 +128,17 @@ def combine(
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
     ) -> None:
+        device = fused_expert_output.device
+        #device = torch.device("cuda", self.rank)
+        #device = get_dp_group().device
+        #assert fused_expert_output.device == device
+
+        print(f"COMBINE START {self.rank}")
+
         # This argument is optional
         #bound_m = get_forward_context().dp_metadata.dp_rank_num_tokens
+        #num_tokens = fused_expert_output.shape[0] # M
+        #bound_m = torch.tensor([num_tokens], dtype=torch.uint32, device=device)
         bound_m = None
 
         assert output.shape[0] <= self.max_num_tokens
@@ -143,5 +154,4 @@
             expert_y=fused_expert_output,
             bound_m=bound_m)
 
-        #print("END COMBINE")
-
+        print(f"COMBINE END {self.rank}")
