Commit 3b72bc5

wip ref impl
Signed-off-by: Bill Nell <[email protected]>
1 parent aaefc27 commit 3b72bc5

7 files changed: +126 -42 lines changed

csrc/activation_kernels.cu

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
   int64_t num_tokens = input.numel() / input.size(-1);                 \
   dim3 grid(num_tokens);                                               \
   dim3 block(std::min(d, 1024));                                       \
+  if (num_tokens == 0) { return; }                                     \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));    \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();        \
   VLLM_DISPATCH_FLOATING_TYPES(                                        \
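
Note: the added guard returns before the launch when the input has zero tokens; without it the macro would request a zero-sized grid (dim3 grid(0)), which CUDA rejects as an invalid launch configuration. A hypothetical repro sketch of the case the guard covers, assuming the op is reached through vLLM's SiluAndMul layer (the layer and shapes here are illustrative, not part of this commit):

# Hypothetical sketch: an empty batch (e.g. an idle data-parallel rank) reaches
# the fused activation op with zero rows; with the guard this becomes a no-op.
import torch
from vllm.model_executor.layers.activation import SiluAndMul

act = SiluAndMul()
x = torch.empty((0, 2 * 4096), dtype=torch.float16, device="cuda")
out = act(x)                      # previously this attempted a zero-sized grid launch
assert out.shape == (0, 4096)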

examples/offline_inference/data_parallel.py

Lines changed: 4 additions & 2 deletions
@@ -114,9 +114,11 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
 
     # Create an LLM.
     cconfig = CompilationConfig(
-        level=0,
+        level=3,
         #cudagraph_capture_sizes=[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208],
         #cudagraph_capture_sizes=[512,256,1],
+        #cudagraph_capture_sizes=[192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1]
+        #cudagraph_capture_sizes=[128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1]
     )
     llm = LLM(model=model,
               tensor_parallel_size=GPUs_per_dp_rank,
@@ -171,7 +173,7 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
         procs.append(proc)
     exit_code = 0
     for proc in procs:
-        proc.join(timeout=300)
+        proc.join(timeout=3000)
         if proc.exitcode is None:
             print(f"Killing process {proc.pid} that "
                   f"didn't stop within 5 minutes.")

tests/kernels/moe/test_pplx_moe.py

Lines changed: 46 additions & 0 deletions
@@ -515,6 +515,50 @@ def pplx_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
     return out
 
 
+def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
+    assert torch.cuda.current_device() == pgi.local_rank
+
+    hidden_dim = a.shape[1]
+    num_experts = w1.shape[0]
+    block_size = 128
+    device = pgi.device
+    rank = pgi.rank
+    world_size = pgi.world_size
+    topk = topk_ids.shape[1]
+    max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
+
+    dispatch_combine = BatchedDispatchCombine(
+        max_num_tokens=max_num_tokens,
+        world_size=world_size,
+        dp_size=dp_size,
+        rank=rank,
+    )
+
+    experts = BatchedExperts(a.shape[0])
+
+    fused_experts = FusedMoEModularKernel(
+        dispatch_combine,
+        experts,
+    )
+
+    # TODO: workers with the same dp_rank must use the exact same inputs.
+
+    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
+    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
+    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
+
+    out = fused_experts(
+        a_chunk,
+        # Chunking weights like this only works for batched format
+        chunk_by_rank(w1, rank, world_size).to(device),
+        chunk_by_rank(w2, rank, world_size).to(device),
+        chunk_topk_weight,
+        chunk_topk_ids,
+        global_num_experts=num_experts)
+
+    return out
+
+
 def _pplx_moe(
     pgi: ProcessGroupInfo,
     dp_size: int,
@@ -536,11 +580,13 @@ def _pplx_moe(
     topk_weight, topk_ids = fused_topk(a, score, topk, False)
     torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
     pplx_output = pplx_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids)
+    batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids)
 
     torch_output = chunk_by_rank(torch_output, pgi.rank,
                                  pgi.world_size).to(pplx_output.device)
 
     torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
+    torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0)
 
     nvshmem_finalize()
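
Note: each rank checks only its own slice of the dense reference output, so the slicing convention must match how work was split across ranks. A self-contained sketch of that convention (this chunk_by_rank is written here for illustration and is only assumed to mirror the test helper of the same name):

# Illustrative: split rows across ranks; the first `num % w` ranks get one extra row.
import torch

def rank_chunk(num: int, r: int, w: int) -> int:
    rem = num % w
    return (num // w) + (1 if r < rem else 0)

def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
    start = sum(rank_chunk(t.shape[0], i, w) for i in range(r))
    return t[start:start + rank_chunk(t.shape[0], r, w)]

ref = torch.arange(10).reshape(5, 2)
assert [chunk_by_rank(ref, r, 2).shape[0] for r in range(2)] == [3, 2]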

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 50 additions & 23 deletions
@@ -466,6 +466,11 @@ def invoke_batched_silu_and_mul(
                                 compute_tl_dtype, D, BLOCK_M, BLOCK_D)
 
 
+def rank_chunk(num, r, w):
+    rem = num % w
+    return (num // w) + (1 if r < rem else 0)
+
+
 class BatchedDispatchCombine(mk.FusedMoEQuantizeDispatchCombine):
 
     def __init__(self, max_num_tokens: Optional[int], world_size: int, dp_size: int, rank: int):
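
Note: rank_chunk gives rank r's share of a count that may not divide evenly across w ranks; the first num % w ranks get one extra item. A small worked example, including contiguous per-rank expert ranges derived with a plain prefix sum (the prefix sum is for illustration only; the offset arithmetic in the dispatch hunk below is still marked as WIP):

# Worked example: 6 experts split over 4 ranks.
def rank_chunk(num, r, w):
    rem = num % w
    return (num // w) + (1 if r < rem else 0)

num_experts, world_size = 6, 4
counts = [rank_chunk(num_experts, r, world_size) for r in range(world_size)]
assert counts == [2, 2, 1, 1]

# Prefix-sum offsets give each rank a contiguous [first_expert, last_expert) range.
firsts = [sum(counts[:r]) for r in range(world_size)]
assert [(f, f + c) for f, c in zip(firsts, counts)] == [(0, 2), (2, 4), (4, 5), (5, 6)]
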
@@ -505,20 +510,31 @@ def dispatch(
         if self.max_num_tokens is None:
             self.max_num_tokens = int(tokens_per_expert.max().item())
 
-        b_a1 = torch.zeros((num_experts, self.max_num_tokens, hidden_dim),
+        rem_experts = num_experts % self.world_size
+        num_local_experts = ((num_experts // self.world_size) +
+                             (1 if self.rank < rem_experts else 0))
+
+        b_a1 = torch.zeros((num_local_experts, self.max_num_tokens, hidden_dim),
                            dtype=a1.dtype,
                            device=a1.device)
 
-        token_counts = torch.zeros(num_experts,
+        token_counts = torch.zeros(num_local_experts,
                                    dtype=torch.int,
                                    device=a1.device)
 
+        first_expert = (((num_experts // self.world_size) * self.rank) +
+                        rem_experts - self.rank)
+        last_expert = first_expert + num_local_experts
+        #expert_id_range = range(first_expert, last_expert)
+
         for token in range(num_tokens):
             for j in range(topk):
                 expert_id = topk_ids[token, j]
-                idx = token_counts[expert_id]
-                b_a1[expert_id, idx:idx + 1, :] = a1[token, :]
-                token_counts[expert_id] = token_counts[expert_id] + 1
+                if expert_id >= first_expert and expert_id < last_expert:
+                    rel_index = expert_id - first_expert
+                    idx = token_counts[rel_index]
+                    b_a1[rel_index, idx:idx + 1, :] = a1[token, :]
+                    token_counts[rel_index] = token_counts[rel_index] + 1
 
         return b_a1, a1_scale, tokens_per_expert
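
Note: dispatch now scatters tokens only into this rank's experts, producing activations of shape (num_local_experts, max_num_tokens, hidden_dim) plus a per-expert fill count. A toy illustration of that batched layout with plain tensors (shapes and routing chosen for the sketch, not taken from the kernel):

import torch

# 4 tokens, hidden_dim 2, topk 1, and 2 local experts covering expert ids [0, 2).
a1 = torch.arange(8, dtype=torch.float32).reshape(4, 2)
topk_ids = torch.tensor([[0], [1], [0], [1]])
num_local_experts, max_num_tokens, hidden_dim = 2, 4, 2

b_a1 = torch.zeros((num_local_experts, max_num_tokens, hidden_dim))
token_counts = torch.zeros(num_local_experts, dtype=torch.int)
for token in range(a1.shape[0]):
    for expert_id in topk_ids[token].tolist():
        idx = int(token_counts[expert_id])     # next free slot for that expert
        b_a1[expert_id, idx] = a1[token]
        token_counts[expert_id] += 1

assert token_counts.tolist() == [2, 2]
assert torch.equal(b_a1[0, :2], a1[[0, 2]])    # rows routed to expert 0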

@@ -531,7 +547,8 @@ def combine(
         apply_router_weight_on_input: bool,
     ) -> None:
         num_tokens = topk_ids.shape[0]
-        num_experts = fused_expert_output.shape[0]
+        num_local_experts = fused_expert_output.shape[0]
+        num_experts = num_local_experts * self.world_size  # NOT QUITE RIGHT
         K = fused_expert_output.shape[-1]
         assert output.shape[0] == num_tokens and output.shape[1] == K
         expert_counts = torch.zeros(
@@ -541,17 +558,21 @@ def combine(
 
         output.fill_(0)
 
+        first_expert = num_local_experts * self.rank  # NOT QUITE RIGHT
+        last_expert = first_expert + num_local_experts
+
         for token in range(num_tokens):
             expert_ids = topk_ids[token]
             for i in range(expert_ids.numel()):
                 expert_id = expert_ids[i]
-                assert expert_id < num_experts
-                idx = expert_counts[expert_id]
-                accum = fused_expert_output[expert_id, idx:idx + 1, :]
-                if not apply_router_weight_on_input:
-                    accum = accum * topk_weights[token, i]
-                output[token, :] = output[token, :] + accum
-                expert_counts[expert_id] = expert_counts[expert_id] + 1
+                if expert_id >= first_expert and expert_id < last_expert:
+                    assert expert_id < num_experts
+                    idx = expert_counts[expert_id]
+                    accum = fused_expert_output[expert_id - first_expert, idx:idx + 1, :]
+                    if not apply_router_weight_on_input:
+                        accum = accum * topk_weights[token, i]
+                    output[token, :] = output[token, :] + accum
+                    expert_counts[expert_id] = expert_counts[expert_id] + 1
 
 
 class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
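
Note: combine is the inverse of dispatch: each token walks its top-k experts, pulls the next unread row from that expert's batch, and accumulates it, scaled by the router weight unless the weight was already applied to the input. A toy illustration continuing the layout from the dispatch sketch above (illustrative only):

import torch

# 2 local experts, each holding up to 2 output rows of width K=2; topk=1 routing.
fused_expert_output = torch.tensor([[[1., 1.], [2., 2.]],
                                    [[3., 3.], [4., 4.]]])
topk_ids = torch.tensor([[0], [1], [0], [1]])
topk_weights = torch.tensor([[0.5], [1.0], [2.0], [0.25]])

output = torch.zeros(4, 2)
expert_counts = torch.zeros(2, dtype=torch.int)
for token in range(4):
    for i, expert_id in enumerate(topk_ids[token].tolist()):
        idx = int(expert_counts[expert_id])    # rows are consumed in dispatch order
        output[token] += topk_weights[token, i] * fused_expert_output[expert_id, idx]
        expert_counts[expert_id] += 1

assert torch.allclose(output[0], torch.tensor([0.5, 0.5]))
assert torch.allclose(output[3], torch.tensor([1.0, 1.0]))
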
@@ -622,20 +643,26 @@ def apply(
         num_experts = global_num_experts
         out = _resize_cache(workspace13,
                             (num_experts, max_num_tokens, hidden_dim))
-        num_local_experts = expert_num_tokens.numel()
-        assert num_local_experts == w1.shape[0]
+        num_local_experts = w1.shape[0]  #expert_num_tokens.numel()
+        assert num_local_experts == w1.shape[0], f"{num_local_experts} == {w1.shape[0]}"
 
         N = w1.shape[1] // 2
 
+        # Not cudagraph friendly
+        assert (torch.cuda.is_current_stream_capturing() or
+                torch.all(expert_num_tokens <= max_num_tokens)), (
+            f"{expert_num_tokens} <= {max_num_tokens}")
+
         for expert in range(num_local_experts):
-            num = expert_num_tokens[expert].item()
-            assert num <= max_num_tokens, f"{num} <= {max_num_tokens}"
-            if num > 0:  # CUDAGRAPH unfriendly
-                tmp = _resize_cache(workspace2, (num, N))
-                input = hidden_states[expert, :num, :] @ w1[expert].transpose(0, 1)
-                assert input.shape[1] == N * 2
-                self.activation(activation, tmp, input)
-                out[expert, :num, :] = tmp @ w2[expert].transpose(0, 1)
+            # Indexing expert_num_tokens doesn't work w/cudagraphs
+            if torch.cuda.is_current_stream_capturing():
+                num = max_num_tokens
+            else:
+                num = int(expert_num_tokens[expert].item())
+            tmp = _resize_cache(workspace2, (num, N))
+            input = hidden_states[expert, :num, :] @ w1[expert].transpose(0, 1)
+            self.activation(activation, tmp, input)
+            out[expert, :num, :] = tmp @ w2[expert].transpose(0, 1)
 
         return out
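
Note: the loop no longer reads expert_num_tokens[expert].item() while a CUDA graph is being captured, since .item() forces a device-to-host sync that is not allowed during capture; under capture it conservatively processes the padded max_num_tokens rows instead. The capture check is a public PyTorch API. A small sketch of that bound selection (illustrative, not the module's code):

import torch

def rows_to_process(expert_num_tokens: torch.Tensor, expert: int,
                    max_num_tokens: int) -> int:
    if torch.cuda.is_available() and torch.cuda.is_current_stream_capturing():
        # Cannot sync on a device value during CUDA graph capture,
        # so fall back to the worst-case (padded) row count.
        return max_num_tokens
    return int(expert_num_tokens[expert].item())

assert rows_to_process(torch.tensor([3]), 0, 8) == 3   # outside capture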

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 7 additions & 5 deletions
@@ -870,7 +870,7 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
-    indices_type: torch.dtype = torch.int32,
+    indices_type: Optional[torch.dtype] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")
@@ -881,10 +881,12 @@ def fused_topk(
                                topk,
                                dtype=torch.float32,
                                device=hidden_states.device)
-    topk_ids = torch.empty(M,
-                           topk,
-                           dtype=indices_type,
-                           device=hidden_states.device)
+    topk_ids = torch.empty(
+        M,
+        topk,
+        dtype=torch.int32 if indices_type is None else indices_type,
+        device=hidden_states.device
+    )
     token_expert_indicies = torch.empty(M,
                                         topk,
                                         dtype=torch.int32,
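
Note: with indices_type now Optional, callers that pass nothing keep getting int32 topk_ids as before, while the pplx path can ask for uint32. A stand-in illustration of the same defaulting rule using plain torch.topk (fused_topk itself needs the CUDA custom op; uint32 also requires a PyTorch build that supports it):

import torch
from typing import Optional

def toy_topk(gating_output: torch.Tensor, topk: int,
             indices_type: Optional[torch.dtype] = None):
    # Same defaulting rule as the patched fused_topk: None means int32.
    weights, ids = torch.topk(torch.softmax(gating_output, dim=-1), topk, dim=-1)
    return weights, ids.to(torch.int32 if indices_type is None else indices_type)

logits = torch.randn(4, 8)
_, ids_default = toy_topk(logits, 2)
_, ids_u32 = toy_topk(logits, 2, indices_type=torch.uint32)
assert ids_default.dtype == torch.int32 and ids_u32.dtype == torch.uint32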

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 14 additions & 7 deletions
@@ -136,7 +136,7 @@ def get_or_create(self, **kwargs):
 
         with self._lock:
             instance = self._cache.get(key)
-            if instance is None:
+            if True or instance is None:
                 # TODO: should be intranode
                 instance = pplx.AllToAll.internode(**kwargs)
                 self._cache[key] = instance
@@ -272,6 +272,8 @@ def set_dispatch_combine(
 
         experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
 
+        self.using_pplx = False
+
         if isinstance(dispatch_combine,
                       (BatchedDispatchCombine, PplxDispatchCombine)):
             logger.debug("BatchedTritonExperts %s", self.moe)
@@ -283,6 +285,7 @@ def set_dispatch_combine(
                 use_int4_w4a16=False,
                 block_shape=None,
             )
+            self.using_pplx = isinstance(dispatch_combine, PplxDispatchCombine)
         else:
             logger.debug("TritonExperts %s", self.moe)
             experts = TritonExperts(
@@ -329,7 +332,8 @@ def forward_cuda(
             num_expert_group=num_expert_group,
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
-            e_score_correction_bias=e_score_correction_bias)
+            e_score_correction_bias=e_score_correction_bias,
+            indices_type=torch.uint32 if self.using_pplx else None)
 
         if self.rocm_aiter_moe_enabled:
             return self.rocm_aiter_fused_experts(
@@ -687,7 +691,7 @@ def _construct_dispatch_combine(
         max_num_tokens = MOE_DP_CHUNK_SIZE
         world_size = moe.ep_size
 
-        if False and self.dp_size > 1 and has_pplx:
+        if self.dp_size > 1 and has_pplx:
             logger.debug("using pplx dispatch")
             dp_size = moe.ep_size // moe.dp_size  # dp_size actually means TP.
             rank = moe.ep_rank
@@ -1020,13 +1024,16 @@ def select_experts(hidden_states: torch.Tensor,
                        num_expert_group: Optional[int] = None,
                        custom_routing_function: Optional[Callable] = None,
                        scoring_func: str = "softmax",
-                       e_score_correction_bias: Optional[torch.Tensor] = None):
-        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+                       e_score_correction_bias: Optional[torch.Tensor] = None,
+                       indices_type: Optional[torch.dtype] = None):
+        from vllm.model_executor.layers.fused_moe.fused_moe import (
+            fused_topk, grouped_topk)
 
         # DeekSeekv2 uses grouped_top_k
         if use_grouped_topk:
             assert topk_group is not None
             assert num_expert_group is not None
+            assert indices_type is None or indices_type == torch.int32
             topk_weights, topk_ids = grouped_topk(
                 hidden_states=hidden_states,
                 gating_output=router_logits,
@@ -1041,10 +1048,10 @@ def select_experts(hidden_states: torch.Tensor,
                 gating_output=router_logits,
                 topk=top_k,
                 renormalize=renormalize,
-                # XXXXX how to do this?
-                #indices_type=torch.uint32,
+                indices_type=indices_type,
             )
         else:
+            assert indices_type is None or indices_type == torch.int32
             topk_weights, topk_ids = custom_routing_function(
                 hidden_states=hidden_states,
                 gating_output=router_logits,
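
Note: the plumbing added here is two-sided: forward_cuda requests uint32 indices only when the pplx dispatcher is in use, and select_experts forwards that request to fused_topk while asserting that the grouped and custom routing paths stay on int32. A condensed sketch of that selection logic (illustrative, not the module's code):

import torch
from typing import Optional

def pick_indices_type(using_pplx: bool) -> Optional[torch.dtype]:
    # The pplx path asks for uint32 expert ids in this commit; every other
    # path keeps the historical int32 default, signalled by None.
    return torch.uint32 if using_pplx else None

def check_indices_type(path: str, indices_type: Optional[torch.dtype]) -> None:
    # Only the fused_topk path honors a non-default dtype.
    if path != "fused_topk":
        assert indices_type is None or indices_type == torch.int32

check_indices_type("grouped_topk", pick_indices_type(False))   # ok
check_indices_type("fused_topk", pick_indices_type(True))      # ok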

vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py

Lines changed: 4 additions & 5 deletions
@@ -105,12 +105,10 @@ def dispatch(
         )
 
         # This argument is optional, defaults to indices.shape[0]
+        # There's not much point setting this unless it is != indices.shape[0]
         #bound_m = torch.tensor([num_tokens], dtype=torch.uint32, device=device)
         bound_m = None
 
-        # TODO: optimize this?
-        #indices = rank_topk_ids.to(dtype=torch.uint32)
-
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
             out_expert_x=expert_x,
@@ -130,14 +128,15 @@ def combine(
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
     ) -> None:
-        # This argument is optional
         num_tokens = output.shape[0]  # M
+        # This argument is optional
+        # There's not much point setting this unless it is != topk_ids.shape[0]
         #bound_m = torch.tensor([num_tokens],
         #                       dtype=torch.uint32,
         #                       device=fused_expert_output.device)
         bound_m = None
 
-        assert topk_ids.shape[0] <= num_tokens
+        assert topk_ids.shape[0] == num_tokens
         assert output.shape[0] <= self.max_num_tokens, \
             f"{output.shape[0]} <= {self.max_num_tokens}"
         assert output.shape[1] == fused_expert_output.shape[-1]
