
Commit 43e229c

review comments + cudagraph debugging
Signed-off-by: Bill Nell <[email protected]>
1 parent a674762 commit 43e229c

8 files changed (+80, -63 lines)

tests/kernels/moe/test_batched_moe.py

Lines changed: 2 additions & 2 deletions
@@ -31,10 +31,10 @@ def make_tensors(config: BatchedMMConfig):
     A = torch.randn(
         (config.num_experts, config.max_tokens_per_expert, config.K),
         device="cuda",
-        dtype=config.dtype) / 50.0
+        dtype=config.dtype)
     B = torch.randn((config.num_experts, config.N, config.K),
                     device="cuda",
-                    dtype=config.dtype) / 50.0
+                    dtype=config.dtype)
     C = torch.zeros(
         (config.num_experts, config.max_tokens_per_expert, config.N),
         device="cuda",

tests/kernels/moe/test_moe.py

Lines changed: 0 additions & 1 deletion
@@ -122,7 +122,6 @@ def test_fused_moe(
 def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
                         ep_size: int, dtype: torch.dtype, group_size: int,
                         has_zp: bool, weight_bits: int):
-    #print(m, n, k, e, topk, dtype, group_size, has_zp, weight_bits)
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
     w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

tests/kernels/moe/test_pplx_moe.py

Lines changed: 59 additions & 41 deletions
@@ -24,21 +24,35 @@
     spawn)  # pyright: ignore[reportPrivateImportUsage]
 from typing_extensions import Concatenate, ParamSpec
 
-import vllm.model_executor.layers.fused_moe  # noqa
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import override_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedDispatchCombine, BatchedExperts)
-from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+    BatchedDispatchCombine, BatchedExperts, BatchedTritonExperts)
+from vllm.model_executor.layers.fused_moe.fused_moe import (fused_topk,
+                                                            get_default_config)
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
 from vllm.model_executor.layers.fused_moe.pplx_dispatch_combine import (
     PplxDispatchCombine)
 from vllm.platforms import current_platform
 
+PPLX_DISPATCH_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
+                        (222, 2048, 1024)]
+
+PPLX_MOE_COMBOS = [
+    (1, 128, 128),
+    (2, 128, 512),
+    (3, 1024, 2048),
+    (32, 128, 1024),
+    (45, 512, 2048),
+    (64, 1024, 1024),
+    (222, 1024, 2048),
+]
+
 NUM_EXPERTS = [8, 64]
 EP_SIZE = [1, 4]
-TOP_KS = [2, 6]
+TOP_KS = [1, 2, 6]
 
 vllm_config = VllmConfig()
 vllm_config.scheduler_config.max_num_seqs = 128

@@ -298,7 +312,6 @@ def test_fused_moe_batched_experts(
                                torch_output,
                                atol=2e-2,
                                rtol=0)
-    torch.set_printoptions(profile="full")
     torch.testing.assert_close(baseline_output,
                                batched_output,
                                atol=2e-2,

@@ -426,25 +439,24 @@ def _pplx_dispatch_combine(
     nvshmem_finalize()
 
 
-# TODO: this test point does not work for M == 1
-@pytest.mark.parametrize("m", [4, 32, 64, 222])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024])
+# TODO: this test point does not work for odd M due to how the test is
+# written, not due to limitations of the pplx kernels. The pplx_moe
+# test below is able to deal with odd M.
+@pytest.mark.parametrize("mnk", PPLX_DISPATCH_COMBOS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])
 @requires_pplx
 def test_pplx_dispatch_combine(
-    m: int,
-    n: int,
-    k: int,
+    mnk: tuple[int, int, int],
     e: int,
     topk: int,
     dtype: torch.dtype,
     world_dp_size: tuple[int, int],
 ):
     current_platform.seed_everything(7)
+    m, n, k = mnk
     world_size, dp_size = world_dp_size
     device = "cuda"
     a = torch.randn((m, k), device=device, dtype=dtype) / 10

@@ -454,15 +466,11 @@ def test_pplx_dispatch_combine(
                                          topk, e)
 
 
-def pplx_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
-    assert torch.cuda.current_device() == pgi.local_rank
-
+def pplx_moe(rank, world_size, dp_size, a, w1, w2, topk_weight, topk_ids):
+    device = torch.device("cuda", rank)
     hidden_dim = a.shape[1]
     num_experts = w1.shape[0]
     block_size = 128
-    device = pgi.device
-    rank = pgi.rank
-    world_size = pgi.world_size
     topk = topk_ids.shape[1]
     max_num_tokens = rank_chunk(a.shape[0], 0, world_size)

@@ -490,29 +498,39 @@ def pplx_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
         dp_size,
     )
 
-    experts = BatchedExperts(max_num_tokens=a.shape[0],
-                             world_size=world_size,
-                             dp_size=dp_size)
+    experts = BatchedTritonExperts(max_num_tokens=a.shape[0],
+                                   world_size=world_size,
+                                   dp_size=dp_size)
 
     fused_experts = FusedMoEModularKernel(
         dispatch_combine,
         experts,
    )
 
-    # TODO: workers with the same dp_rank must use the exact same inputs.
-
+    # Note: workers with the same dp_rank must use the exact same inputs.
     a_chunk = chunk_by_rank(a, rank, world_size).to(device)
     chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
     chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
 
-    out = fused_experts(
-        a_chunk,
-        # Chunking weights like this only works for batched format
-        chunk_by_rank(w1, rank, world_size).to(device),
-        chunk_by_rank(w2, rank, world_size).to(device),
-        chunk_topk_weight,
-        chunk_topk_ids,
-        global_num_experts=num_experts)
+    # Chunking weights like this only works for batched format
+    w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
+    w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
+
+    @torch.compile(backend='inductor', fullgraph=True)
+    def _fused_experts(a, w1, w2, topk_weight, topk_ids, global_num_experts):
+        return fused_experts(a,
+                             w1,
+                             w2,
+                             topk_weight,
+                             topk_ids,
+                             global_num_experts=global_num_experts)
+
+    out = _fused_experts(a_chunk,
+                         w1_chunk,
+                         w2_chunk,
+                         chunk_topk_weight,
+                         chunk_topk_ids,
+                         global_num_experts=num_experts)
 
     torch.cuda.synchronize()
 

@@ -546,8 +564,7 @@ def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
         experts,
     )
 
-    # TODO: workers with the same dp_rank must use the exact same inputs.
-
+    # Note: workers with the same dp_rank must use the exact same inputs.
     a_chunk = chunk_by_rank(a, rank, world_size).to(device)
     chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
     chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)

@@ -581,10 +598,14 @@ def _pplx_moe(
     m, k = a.shape
     e, _, n = w2.shape
 
-    with set_current_vllm_config(vllm_config):
+    moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
+
+    with set_current_vllm_config(vllm_config), override_config(moe_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
         torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
-        pplx_output = pplx_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids)
+        pplx_output = pplx_moe(pgi.rank, pgi.world_size, dp_size, a, w1, w2,
+                               topk_weight, topk_ids)
+        # TODO: fix + re-enable
         #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
         #                              topk_ids)
 

@@ -597,24 +618,21 @@ def _pplx_moe(
     nvshmem_finalize()
 
 
-@pytest.mark.parametrize("m", [1, 2, 3, 32, 45, 64, 222])
-@pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024])
+@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])
 @requires_pplx
 def test_pplx_moe(
-    m: int,
-    n: int,
-    k: int,
+    mnk: tuple[int, int, int],
     e: int,
     topk: int,
     dtype: torch.dtype,
     world_dp_size: tuple[int, int],
 ):
     current_platform.seed_everything(7)
+    m, n, k = mnk
     world_size, dp_size = world_dp_size
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
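
The main functional change above is that the modular kernel call inside `pplx_moe` is now routed through `torch.compile(backend='inductor', fullgraph=True)`, so the pplx path is exercised under inductor as well as in eager mode. A minimal self-contained sketch of that pattern, with a plain matmul standing in for `fused_experts` (all names and shapes here are illustrative, not from the repo):

```python
import torch


def run_kernel(a: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Stand-in for the fused_experts modular kernel call.
    return a @ w.t()


@torch.compile(backend="inductor", fullgraph=True)
def _compiled(a: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # fullgraph=True turns any graph break inside the wrapped call into a
    # hard error, which is the point of compiling the test path.
    return run_kernel(a, w)


if __name__ == "__main__":
    dev = "cuda" if torch.cuda.is_available() else "cpu"
    a = torch.randn((32, 128), device=dev)
    w = torch.randn((256, 128), device=dev)
    # The compiled wrapper must match the eager reference.
    torch.testing.assert_close(_compiled(a, w), run_kernel(a, w))
```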

vllm/distributed/parallel_state.py

Lines changed: 6 additions & 4 deletions
@@ -949,25 +949,27 @@ def pplx_init(rank, world_size):
                                       nvshmem_get_unique_id, nvshmem_init)
     try:
         global PPLX_DID_INIT
-        logger.info("PPLX_INIT rank=%d world=%d", rank, world_size)
+        logger.debug(
+            "Initialize NVSHMEM for PPLX kernels: rank=%d, "
+            "world size=%d", rank, world_size)
         uid = nvshmem_get_unique_id(
         ) if rank == 0 else nvshmem_alloc_empty_unique_id()
         uid_gpu = uid.cuda()
         get_world_group().broadcast(uid_gpu, src=0)
-        logger.debug("PPLX_INIT UID = %s", uid_gpu)
         uid = uid_gpu.to(device='cpu')
+        logger.debug("PPLX NVSHMEM UID = %s", uid)
         nvshmem_init(uid, rank, world_size)
         PPLX_DID_INIT = True
     except Exception as ex:
-        logger.error("Failed to initialize nvshmem for pplx: %s", ex)
+        logger.error("Failed to initialize NVSHMEM for PPLX: %s", ex)
 
 
 @run_once
 def pplx_finalize():
     global PPLX_DID_INIT
     if PPLX_DID_INIT:
         from pplx_kernels.nvshmem import nvshmem_finalize
-        logger.info("PPLX finalize")
+        logger.debug("PPLX NVSHMEM finalize")
         from vllm.model_executor.layers.fused_moe.layer import (
             _all_to_all_cache)
         _all_to_all_cache.destroy()

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 6 additions & 4 deletions
@@ -333,7 +333,7 @@ def invoke_moe_batched_triton_kernel(
     BLOCK_M = config['BLOCK_SIZE_M']
     BLOCK_N = config['BLOCK_SIZE_N']
     BLOCK_K = config['BLOCK_SIZE_K']
-    assert max_num_tokens % BLOCK_M == 0
+    assert (torch.compiler.is_compiling() or max_num_tokens % BLOCK_M == 0)
 
     grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) *
             triton.cdiv(B.size(1), BLOCK_N))

@@ -559,13 +559,15 @@ def apply(
         N = w1.size(1) // 2
 
         # Not cudagraph friendly
-        assert (torch.cuda.is_current_stream_capturing()
+        assert (torch.compiler.is_compiling()
+                or torch.cuda.is_current_stream_capturing()
                 or torch.all(expert_num_tokens <= max_num_tokens * num_dp)), (
                     f"{expert_num_tokens} <= {max_num_tokens * num_dp}")
 
         for expert in range(num_local_experts):
-            # Indexing expert_num_tokens doesn't work w/cudagraphs
-            if torch.cuda.is_current_stream_capturing():
+            # Indexing expert_num_tokens doesn't work w/cudagraphs or inductor
+            if (torch.compiler.is_compiling()
+                    or torch.cuda.is_current_stream_capturing()):
                 num = max_num_tokens * num_dp
             else:
                 num = int(expert_num_tokens[expert].item())
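
The new checks above follow one pattern: when inductor is tracing or a CUDA graph is being captured, data-dependent reads such as `expert_num_tokens[expert].item()` cannot be used, so the code falls back to the static worst-case token count. A small sketch of that gating, assuming a per-expert count tensor and a static `max_tokens` bound (names are illustrative; the extra `torch.cuda.is_available()` guard only keeps the sketch runnable on CPU):

```python
import torch


def tokens_for_expert(expert_num_tokens: torch.Tensor, expert: int,
                      max_tokens: int) -> int:
    # Under torch.compile (inductor) or CUDA graph capture, reading a GPU
    # tensor with .item() either breaks the graph or bakes in a stale value,
    # so use the static worst-case bound instead of the true count.
    if (torch.compiler.is_compiling()
            or (torch.cuda.is_available()
                and torch.cuda.is_current_stream_capturing())):
        return max_tokens
    return int(expert_num_tokens[expert].item())


if __name__ == "__main__":
    counts = torch.tensor([3, 7, 0, 5])
    # Eager mode: returns the real per-expert count (7 for expert 1).
    print(tokens_for_expert(counts, expert=1, max_tokens=16))
```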

vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py

Lines changed: 4 additions & 3 deletions
@@ -103,7 +103,7 @@ def dispatch(
 
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
-        bound_m = None
+        bound_m: Optional[torch.Tensor] = None
 
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,

@@ -128,9 +128,10 @@ def combine(
         num_tokens = output.size(0)  # M
         # This argument is optional
         # There's not much point setting this unless it is != topk_ids.size(0)
-        bound_m = None
+        bound_m: Optional[torch.Tensor] = None
 
-        assert topk_ids.size(0) == num_tokens
+        assert topk_ids.size(0) == num_tokens, (
+            f"{topk_ids.size(0)} == {num_tokens}")
         assert output.size(0) <= self.max_num_tokens, (
             f"{output.size(0)} <= {self.max_num_tokens}")
         assert output.size(1) == fused_expert_output.size(-1)

vllm/model_executor/models/deepseek_v2.py

Lines changed: 3 additions & 2 deletions
@@ -173,8 +173,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                 * (1. / self.routed_scaling_factor)
 
         if self.tp_size > 1:
-            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
-                final_hidden_states)
+            final_hidden_states = (
+                self.experts.maybe_all_reduce_tensor_model_parallel(
+                    final_hidden_states))
 
         return final_hidden_states.view(num_tokens, hidden_dim)

vllm/model_executor/models/granitemoe.py

Lines changed: 0 additions & 6 deletions
@@ -70,7 +70,6 @@ def __init__(self,
                  prefix: str = ""):
         super().__init__()
         self.hidden_size = hidden_size
-        self.tp_size = get_tensor_model_parallel_world_size()
 
         # Gate always runs at half / full precision for now.
         self.gate = ReplicatedLinear(hidden_size,

@@ -98,11 +97,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
         final_hidden_states = self.experts(hidden_states, router_logits)
-
-        if self.tp_size > 1:
-            final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel(  # noqa E501
-                final_hidden_states)
-
         return final_hidden_states.view(orig_shape)
