
Commit 938c516

cleanups
Signed-off-by: Bill Nell <[email protected]>
1 parent c09cefd commit 938c516

File tree

7 files changed: +90, -87 lines changed


tests/kernels/moe/test_pplx_moe.py

Lines changed: 23 additions & 25 deletions
@@ -16,7 +16,7 @@
     from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id,
                                       nvshmem_finalize, nvshmem_get_unique_id,
                                       nvshmem_init)
-    has_pplx = False
+    has_pplx = True
 except ImportError:
     has_pplx = False

@@ -46,11 +46,6 @@

 P = ParamSpec("P")

-require_multi_node = pytest.mark.skipif(
-    "MASTER_ADDR" not in os.environ,
-    reason="Requires multi-node environment",
-)
-
 requires_pplx = pytest.mark.skipif(
     not has_pplx,
     reason="Requires PPLX kernels",

@@ -180,6 +175,9 @@ def torch_dispatch(

     tokens_per_expert = torch.bincount(topk_ids.view(-1),
                                        minlength=num_experts)
+
+    assert tokens_per_expert.numel() == num_experts
+
     if max_num_tokens is None:
         max_num_tokens = int(tokens_per_expert.max().item())

@@ -259,7 +257,7 @@ def torch_moe2(a, w1, w2, topk_weight, topk_ids):
             topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)


-@pytest.mark.parametrize("m", [1, 33, 64, 222])  #, 1024 * 128])
+@pytest.mark.parametrize("m", [1, 33, 64, 222])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("k", [128, 511, 1024])
 @pytest.mark.parametrize("e", NUM_EXPERTS)

@@ -309,7 +307,7 @@ def torch_pplx_dispatch_combine(pgi, dp_size, a, w1, w2, scores, topk):
     rank = pgi.rank
     world_size = pgi.world_size
     rank_num_tokens = rank_chunk(num_tokens, rank, world_size)
-    max_num_tokens = num_tokens
+    max_num_tokens = max(num_tokens, 1)

     ata = AllToAll.internode(
         max_num_tokens=max_num_tokens,

@@ -350,22 +348,23 @@ def torch_pplx_dispatch_combine(pgi, dp_size, a, w1, w2, scores, topk):
         False,
     )

-    naive_b_a, tokens_per_expert = torch_dispatch(a_chunk, chunk_topk_ids,
-                                                  num_experts)
+    if False:
+        naive_b_a, tokens_per_expert = torch_dispatch(a_chunk, chunk_topk_ids,
+                                                      num_experts)

-    torch.distributed.all_reduce(tokens_per_expert)
-    tokens_per_expert = chunk_by_rank(tokens_per_expert, rank,
-                                      world_size).to(dtype=torch.int32)
+        torch.distributed.all_reduce(tokens_per_expert)
+        tokens_per_expert = chunk_by_rank(tokens_per_expert, rank,
+                                          world_size).to(dtype=torch.int32)

-    torch.testing.assert_close(tokens_per_expert,
-                               expert_num_tokens,
-                               atol=0,
-                               rtol=0)
+        torch.testing.assert_close(tokens_per_expert,
+                                   expert_num_tokens,
+                                   atol=0,
+                                   rtol=0)

     b_a = b_a * 1.5

     out = torch.full(
-        (rank_num_tokens * world_size, hidden_dim),
+        (rank_num_tokens, hidden_dim),
         torch.nan,
         dtype=a.dtype,
         device=device,

@@ -424,14 +423,15 @@ def _pplx_dispatch_combine(
     nvshmem_finalize()


+# TODO: M < world_size doesn't appear to be supported by pplx?
 @pytest.mark.parametrize("m", [4, 32, 64, 222])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024])  # restrictions? % 128?
+@pytest.mark.parametrize("k", [128, 512, 1024])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])  #, [[4, 2]])
-@pytest.mark.skipif(not has_pplx, reason="PPLX kernels not available.")
+@requires_pplx
 def test_pplx_dispatch_combine(
     m: int,
     n: int,

@@ -502,11 +502,9 @@ def torch_pplx_moe(pgi, dp_size, a, w1, w2, scores, topk):
         # Chunking weights like this only works for batched format
         chunk_by_rank(w1, rank, world_size),
         chunk_by_rank(w2, rank, world_size),
-        #w1,
-        #w2,
         chunk_topk_weight,
         chunk_topk_ids,
-        global_num_experts=num_experts  #? num_local_experts?
+        global_num_experts=num_experts
     )

     torch.cuda.synchronize()

@@ -547,15 +545,15 @@ def _pplx_moe(
     nvshmem_finalize()


-# TODO: M == 1 doesn't work
+# TODO: M < world_size doesn't appear to be supported by pplx?
 @pytest.mark.parametrize("m", [2, 3, 32, 45, 64, 222])
 @pytest.mark.parametrize("n", [128, 1024, 2048])
 @pytest.mark.parametrize("k", [128, 512, 1024])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])  #, [4, 2]])
-@pytest.mark.skipif(not has_pplx, reason="PPLX kernels not available.")
+@requires_pplx
 def test_pplx_moe(
     m: int,
     n: int,
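
Note on the test changes above: they converge on the usual optional-dependency guard, where has_pplx is True only when the pplx_kernels import actually succeeds, and every PPLX test is gated by the single shared @requires_pplx marker instead of repeating pytest.mark.skipif. A minimal, self-contained sketch of that pattern (the test name below is illustrative, not from this file):

    import pytest

    try:
        from pplx_kernels import AllToAll  # noqa: F401
        has_pplx = True
    except ImportError:
        has_pplx = False

    requires_pplx = pytest.mark.skipif(
        not has_pplx,
        reason="Requires PPLX kernels",
    )


    @requires_pplx
    def test_needs_pplx():
        # Only runs when the optional dependency imported cleanly.
        assert has_pplx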

vllm/forward_context.py

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ def set_forward_context(attn_metadata: Any,
                                         dtype=torch.int32)
         from vllm.distributed.parallel_state import get_dp_group
         dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
-        #TODO device?
+        #TODO device? (tms)
         max_tokens_across_dp = torch.max(
             num_tokens_tensor)  #.to(device="cuda")
         cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
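
Note: the touched comment sits in the block that all-reduces each data-parallel rank's token count on the CPU group and then derives the max and the cumulative sum. A single-process sketch of just that reduction shape (the counts are made up; no distributed group involved):

    import torch

    # Stand-in for num_tokens_tensor after dist.all_reduce has filled in
    # every DP rank's token count.
    num_tokens_tensor = torch.tensor([7, 3, 5, 9], dtype=torch.int32)

    max_tokens_across_dp = torch.max(num_tokens_tensor)               # tensor(9)
    cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)  # 7, 10, 15, 24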

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+import functools
 import importlib.util
 from typing import Optional, Tuple

@@ -19,6 +20,7 @@
 has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None


+@functools.cache
 def deep_gemm_block_shape() -> list[int]:
     # Lazy import to avoid CUDA initialization problems.
     import deep_gemm as dg
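
The new functools.cache decorator memoizes this zero-argument helper, so the lazy import of deep_gemm and the block-shape lookup run once per process and later calls return the cached list. A standalone sketch of the same idea (the body is stand-in code, not the real deep_gemm API):

    import functools

    @functools.cache
    def block_shape() -> list[int]:
        # Pretend this is the expensive lazy import / device query; it only
        # runs on the first call.
        print("computing block shape")
        return [128, 128]

    block_shape()  # prints once and caches the result
    block_shape()  # served from the cache, no print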

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 50 additions & 44 deletions
@@ -63,8 +63,7 @@ class MoEConfig:
     ep_size: int
     ep_rank: int

-    in_dtype: torch.dtype
-    out_dtype: torch.dtype
+    in_dtype: torch.dtype  # The activation type.

     # TODO: add more quantization params, blocked, per-token, etc.
     block_size: int = 128

@@ -142,7 +141,6 @@ def get_all_to_all(**kwargs):
     return _all_to_all_cache.get_or_create(**kwargs)


-#TODO: Every change in this class is a broken hack!!
 @CustomOp.register("unquantized_fused_moe")
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""

@@ -249,18 +247,15 @@ def apply(
             activation=activation,
             apply_router_weight_on_input=apply_router_weight_on_input)

-    # Maybe extra args
     def set_dispatch_combine(
             self, dispatch_combine: FusedMoEQuantizeDispatchCombine) -> bool:
         assert self.fused_experts == fused_experts

-        #block_m = MOE_DP_CHUNK_SIZE * (self.moe.ep_size // self.moe.dp_size)
-
         experts: FusedMoEPermuteExpertsUnpermute = None

         if isinstance(dispatch_combine,
                       (BatchedDispatchCombine, PplxDispatchCombine)):
-            logger.info("BatchedTritonExperts %s", self.moe)
+            logger.debug("BatchedTritonExperts %s", self.moe)
             experts = BatchedTritonExperts(
                 use_fp8_w8a8=False,
                 use_int8_w8a8=False,

@@ -269,7 +264,7 @@ def set_dispatch_combine(
                 block_shape=None,
             )
         else:
-            logger.info("TritonExperts %s", self.moe)
+            logger.debug("TritonExperts %s", self.moe)
             experts = TritonExperts(
                 use_fp8_w8a8=False,
                 use_int8_w8a8=False,

@@ -611,8 +606,7 @@ def __init__(
             dp_rank=self.dp_rank,
             ep_size=self.ep_size,
             ep_rank=self.ep_rank,
-            in_dtype=params_dtype,  # this is probably not right, where to get?
-            out_dtype=params_dtype,  # ditto.
+            in_dtype=params_dtype,  # TODO: is this right?
         )

         # Note: get_quant_method will look at the layer's local_num_experts

@@ -628,12 +622,42 @@ def __init__(
         assert quant_method is not None
         self.quant_method = quant_method

-        dispatch_combine: FusedMoEQuantizeDispatchCombine = None
+        dispatch_combine = self._construct_dispatch_combine(
+            moe, quant_config)
+
+        success = self.quant_method.set_dispatch_combine(dispatch_combine)
+
+        if not success:
+            logger.warning("DP+EP not supported for %s.",
+                           type(self.quant_method))
+
+        self.apply_router_weight_on_input = apply_router_weight_on_input
+        moe_quant_params = {
+            "num_experts": self.local_num_experts,
+            "hidden_size": hidden_size,
+            "intermediate_size_per_partition":
+            self.intermediate_size_per_partition,
+            "params_dtype": params_dtype,
+            "weight_loader": self.weight_loader,
+        }
+        # need full intermediate size pre-sharding for WNA16 act order
+        if (self.quant_method.__class__.__name__
+                in ("GPTQMarlinMoEMethod",
+                    "CompressedTensorsWNA16MarlinMoEMethod",
+                    "CompressedTensorsWNA16MoEMethod")):
+            moe_quant_params["intermediate_size_full"] = intermediate_size
+
+        self.quant_method.create_weights(layer=self, **moe_quant_params)

-        # TODO: move to method?
+    # TODO: return Optional?
+    def _construct_dispatch_combine(
+        self,
+        moe: MoEConfig,
+        quant_config: Optional[QuantizationConfig],
+    ) -> FusedMoEQuantizeDispatchCombine:
         if self.dp_size > 1 and has_pplx:
-            logger.info("using pplx dispatch")
-            max_num_tokens = MOE_DP_CHUNK_SIZE  # // moe.dp_size
+            logger.debug("using pplx dispatch")
+            max_num_tokens = MOE_DP_CHUNK_SIZE
             world_size = moe.ep_size
             dp_size = moe.ep_size // moe.dp_size  # dp_size actually means TP.
             rank = moe.ep_rank

@@ -654,51 +678,28 @@ def __init__(
                 (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
                 torch.float32.itemsize)))

-            dispatch_combine = PplxDispatchCombine(
+            return PplxDispatchCombine(
                 all_to_all,
                 max_num_tokens,
                 world_size,
                 dp_size,
-                rank,  # just for debugging
+                rank,
                 moe.in_dtype,
             )
         elif True:
-            logger.info("using standard dispatch")
-            dispatch_combine = StandardDispatchCombine(
+            logger.debug("using standard dispatch")
+            return StandardDispatchCombine(
                 moe.in_dtype,
                 quant_config.weight_block_size
                 if quant_config is not None else None,
            )
         else:
-            logger.info("using batched dispatch")
-            dispatch_combine = BatchedDispatchCombine(
+            logger.debug("using batched dispatch")
+            return BatchedDispatchCombine(
                 moe.ep_size,
                 moe.ep_rank,
             )

-        success = self.quant_method.set_dispatch_combine(dispatch_combine)
-        if not success:
-            logger.warning("DP+EP not supported for %s.",
-                           type(self.quant_method))
-
-        self.apply_router_weight_on_input = apply_router_weight_on_input
-        moe_quant_params = {
-            "num_experts": self.local_num_experts,
-            "hidden_size": hidden_size,
-            "intermediate_size_per_partition":
-            self.intermediate_size_per_partition,
-            "params_dtype": params_dtype,
-            "weight_loader": self.weight_loader,
-        }
-        # need full intermediate size pre-sharding for WNA16 act order
-        if (self.quant_method.__class__.__name__
-                in ("GPTQMarlinMoEMethod",
-                    "CompressedTensorsWNA16MarlinMoEMethod",
-                    "CompressedTensorsWNA16MoEMethod")):
-            moe_quant_params["intermediate_size_full"] = intermediate_size
-
-        self.quant_method.create_weights(layer=self, **moe_quant_params)
-
     def _load_per_tensor_weight_scale(self, shard_id: str,
                                       param: torch.nn.Parameter,
                                       loaded_weight: torch.Tensor,

@@ -1015,9 +1016,14 @@ def naive_multicast(self, x: torch.Tensor,

         return buffer

+    # TODO: will this be cudagraph-able? (probably not)
+    # This should not be necessary.
+    def invalid_pplx(self, hidden_states: torch.Tensor) -> bool:
+        return has_pplx and hidden_states.shape[0] < self.dp_size
+
     def forward(self, hidden_states: torch.Tensor,
                 router_logits: torch.Tensor):
-        if self.use_direct_call:
+        if self.use_direct_call or self.invalid_pplx(hidden_states):
             return self.forward_impl(hidden_states, router_logits)
         else:
             return torch.ops.vllm.moe_forward(hidden_states, router_logits,
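
The bulk of this diff moves the dispatch/combine selection out of __init__ into a _construct_dispatch_combine helper that simply returns whichever implementation applies, and __init__ hands the result to the quant method. A stripped-down sketch of that shape (the class and strategy names here are placeholders, not the vLLM types):

    class _DispatchCombineSketch:
        """Stand-in for the FusedMoEQuantizeDispatchCombine implementations."""

        def __init__(self, name: str):
            self.name = name


    class _MoELayerSketch:

        def __init__(self, dp_size: int, pplx_available: bool):
            self.dp_size = dp_size
            self.pplx_available = pplx_available
            # __init__ just asks the helper which strategy applies.
            self.dispatch_combine = self._construct_dispatch_combine()

        def _construct_dispatch_combine(self) -> _DispatchCombineSketch:
            # Same selection order as the diff: pplx when DP > 1 and the
            # kernels are importable, otherwise the standard path.
            if self.dp_size > 1 and self.pplx_available:
                return _DispatchCombineSketch("pplx")
            return _DispatchCombineSketch("standard")


    print(_MoELayerSketch(2, False).dispatch_combine.name)  # -> standard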

vllm/model_executor/layers/fused_moe/pplx_dispatch_combine.py

Lines changed: 9 additions & 5 deletions
@@ -28,6 +28,7 @@ def __init__(self,
                  quant_dtype: Optional[torch.dtype] = None,
                  block_shape: Optional[List[int]] = None):
         super().__init__()
+        assert max_num_tokens > 0
         self.a2a = a2a
         self.block_shape = block_shape
         self.max_num_tokens = max_num_tokens

@@ -47,13 +48,15 @@ def dispatch(
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
-        # Is this always going to be a1.device?
-        device = a1.device
+        num_tokens = a1.shape[0]  # M
         hidden_dim = a1.shape[-1]  # K

-        # ??
+        assert rank_topk_ids.shape[0] == num_tokens
         # assert expert_map is None, "NYI"

+        # Is this always going to be a1.device?
+        device = a1.device
+
         if apply_router_weight_on_input:
             topk = rank_topk_ids.shape[1]
             # TODO: this only works for topK=1, will need to update for topK>1

@@ -102,7 +105,6 @@ def dispatch(
         )

         # This argument is optional, defaults to indices.shape[0]
-        num_tokens = a1.shape[0]  # M
         bound_m = torch.tensor([num_tokens], dtype=torch.uint32, device=device)

         # TODO: optimize this?

@@ -133,7 +135,9 @@ def combine(
                              dtype=torch.uint32,
                              device=fused_expert_output.device)

-        assert output.shape[0] <= self.max_num_tokens
+        assert topk_ids.shape[0] <= num_tokens
+        assert output.shape[0] <= self.max_num_tokens, \
+            f"{output.shape[0]} <= {self.max_num_tokens}"
         assert output.shape[1] == fused_expert_output.shape[-1]

         # Set weights to 1 if we did them in dispatch. This is hacky.
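
For reference, bound_m (mentioned in the hunk above) is just a one-element uint32 tensor holding the real token count for this rank, and the diff notes the argument is optional, defaulting to indices.shape[0]. A hedged, single-process sketch of how it gets built (assumes a PyTorch new enough to have torch.uint32; the activation tensor is made up):

    import torch

    a1 = torch.randn(7, 128)   # stand-in activations, shape M x K
    num_tokens = a1.shape[0]   # M
    device = a1.device

    # One-element tensor carrying the number of valid rows for this rank.
    bound_m = torch.tensor([num_tokens], dtype=torch.uint32, device=device)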

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 5 additions & 5 deletions
@@ -12,11 +12,11 @@
 class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):

     def __init__(self,
-                 use_fp8_w8a8: bool,
-                 use_int8_w8a8: bool,
-                 use_int8_w8a16: bool,
-                 use_int4_w4a16: bool,
-                 per_channel_quant: bool,
+                 use_fp8_w8a8: bool = False,
+                 use_int8_w8a8: bool = False,
+                 use_int8_w8a16: bool = False,
+                 use_int4_w4a16: bool = False,
+                 per_channel_quant: bool = False,
                  block_shape: Optional[List[int]] = None,
                  block_m: Optional[int] = None,
                  allow_deep_gemm: bool = False):
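
Defaulting the quantization flags to False lets the common unquantized case construct the experts object without spelling out every flag. A toy illustration of the same keyword-defaults pattern (a stand-in class, not the real TritonOrDeepGemmExperts):

    from typing import List, Optional

    class ExpertsSketch:

        def __init__(self,
                     use_fp8_w8a8: bool = False,
                     use_int8_w8a8: bool = False,
                     use_int8_w8a16: bool = False,
                     use_int4_w4a16: bool = False,
                     per_channel_quant: bool = False,
                     block_shape: Optional[List[int]] = None,
                     allow_deep_gemm: bool = False):
            self.use_fp8_w8a8 = use_fp8_w8a8
            self.allow_deep_gemm = allow_deep_gemm

    # Before the change every flag had to be passed explicitly; now the
    # unquantized default is a one-liner.
    experts = ExpertsSketch(allow_deep_gemm=True)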
