Skip to content

Commit d6e801e

Browse files
committed
fix test when pplx is missing + minor tweaks
Signed-off-by: Bill Nell <[email protected]>
1 parent 9b97c83 commit d6e801e

File tree

2 files changed: +9 additions, −6 deletions

tests/kernels/moe/test_pplx_moe.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@
3333
get_default_config)
3434
from vllm.model_executor.layers.fused_moe.modular_kernel import (
3535
FusedMoEModularKernel)
36-
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
37-
PplxPrepareAndFinalize)
3836
from vllm.platforms import current_platform
3937

4038
PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
@@ -350,6 +348,9 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
350348
def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor,
351349
topk_weight: torch.Tensor, topk_ids: torch.Tensor,
352350
num_experts: int) -> torch.Tensor:
351+
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
352+
PplxPrepareAndFinalize)
353+
353354
assert torch.cuda.current_device() == pgi.local_rank
354355

355356
topk = topk_ids.shape[1]
@@ -499,6 +500,9 @@ def pplx_moe(
499500
use_compile: bool = True,
500501
use_cudagraphs: bool = True,
501502
) -> torch.Tensor:
503+
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
504+
PplxPrepareAndFinalize)
505+
502506
device = torch.device("cuda", rank)
503507
hidden_dim = a.shape[1]
504508
num_experts = w1.shape[0]

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -833,16 +833,15 @@ def __init__(
833833

834834
# Note: get_quant_method will look at the layer's local_num_experts
835835
# for heuristic purposes, so it must be initialized first.
836-
quant_method: Optional[FusedMoEMethodBase] = None
836+
quant_method: Optional[QuantizeMethodBase] = None
837837

838838
if quant_config is None:
839839
quant_method = UnquantizedFusedMoEMethod(moe)
840840
else:
841-
quant_method = quant_config.get_quant_method(
842-
self, prefix) # type: ignore
843-
assert isinstance(quant_method, FusedMoEMethodBase)
841+
quant_method = quant_config.get_quant_method(self, prefix)
844842

845843
assert quant_method is not None
844+
assert isinstance(quant_method, FusedMoEMethodBase)
846845
self.quant_method = quant_method
847846

848847
prepare_finalize = _construct_prepare_finalize(moe, quant_config)

0 commit comments

Comments (0)