8 | 8 |
9 | 9 | from vllm import _custom_ops as ops |
10 | 10 | from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight |
11 | | -from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul |
| 11 | +from vllm.model_executor.layers.activation import SiluAndMul |
12 | 12 | from vllm.model_executor.layers.quantization.utils.layer_utils import replace_parameter |
13 | 13 | from vllm.utils.torch_utils import direct_register_custom_op |
14 | 14 |
15 | 15 | _CPU_MOE_LAYER_CACHE = {} |
16 | 16 |
17 | 17 |
18 | | -class _LazyActivationDict(dict): |
19 | | -    """Lazily instantiate activation functions on first access. |
| 18 | +def _swigluoai_forward_native( |
| 19 | +    x: torch.Tensor, |
| 20 | +    alpha: float = 1.702, |
| 21 | +    limit: float = 7.0, |
| 22 | +) -> torch.Tensor: |
| 23 | +    """PyTorch-native implementation of SwigluOAIAndMul.forward_native. |
20 | 24 |
21 | | -    Avoids triggering CustomOp.__init__() at module import time, |
22 | | -    which would call get_current_vllm_config() before config is set. |
| 25 | +    Standalone function to avoid instantiating SwigluOAIAndMul (a CustomOp) |
| 26 | +    which would trigger get_current_vllm_config() before config is set. |
23 | 27 |     """ |
| 28 | +    gate, up = x[..., ::2], x[..., 1::2] |
| 29 | +    gate = gate.clamp(min=None, max=limit) |
| 30 | +    up = up.clamp(min=-limit, max=limit) |
| 31 | +    glu = gate * torch.sigmoid(gate * alpha) |
| 32 | +    gated_output = (up + 1) * glu |
| 33 | +    return gated_output |
24 | 34 |
25 | | -    _factories: dict[str, type[SiluAndMul] | type[SwigluOAIAndMul]] = { |
26 | | -        "silu": SiluAndMul, |
27 | | -        "swigluoai": SwigluOAIAndMul, |
28 | | -    } |
29 | 35 |
30 | | -    def __missing__(self, key: str) -> SiluAndMul | SwigluOAIAndMul: |
31 | | -        if key not in self._factories: |
32 | | -            raise KeyError(f"{key} is not a supported activation") |
33 | | -        self[key] = self._factories[key]() |
34 | | -        return self[key] |
35 | | - |
36 | | - |
37 | | -_CPU_MOE_ACT = _LazyActivationDict() |
| 36 | +# Map activation names to their native forward functions. |
| 37 | +# Uses static methods or standalone functions to avoid instantiating CustomOp |
| 38 | +# classes, which would call get_current_vllm_config() before config is set. |
| 39 | +_CPU_MOE_ACT_FN: dict[str, Callable[[torch.Tensor], torch.Tensor]] = { |
| 40 | +    "silu": SiluAndMul.forward_native, |
| 41 | +    "swigluoai": _swigluoai_forward_native, |
| 42 | +} |
38 | 43 |
39 | 44 |
40 | 45 | def grouped_topk( |
@@ -230,7 +235,7 @@ def __call__( |
230 | 235 |         apply_router_weight_on_input: bool = False, |
231 | 236 |         activation: str = "silu", |
232 | 237 |     ) -> torch.Tensor: |
233 | | -        assert activation in _CPU_MOE_ACT._factories, f"{activation} is not supported." |
| 238 | +        assert activation in _CPU_MOE_ACT_FN, f"{activation} is not supported." |
234 | 239 |         assert not apply_router_weight_on_input |
235 | 240 |
236 | 241 |         topk_weights, topk_ids = select_experts( |
@@ -418,7 +423,7 @@ def cpu_fused_moe_torch( |
418 | 423 |         tokens_for_this_expert = sorted_tokens[start_idx:end_idx] |
419 | 424 |
420 | 425 |         gate_up = layer.gate_up_linear[i](tokens_for_this_expert)  # type: ignore |
421 | | -        gate_up = _CPU_MOE_ACT[activation].forward_native(gate_up) |
| 426 | +        gate_up = _CPU_MOE_ACT_FN[activation](gate_up) |
422 | 427 |         expert_out = layer.down_linear[i](gate_up)  # type: ignore |
423 | 428 |         outputs.append(expert_out) |
424 | 429 |         start_idx = end_idx |
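For reference, a minimal standalone sketch of the SwigluOAI helper added above, with an illustrative shape check (the 4×16 input is made up for the example): even-indexed channels act as the gate, odd-indexed channels as the up projection, so the output has half as many channels as the gate_up input fed to the per-expert down projection.

```python
import torch


def _swigluoai_forward_native(
    x: torch.Tensor,
    alpha: float = 1.702,
    limit: float = 7.0,
) -> torch.Tensor:
    # Same math as the helper in the diff: interleaved gate/up channels,
    # clamped, then SwiGLU-style gating of (up + 1).
    gate, up = x[..., ::2], x[..., 1::2]
    gate = gate.clamp(min=None, max=limit)
    up = up.clamp(min=-limit, max=limit)
    glu = gate * torch.sigmoid(gate * alpha)
    return (up + 1) * glu


# Illustrative shapes: 4 tokens, 16 interleaved gate/up channels -> 8 output channels.
x = torch.randn(4, 16)
out = _swigluoai_forward_native(x)
assert out.shape == (4, 8)
```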
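And a sketch of the dispatch pattern the change moves to: a module-level dict of plain callables keyed by activation name, so nothing instantiates a CustomOp (and hence nothing calls get_current_vllm_config()) at import time. The _silu_and_mul_native stand-in below uses the standard split-in-half silu-and-mul formulation and is only assumed to match SiluAndMul.forward_native; names and shapes are illustrative, not taken from the change.

```python
from collections.abc import Callable

import torch
import torch.nn.functional as F


def _silu_and_mul_native(x: torch.Tensor) -> torch.Tensor:
    # Split layout: first half of the last dim is the gate, second half is up.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]


# Plain functions, not CustomOp instances, so building this table is safe
# before the vLLM config exists.
_ACT_FN: dict[str, Callable[[torch.Tensor], torch.Tensor]] = {
    "silu": _silu_and_mul_native,
    # "swigluoai": _swigluoai_forward_native,  # helper from the sketch above
}

gate_up = torch.randn(4, 16)
assert "silu" in _ACT_FN, "silu is not supported."
hidden = _ACT_FN["silu"](gate_up)  # replaces _CPU_MOE_ACT["silu"].forward_native(gate_up)
assert hidden.shape == (4, 8)
```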