
Commit fa6e599

[Bugfix] Fix _CPU_MOE_ACT AssertionError when vLLM config not set (vllm-project#32777)
Signed-off-by: Karan Bansal <[email protected]>
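
For context on the failure mode: the old code populated `_CPU_MOE_ACT` lazily via a dict subclass whose `__missing__` instantiated activation op classes, and those constructors call `get_current_vllm_config()`, so indexing the dict before the config was set raised an AssertionError. The sketch below reproduces that pattern in isolation and contrasts it with the dict-of-callables approach this commit adopts; `GlobalConfigOp`, `get_current_config`, and `_CONFIG` are hypothetical stand-ins, not vLLM APIs.

```python
import torch

_CONFIG = None  # hypothetical global config, analogous to vLLM's current config


def get_current_config():
    # Stand-in for a config accessor that asserts the config is already set.
    assert _CONFIG is not None, "config not set"
    return _CONFIG


class GlobalConfigOp:
    # Stand-in for a CustomOp-style class whose __init__ reads global config.
    def __init__(self):
        self.config = get_current_config()

    @staticmethod
    def forward_native(x: torch.Tensor) -> torch.Tensor:
        # Gate-and-multiply activation: SiLU on the first half, times the second half.
        d = x.shape[-1] // 2
        return torch.nn.functional.silu(x[..., :d]) * x[..., d:]


class _LazyOpDict(dict):
    # Old pattern: instantiate the op class on first access.
    _factories = {"silu": GlobalConfigOp}

    def __missing__(self, key):
        self[key] = self._factories[key]()  # raises AssertionError if config not set
        return self[key]


# New pattern: map names to stateless callables; no instantiation, no config read.
_ACT_FN = {"silu": GlobalConfigOp.forward_native}

x = torch.randn(4, 8)
y = _ACT_FN["silu"](x)       # works with no config set
# _LazyOpDict()["silu"]      # would raise AssertionError: config not set
print(y.shape)               # torch.Size([4, 4])
```

The fix below follows the second pattern: a plain dict mapping activation names to stateless callables, so nothing config-dependent runs at import or lookup time.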
1 parent: 7ef5873

File tree: 2 files changed, +26 −26 lines

tests/kernels/moe/test_cpu_fused_moe.py

Lines changed: 2 additions & 7 deletions
@@ -6,7 +6,7 @@
 
 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
-from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT
+from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT_FN
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed
 
@@ -68,12 +68,7 @@ def ref_fused_moe(
             tokens_for_this_expert, curr_w13, curr_w13_bias
         )
         # Note: to simulate the kernel implementation
-        gate_up = (
-            _CPU_MOE_ACT[activation]
-            .forward_native(gate_up)
-            .to(dtype=input.dtype)
-            .float()
-        )
+        gate_up = _CPU_MOE_ACT_FN[activation](gate_up).to(dtype=input.dtype).float()
         expert_out = torch.nn.functional.linear(gate_up, curr_w2, curr_w2_bias)
 
         outputs.append(expert_out)

vllm/model_executor/layers/fused_moe/cpu_fused_moe.py

Lines changed: 24 additions & 19 deletions
@@ -8,33 +8,38 @@
 
 from vllm import _custom_ops as ops
 from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
-from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
+from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.quantization.utils.layer_utils import replace_parameter
 from vllm.utils.torch_utils import direct_register_custom_op
 
 _CPU_MOE_LAYER_CACHE = {}
 
 
-class _LazyActivationDict(dict):
-    """Lazily instantiate activation functions on first access.
+def _swigluoai_forward_native(
+    x: torch.Tensor,
+    alpha: float = 1.702,
+    limit: float = 7.0,
+) -> torch.Tensor:
+    """PyTorch-native implementation of SwigluOAIAndMul.forward_native.
 
-    Avoids triggering CustomOp.__init__() at module import time,
-    which would call get_current_vllm_config() before config is set.
+    Standalone function to avoid instantiating SwigluOAIAndMul (a CustomOp),
+    which would trigger get_current_vllm_config() before config is set.
     """
+    gate, up = x[..., ::2], x[..., 1::2]
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+    glu = gate * torch.sigmoid(gate * alpha)
+    gated_output = (up + 1) * glu
+    return gated_output
 
-    _factories: dict[str, type[SiluAndMul] | type[SwigluOAIAndMul]] = {
-        "silu": SiluAndMul,
-        "swigluoai": SwigluOAIAndMul,
-    }
 
-    def __missing__(self, key: str) -> SiluAndMul | SwigluOAIAndMul:
-        if key not in self._factories:
-            raise KeyError(f"{key} is not a supported activation")
-        self[key] = self._factories[key]()
-        return self[key]
-
-
-_CPU_MOE_ACT = _LazyActivationDict()
+# Map activation names to their native forward functions.
+# Uses static methods or standalone functions to avoid instantiating CustomOp
+# classes, which would call get_current_vllm_config() before config is set.
+_CPU_MOE_ACT_FN: dict[str, Callable[[torch.Tensor], torch.Tensor]] = {
+    "silu": SiluAndMul.forward_native,
+    "swigluoai": _swigluoai_forward_native,
+}
 
 
 def grouped_topk(
@@ -230,7 +235,7 @@ def __call__(
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
-        assert activation in _CPU_MOE_ACT._factories, f"{activation} is not supported."
+        assert activation in _CPU_MOE_ACT_FN, f"{activation} is not supported."
        assert not apply_router_weight_on_input
 
         topk_weights, topk_ids = select_experts(
@@ -418,7 +423,7 @@ def cpu_fused_moe_torch(
         tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
 
         gate_up = layer.gate_up_linear[i](tokens_for_this_expert)  # type: ignore
-        gate_up = _CPU_MOE_ACT[activation].forward_native(gate_up)
+        gate_up = _CPU_MOE_ACT_FN[activation](gate_up)
         expert_out = layer.down_linear[i](gate_up)  # type: ignore
         outputs.append(expert_out)
         start_idx = end_idx
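
As a quick standalone sanity check of the SwiGLU-OAI math added above, the body of `_swigluoai_forward_native` can be exercised outside vLLM; the copy below only adds descriptive comments, and the input sizes are illustrative.

```python
import torch


def swigluoai_forward_native(
    x: torch.Tensor,
    alpha: float = 1.702,
    limit: float = 7.0,
) -> torch.Tensor:
    # Gate and up projections are interleaved along the last dimension.
    gate, up = x[..., ::2], x[..., 1::2]
    gate = gate.clamp(min=None, max=limit)    # clamp gate from above only
    up = up.clamp(min=-limit, max=limit)      # clamp up symmetrically
    glu = gate * torch.sigmoid(gate * alpha)  # SiLU-style gating scaled by alpha
    return (up + 1) * glu


x = torch.randn(2, 16)                 # last dim must be even
out = swigluoai_forward_native(x)
print(out.shape)                       # torch.Size([2, 8]): half the last dim
```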
