
Commit 517f8db

zyongye authored and simon-mo committed
[gpt-oss] flashinfer mxfp4 (vllm-project#22339)
Signed-off-by: simon-mo <[email protected]>
Signed-off-by: Yongye Zhu <[email protected]>
Co-authored-by: simon-mo <[email protected]>
1 parent d03d33d commit 517f8db

File tree

5 files changed: +453 -3 lines changed


vllm/envs.py

Lines changed: 12 additions & 0 deletions
@@ -154,6 +154,8 @@
     VLLM_ENABLE_RESPONSES_API_STORE: bool = False
     VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False
     VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False
+    VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
+    VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False


 def get_default_cache_root():
@@ -932,6 +934,16 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_USE_FLASHINFER_MOE_FP4":
     lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))),

+    # If set to 1, use the FlashInfer
+    # MXFP8 (activation) x MXFP4 (weight) MoE backend.
+    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":
+    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
+
+    # If set to 1, use the FlashInfer
+    # BF16 (activation) x MXFP4 (weight) MoE backend.
+    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16":
+    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),
+
     # Control the cache sized used by the xgrammar compiler. The default
     # of 512 MB should be enough for roughly 1000 JSON schemas.
     # It can be changed with this variable if needed for some reason.
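For reference, a minimal sketch of how the two new flags are meant to be consumed, following the same bool(int(os.getenv(...))) pattern the diff adds; the os.environ assignment below is illustration only, not part of the commit:

import os

# The diff registers one lambda per flag; each parses "0"/"1" into a bool.
flags = {
    "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":
    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16":
    lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),
}

# Example: enable the MXFP8 (activation) x MXFP4 (weight) FlashInfer MoE backend.
os.environ["VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8"] = "1"
assert flags["VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8"]() is True
assert flags["VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"]() is False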

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 29 additions & 3 deletions
@@ -33,7 +33,8 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
-from vllm.utils import direct_register_custom_op, has_deep_ep, has_pplx
+from vllm.utils import (direct_register_custom_op, has_deep_ep, has_pplx,
+                        round_up)
 from vllm.utils.flashinfer import has_flashinfer

 if current_platform.is_cuda_alike():
@@ -719,6 +720,12 @@ def __init__(

         self.global_num_experts = num_experts + num_redundant_experts

+        # we padding globally so EP buffer allocation works
+        if quant_config and quant_config.get_name() == "mxfp4" and (
+                envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+                or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16):
+            hidden_size = round_up(hidden_size, 256)
+
         # For smuggling this layer into the fused moe custom op
         compilation_config = vllm_config.compilation_config
         if prefix in compilation_config.static_forward_context:
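The new block above pads hidden_size up to a multiple of 256 whenever mxfp4 quantization is combined with either FlashInfer MXFP4 backend, so expert-parallel buffer allocation sees an aligned size. A minimal sketch of the arithmetic, assuming round_up from vllm.utils is the usual ceiling-to-multiple helper:

def round_up(x: int, multiple: int) -> int:
    # Assumed behavior of vllm.utils.round_up: smallest multiple of `multiple` >= x.
    return ((x + multiple - 1) // multiple) * multiple

assert round_up(2880, 256) == 3072  # e.g. a 2880-wide hidden state is padded to 3072
assert round_up(4096, 256) == 4096  # already-aligned sizes are left unchanged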
@@ -1064,6 +1071,18 @@ def weight_loader(self,
                       shard_id: str,
                       expert_id: int,
                       return_success: bool = False) -> Optional[bool]:
+
+        if self.quant_config and self.quant_config.get_name() == "mxfp4":
+            # (FIXME) for gpt-oss all experts are combined
+            if "bias" in weight_name:
+                dim1 = loaded_weight.shape[1]
+                param.data[:, :dim1].copy_(loaded_weight)
+            else:
+                dim1 = loaded_weight.shape[1]
+                dim2 = loaded_weight.shape[2]
+                param.data[:, :dim1, :dim2].copy_(loaded_weight)
+            return True if return_success else None
+
         expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
         if expert_id == -1:
             # Failed to load this param since it's not local to this rank
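Because hidden_size may have been padded in __init__, the mxfp4 branch above copies the checkpoint tensor into the leading slice of the (possibly larger) parameter and returns early. A standalone sketch of that partial-copy pattern; the shapes below are made up for illustration:

import torch

# Hypothetical shapes: weights for all experts combined, padded 90 -> 96 on dim 1.
param = torch.nn.Parameter(torch.zeros(32, 96, 64), requires_grad=False)
loaded_weight = torch.randn(32, 90, 64)  # checkpoint tensor at the original size

dim1, dim2 = loaded_weight.shape[1], loaded_weight.shape[2]
param.data[:, :dim1, :dim2].copy_(loaded_weight)  # fill only the leading slice

assert torch.equal(param.data[:, :dim1, :dim2], loaded_weight)
assert param.data[:, dim1:, :].abs().sum() == 0   # the padded tail stays zero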
@@ -1476,13 +1495,20 @@ def maybe_all_reduce_tensor_model_parallel(

     def forward(self, hidden_states: torch.Tensor,
                 router_logits: torch.Tensor):
+        og_hidden_states = hidden_states.shape[-1]
+        if self.hidden_size != og_hidden_states:
+            hidden_states = F.pad(hidden_states,
+                                  (0, self.hidden_size - og_hidden_states),
+                                  mode='constant',
+                                  value=0.0)
         # TODO: Once the OOM issue for the TPU backend is resolved, we will
         # switch to using the moe_forward custom op.
         if current_platform.is_tpu():
             return self.forward_impl(hidden_states, router_logits)
         else:
-            return torch.ops.vllm.moe_forward(hidden_states, router_logits,
-                                              self.layer_name)
+            return torch.ops.vllm.moe_forward(
+                hidden_states, router_logits,
+                self.layer_name)[..., :og_hidden_states]

     def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                              full_router_logits: torch.Tensor):
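The forward() change mirrors the constructor padding: inputs narrower than the padded self.hidden_size are zero-padded on the last dimension before dispatch, and the result is sliced back so callers never see the extra columns. A self-contained sketch of that pad/slice round trip, with an identity op standing in for torch.ops.vllm.moe_forward:

import torch
import torch.nn.functional as F

padded_hidden_size = 3072             # e.g. round_up(2880, 256)
hidden_states = torch.randn(4, 2880)  # (num_tokens, original hidden size)

og_hidden_states = hidden_states.shape[-1]
if padded_hidden_size != og_hidden_states:
    hidden_states = F.pad(hidden_states,
                          (0, padded_hidden_size - og_hidden_states),
                          mode='constant', value=0.0)

out = hidden_states                   # stand-in for the fused MoE custom op
out = out[..., :og_hidden_states]     # strip the padding from the output
assert out.shape == (4, 2880)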

vllm/model_executor/layers/quantization/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -37,6 +37,7 @@
     "auto-round",
     "rtn",
     "inc",
+    "mxfp4",
 ]
 QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))

@@ -110,6 +111,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .marlin import MarlinConfig
     from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config
     from .moe_wna16 import MoeWNA16Config
+    from .mxfp4 import Mxfp4Config
     from .neuron_quant import NeuronQuantConfig
     from .ptpc_fp8 import PTPCFp8Config
     from .qqq import QQQConfig
@@ -148,6 +150,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "auto-round": AutoRoundConfig,
         "rtn": RTNConfig,
         "inc": INCConfig,
+        "mxfp4": Mxfp4Config,
     }
     # Update the `method_to_config` with customized quantization methods.
     method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG)
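With these three additions, "mxfp4" becomes a recognized quantization method name: it appears in QuantizationMethods and get_quantization_config() resolves it to the new Mxfp4Config through method_to_config. A hedged usage sketch (exact call sites vary across vLLM):

from vllm.model_executor.layers.quantization import get_quantization_config

# After this commit, "mxfp4" resolves to the Mxfp4Config class added in mxfp4.py.
quant_cls = get_quantization_config("mxfp4")
print(quant_cls.__name__)  # expected: "Mxfp4Config"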
