
Commit caea443

mawong-amd authored and Doug Lehr committed
Integrate mxfp4 MoE native kernels
Signed-off-by: Matthew Wong <[email protected]>
1 parent 48dc133 commit caea443

File tree

4 files changed: +80 -20 lines changed


vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 67 additions & 1 deletion
@@ -44,6 +44,11 @@
 
 from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled
 
+try:
+    from aiter.ops.triton.moe_op_mxfp4 import _fused_moe_kernel_mxfp4
+except ImportError:
+    _fused_moe_kernel_mxfp4 = None
+
 logger = init_logger(__name__)
 
 
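For context, the guarded import above means _fused_moe_kernel_mxfp4 is simply None on builds without AITER's Triton ops. A minimal standalone sketch of that pattern (the has_native_mxfp4_kernel helper is illustrative only, not part of this commit):

try:
    from aiter.ops.triton.moe_op_mxfp4 import _fused_moe_kernel_mxfp4
except ImportError:
    # Native mxfp4 MoE kernel unavailable; callers must fall back or emulate.
    _fused_moe_kernel_mxfp4 = None


def has_native_mxfp4_kernel() -> bool:
    # Feature-check before dispatching to the native kernel path.
    return _fused_moe_kernel_mxfp4 is not None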
@@ -507,6 +512,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
                             use_int8_w8a8: bool,
                             use_int8_w8a16: bool,
                             use_int4_w4a16: bool,
+                            use_mxfp4_w4a4: bool,
                             per_channel_quant: bool,
                             block_shape: Optional[list[int]] = None,
                             B_bias: Optional[torch.Tensor] = None) -> None:
@@ -524,6 +530,9 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
     elif use_int8_w8a16 or use_int4_w4a16:
         assert B_scale is not None
         assert block_shape is None or block_shape[0] == 0
+    elif use_mxfp4_w4a4:
+        assert A_scale is not None
+        assert B_scale is not None
     else:
         assert A_scale is None
         assert B_scale is None
@@ -611,6 +620,55 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
             use_int8_w8a16=use_int8_w8a16,
             **config,
         )
+    elif use_mxfp4_w4a4:
+        ONE = torch.ones(B.size(0), dtype=torch.float32, device=A.device)
+        # overwrite config with a static one for now
+        config = {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 128,
+            "GROUP_SIZE_M": 4,
+            "num_warps": 8,
+            "num_stages": 2,
+            "waves_per_eu": 0,
+            "matrix_instr_nonkdim": 16,
+            "kpack": 1,
+        }
+        _fused_moe_kernel_mxfp4[grid](
+            A,
+            B,
+            C,
+            ONE[0],
+            ONE,
+            A_scale,
+            B_scale,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            B.size(1),
+            A.size(1),
+            EM,
+            num_tokens,
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(2),
+            B.stride(1),
+            C.stride(1),
+            C.stride(2),
+            A_scale.stride(0),
+            A_scale.stride(1),
+            B_scale.stride(0),
+            B_scale.stride(2),
+            B_scale.stride(1),
+            MUL_ROUTED_WEIGHT=mul_routed_weight,
+            top_k=top_k,
+            compute_type=compute_type,
+            SWIZZLE_MX_A=False,
+            SWIZZLE_MX_B=False,
+            **config,
+        )
     else:
         config = config.copy()
         BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K")
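Note that this branch skips the tuned-config lookup and launches _fused_moe_kernel_mxfp4 with a hard-coded tile configuration and MX-scale swizzling disabled (SWIZZLE_MX_A/B=False). A hypothetical helper restating that choice (name and structure are illustrative, not from the commit):

def static_mxfp4_moe_config() -> dict:
    # Mirrors the fixed launch parameters used by the mxfp4 branch above;
    # per the in-diff comment, a tuned config could replace this later.
    return {
        "BLOCK_SIZE_M": 128,
        "BLOCK_SIZE_N": 128,
        "BLOCK_SIZE_K": 128,
        "GROUP_SIZE_M": 4,
        "num_warps": 8,
        "num_stages": 2,
        "waves_per_eu": 0,
        "matrix_instr_nonkdim": 16,
        "kpack": 1,
    }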
@@ -1570,7 +1628,7 @@ def fused_experts_impl(
     else:
         out_hidden_states = torch.empty_like(hidden_states)
 
-    if use_mxfp4_w4a4:
+    if use_mxfp4_w4a4 and not current_platform.supports_mx():
         # Weight has to be dequantized for mxfp4 emulation.
         w1 = dequant_mxfp4(w1, w1_scale, hidden_states.dtype)
         w1_scale = None
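The condition above narrows weight dequantization to the emulation case: when current_platform.supports_mx() is true, weights stay in mxfp4 and the flag is forwarded to the Triton kernel at the call sites below. A standalone restatement of that gating (helper names are illustrative, not part of the commit):

def use_native_mxfp4(use_mxfp4_w4a4: bool, supports_mx: bool) -> bool:
    # Native kernel path: mxfp4 requested and the platform exposes MX support.
    return use_mxfp4_w4a4 and supports_mx


def use_mxfp4_emulation(use_mxfp4_w4a4: bool, supports_mx: bool) -> bool:
    # Emulation path: dequantize weights and QDQ activations in high precision.
    return use_mxfp4_w4a4 and not supports_mx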
@@ -1629,6 +1687,8 @@ def fused_experts_impl(
                                 use_int8_w8a8=use_int8_w8a8,
                                 use_int8_w8a16=use_int8_w8a16,
                                 use_int4_w4a16=use_int4_w4a16,
+                                use_mxfp4_w4a4=use_mxfp4_w4a4
+                                and current_platform.supports_mx(),
                                 per_channel_quant=per_channel_quant,
                                 block_shape=block_shape,
                                 B_bias=w1_bias)
@@ -1687,6 +1747,8 @@ def swiglu_oai(gate_up):
                                 use_int8_w8a8=use_int8_w8a8,
                                 use_int8_w8a16=use_int8_w8a16,
                                 use_int4_w4a16=use_int4_w4a16,
+                                use_mxfp4_w4a4=use_mxfp4_w4a4
+                                and current_platform.supports_mx(),
                                 per_channel_quant=per_channel_quant,
                                 block_shape=block_shape,
                                 B_bias=w2_bias)
@@ -1994,6 +2056,8 @@ def apply(
                 use_int8_w8a8=self.use_int8_w8a8,
                 use_int8_w8a16=self.use_int8_w8a16,
                 use_int4_w4a16=self.use_int4_w4a16,
+                use_mxfp4_w4a4=self.use_mxfp4_w4a4
+                and current_platform.supports_mx(),
                 per_channel_quant=self.per_act_token_quant,
                 block_shape=self.block_shape,
                 B_bias=None  # TODO support B_bias
@@ -2027,6 +2091,8 @@ def apply(
                 use_int8_w8a8=self.use_int8_w8a8,
                 use_int8_w8a16=self.use_int8_w8a16,
                 use_int4_w4a16=self.use_int4_w4a16,
+                use_mxfp4_w4a4=self.use_mxfp4_w4a4
+                and current_platform.supports_mx(),
                 per_channel_quant=self.per_act_token_quant,
                 block_shape=self.block_shape,
                 B_bias=None  # TODO support B_bias

vllm/model_executor/layers/fused_moe/utils.py

Lines changed: 8 additions & 5 deletions
@@ -17,6 +17,9 @@
 from vllm.utils import cdiv
 from vllm.utils.flashinfer import fp4_quantize
 
+if current_platform.supports_mx():
+    from aiter.ops.triton.quant import dynamic_mxfp4_quant
+
 
 @triton.jit
 def _count_expert_num_tokens(topk_ids_ptr, expert_num_tokens_ptr, num_experts,
@@ -167,14 +170,14 @@ def _mxfp4_quantize(
     A_scale: Optional[torch.Tensor],
     per_act_token_quant: bool,
     block_shape: Optional[list[int]] = None,
-) -> tuple[torch.Tensor, None]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     assert block_shape is None
     if not current_platform.supports_mx():
         A = quant_dequant_mxfp4(A)
-    else:
-        raise NotImplementedError()
-
-    return A, None
+        return A, A_scale
+    if A_scale is not None:
+        return A, A_scale
+    return dynamic_mxfp4_quant(A)
 
 
 def moe_kernel_quantize_input(
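On MX-capable platforms _mxfp4_quantize now returns a real activation scale instead of raising: it passes through a caller-provided scale, or calls dynamic_mxfp4_quant to produce one; without MX support it still QDQ-emulates. A standalone sketch of that control flow with stubbed quant ops (the stub bodies, the sketch name, and the supports_mx argument are illustrative, not vLLM code):

from typing import Optional

import torch


def quant_dequant_mxfp4(A: torch.Tensor) -> torch.Tensor:
    # Stub: stands in for the emulation quantize-dequantize op.
    return A


def dynamic_mxfp4_quant(A: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # Stub: stands in for aiter.ops.triton.quant.dynamic_mxfp4_quant, which
    # returns the quantized activations together with their scales.
    return A, torch.ones(A.shape[0], dtype=torch.float32)


def mxfp4_quantize_sketch(
    A: torch.Tensor,
    A_scale: Optional[torch.Tensor],
    supports_mx: bool,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
    if not supports_mx:
        # Emulation: QDQ the activations, keep any caller-provided scale.
        return quant_dequant_mxfp4(A), A_scale
    if A_scale is not None:
        # Activations already quantized upstream; pass both through.
        return A, A_scale
    # Native path: dynamically quantize and return the generated scale.
    return dynamic_mxfp4_quant(A)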

vllm/model_executor/layers/quantization/quark/quark_moe.py

Lines changed: 1 addition & 7 deletions
@@ -282,13 +282,7 @@ def __init__(self, weight_config: dict[str, Any], input_config: dict[str,
                 "QDQ (quantize and dequantize) will be used, with the linear "
                 "layers computed in high precision.")
         else:
-            self.emulate = True
-            logger.warning_once(
-                "The current platform supports native MXFP4 "
-                "computation, but kernels are not yet integrated in vLLM. "
-                "Simulated weight dequantization and activation "
-                "QDQ (quantize and dequantize) will be used, with the linear "
-                "layers computed in high precision.")
+            self.emulate = False
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,

vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py

Lines changed: 4 additions & 7 deletions
@@ -144,13 +144,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         )
         weight_quantizer.scale.data = layer.weight_scale.data
 
-        if not envs.VLLM_QUARK_EMU_MEM_OPT:
-            layer.weight = torch.nn.Parameter(
-                weight_quantizer(layer.weight.data).to(self.out_dtype),
-                requires_grad=False,
-            )
-        else:
-            self.weight_quantizer = weight_quantizer
+        layer.weight = torch.nn.Parameter(
+            weight_quantizer(layer.weight.data).to(self.out_dtype),
+            requires_grad=False,
+        )
         layer.weight_scale = None
 
         # This call is necessary to release the scales memory.
