232 commits
9fb66ff
feat: bf16 x mxfp4 cutlass fused moe for hopper
djmmoss Aug 18, 2025
07e376d
[CI][Entrypoints]: add filter to generation to filter out invalid too…
wseaton Aug 14, 2025
3f7d7ac
[CI] Fix `tests/distributed/test_ca_buffer_sharing.py` (#22849)
ilmarkov Aug 14, 2025
eb4cfac
[CI] remove flaky v0 test (#22864)
robertgshaw2-redhat Aug 14, 2025
4e78f74
vLLM Benchmark suite improvement (#22119)
louie-tsai Aug 14, 2025
62df10f
[Bugfix] Fix `PixtralHFImagePixelInputs` dynamic shape check (#22827)
Isotr0py Aug 14, 2025
275a334
[BugFix] Threadsafe close async zmq sockets (#22877)
njhill Aug 14, 2025
95ba0df
Remove Phi 4 Flash configuration workaround (#22723)
hmellor Aug 14, 2025
70bcc4b
[Bugfix] Add reset prefix cache for online serving (#22726)
iAmir97 Aug 14, 2025
52729d5
[Doc] fix dead link (#22898)
dtrifiro Aug 14, 2025
e396d2f
[CI] Re-enable transcriptions `test_long_audio_request` (#22890)
NickLucche Aug 14, 2025
16ef143
[Perf] Dont create unnecessary pooling params (#22876)
LucasWilkinson Aug 14, 2025
f611b92
[Model] Modify the gate implementation of glm4_moe (#22832)
jeejeelee Aug 14, 2025
a219618
[Bugfix] Replace custom Encoding class with BatchEncoding in MistralT…
ZJY0516 Aug 14, 2025
65ad494
[Bugfix] Fix parsing of `--disable-mm-preprocessor-cache` (#22909)
DarkLight1337 Aug 14, 2025
72fda97
[CI] [Hybrid] Bump min transformers version for Bamba and Jamba (#22…
tdoublep Aug 14, 2025
c446fb4
[Kernel] [Quantization] Add MXFP4 and bias support for marlin kernel …
jinzhen-lin Aug 14, 2025
adb7678
docs: update fastsafetensors usage instructions (#22891)
NirLevy98 Aug 14, 2025
d24c6bb
[CI] Temporarily disable flaky test (#22930)
LucasWilkinson Aug 14, 2025
79e4f5a
[Kernel] Add nvfp4 gemm flashinfer backends (#22346)
nvjullin Aug 14, 2025
3d2b6c3
[Quantization]: Support compressed-tensors mixed-precision model load…
dsikka Aug 14, 2025
ecf34ca
[Core] Return final response for aborted requests from `AsyncLLM.gene…
njhill Aug 14, 2025
e27479b
[BugFix] Fix initial DP request load imbalance (#22910)
njhill Aug 14, 2025
b1ae1e2
[Bugfix] use flash attn on sm90 (#22933)
zyongye Aug 14, 2025
b17cb00
[Kernel] Add cuda kernel for gpt_oss activation (#22538)
jeejeelee Aug 15, 2025
3c2693d
Revert "[Kernel] Add cuda kernel for gpt_oss activation" (#22948)
simon-mo Aug 15, 2025
496e3fe
[BugFix][KVConn] Fix use of `get_required_kvcache_layout` (#22734)
njhill Aug 15, 2025
465686c
[BugFix] Fix port lookup in internal DP LB tests (#22252)
njhill Aug 15, 2025
6caa9f2
[CI Perf] Prune tests in `tests/kernels/quantization/` (#22942)
mgoin Aug 15, 2025
5fd03f5
[CI Perf] Prune tests in `tests/kernels/moe/` (#22939)
mgoin Aug 15, 2025
b725016
[CI Perf] Prune tests in `tests/kernels/attention/` (#22936)
mgoin Aug 15, 2025
04e9109
refactor: Change scaling factors calculation for flashinfer FusedMoE …
amirkl94 Aug 15, 2025
2e72687
[Feature] Full Cuda Graph Support for Cutlass MLA and 6% E2E Throughp…
yewentao256 Aug 15, 2025
93ff0c3
[Mamba] - refactor: Renamed mamba_attn to mamba2_attn (#22818)
Josephasafg Aug 15, 2025
0bfdd80
Revert "[ROCm][AITER] Support AITER Rope ops in RotaryEmbedding Modul…
tjtanaa Aug 15, 2025
581e0c0
[P/D]Provide bucket algorithm rate limiter for proxy_server (#22643)
frankie-ys Aug 15, 2025
041fa23
[CI] Pooling models mteb test uses enforce_eager (#22878)
noooop Aug 15, 2025
aa2eb6a
[V1] - Split Prefill and Decode for Mamba1 models (#22653)
amirai21 Aug 15, 2025
38c8f87
[Bugfix] Unquote file uri before reading image (#22912)
sayandipdutta Aug 15, 2025
0ce4673
[Bugfix] fix cuda 12.6 and 11.8 build (#22952)
jinzhen-lin Aug 15, 2025
8120bd7
[MM] Allow skipping memory profiling for multimodal models. (#22950)
Aug 15, 2025
11652f2
Improve multimodal hasher performance for re-used Image prompts (#22825)
p88h Aug 15, 2025
a857d8d
[V1] [Hybrid] Support using float32 for state in Hybrid Models (Mamba…
tdoublep Aug 15, 2025
76c2fa8
[Misc] Ignore ep_kernels_workspace (#22807)
jeejeelee Aug 15, 2025
af610df
[CI] Remove duplicated docs build from buildkite (#22924)
hmellor Aug 15, 2025
5904082
[Frontend] Expose do_log_stats interval to env (#22905)
Csrayz Aug 15, 2025
5f48728
[Core] Allow full cudagraph with separate attention routines and orth…
fhl2000 Aug 15, 2025
50c1a08
[V0 Deprecation] Remove advance_step (#22969)
WoosukKwon Aug 15, 2025
8db45eb
[BugFix] Skip the Q component for QKVParallelLinear in the case of QK…
sstamenk Aug 15, 2025
51b5895
[FIXBUG] Correctly Apply Grammar Bitmask in Mixed Batches (#22896)
JartX Aug 15, 2025
6c4c5ea
[Benchmarks] Include image data when ShareGPT4V dataset is used. (#22…
huachenheli Aug 15, 2025
04c52c0
[Structured Output] Make the output of structured output example more…
shen-shanshan Aug 15, 2025
ba6499c
[Kernels] Clean up FusedMoeMethodBase and modular kernel setup. Remo…
bnellnm Aug 15, 2025
ab544cd
[Model] Granite-4 support loading quantized checkpoint (#22925)
cyang49 Aug 15, 2025
47d4185
[Log] Debug Once for Randomizing dummy data for DP Rank (#22860)
yewentao256 Aug 15, 2025
612eab5
[Core] direct indexing on self.block_table_np in compute_slot_mapping…
linzebing Aug 15, 2025
627c147
[Bugfix] Added more env vars to hash (#22449)
nvjullin Aug 15, 2025
a22c39f
Use regex in convert-results-json-to-markdown.py (#22989)
mgoin Aug 15, 2025
a57f6d2
[CI] Speed up Whisper tests by reusing server (#22859)
mgoin Aug 15, 2025
eec4da9
[Fix] enable swap_ab for pplx problem size computation (#22991)
shixianc Aug 15, 2025
af8ffba
Add PrefixRepetitionRandomDataset to `vllm bench serve` datasets (#20…
eicherseiji Aug 15, 2025
4ab6bd4
minor: zero workspace buffer init for flashinfer trtllm-gen attn (#22…
yyihuang Aug 15, 2025
9b6683f
[Attention] FA3 Attention Sinks Perf Boost (#22478)
LucasWilkinson Aug 15, 2025
8d808ce
[BugFix] Fix regression caused by mamba state dtype PR (#22998)
tdoublep Aug 15, 2025
456c8cf
ci: Add CUDA + arm64 release builds (#21201)
seemethere Aug 15, 2025
02200dc
[Structured Outputs] [Bug] Fix misalignment in apply_grammar_bitmask …
rishitdholakia13 Aug 15, 2025
6c4a7f2
[BugFix] Handle case where async utility call is cancelled (#22996)
njhill Aug 15, 2025
21ead32
[v1] Move block_hashes from KVCacheManager to Request.block_hashes (#…
orozery Aug 15, 2025
4654781
Support multiple attention groups for KV sharing (#22672)
sarckk Aug 15, 2025
e67c504
[BugFix] Make `run_once` thread-safe (#22978)
oraluben Aug 15, 2025
f2a8c6f
[Misc] Support passing multiple request ids at once to `AsyncLLM.abor…
njhill Aug 16, 2025
f1e219c
[Kernel] Simplify `get_kv_cache_layout` and cache `use_trtllm_attenti…
NickLucche Aug 16, 2025
98b4d43
[Bugfix] Fix DeepSeek MTP (#22934)
benchislett Aug 16, 2025
a60d0c7
[Frontend] Avoid list copies in `serving_chat.py` (#22947)
njhill Aug 16, 2025
d854c2a
[V1] support min_tokens for detokener (#22014)
calvin0327 Aug 16, 2025
5b5d22e
[misc] nsys profile output kernel classifier and visualizer (#22971)
gracehonv Aug 16, 2025
3cee7a3
[XPU]avoid circular import during XPU init (#23017)
jikunshang Aug 16, 2025
2886073
[Build] Env var to disable sccache (#22968)
LucasWilkinson Aug 16, 2025
1935c34
[BugFix] Add support for loading prompt embeds tensors serialized on …
qthequartermasterman Aug 16, 2025
8c8aaf1
[Misc] Add --save-dir option to benchmark_moe (#23020)
jeejeelee Aug 16, 2025
dc88091
[Multimodal] Update Tensor schema test to cover arbitrary shape mm in…
Isotr0py Aug 16, 2025
28234fe
[Core] Make cudagraph check cuda platform only (#23005)
yaochengji Aug 16, 2025
cbc33c1
[CI][Bugfix] Skip Ovis2 generation test because of broken remote code…
Isotr0py Aug 16, 2025
efd10c3
Add docs for PrefixRepetitionDataset + enable usage with `vllm bench …
eicherseiji Aug 16, 2025
a611c4b
[Refactor] Allow optional MultiModalKwargsItem in IPC (#23022)
DarkLight1337 Aug 16, 2025
0025ac6
[New Model]mBART model (#22883)
princepride Aug 16, 2025
10bd3f2
Fix handling of `max_num_batched_tokens` for pooling tasks (#23004)
maxdebayser Aug 16, 2025
0691dba
[Frontend] Added support for HermesToolParser for models without spec…
minpeter Aug 16, 2025
e45076e
[Bugfix gpt-oss] Fix float32 convert for flashinfer sink support (#23…
mgoin Aug 16, 2025
40a0d51
[Flaky CI] Increase timeout tolerance for test_mp_crash_detection+tes…
mgoin Aug 16, 2025
ed52e53
[Kernel/Quant] Remove AQLM (#22943)
mgoin Aug 16, 2025
9931ad7
[V1] Logits processors extensibility (#19912)
afeldman-nm Aug 16, 2025
1330105
[Bugfix] fix qwen3 moe fp8 accuracy issue (#23031)
jinzhen-lin Aug 17, 2025
0c1d8f7
[UX] Separate marlin moe config logic from triton moe (#23006)
mgoin Aug 17, 2025
b1a3260
[Refactor] Defer tensor data construction in MultiModalKwargs (#23030)
DarkLight1337 Aug 17, 2025
049cef9
[Misc] method name typo fix (#23042)
andyxning Aug 17, 2025
9690747
[Kernel] Add cuda kernel for gpt_oss activation (#22951)
jeejeelee Aug 17, 2025
1e8a902
[Bugfix] should use stack instead of concat (#22972)
947132885 Aug 17, 2025
db8f535
[Misc] fix typo in the multimodal doc (#23051)
KevinZeng08 Aug 17, 2025
e6bc394
[BugFix] Fix for IMA in FA3 varlen combine (#22967)
LucasWilkinson Aug 17, 2025
f924f5a
[Misc] Remove dead return (#23061)
WoosukKwon Aug 17, 2025
72d3950
[Misc] Convert use_structured_output property into constant (#23060)
WoosukKwon Aug 17, 2025
9310d15
[XPU] fix xpu to set cudagraph batch sizes (#23044)
calvin0327 Aug 17, 2025
071fdbf
fix: gptq marlin weight loading failure (#23066)
simon-mo Aug 17, 2025
625926c
[Misc] Minor code cleanup for _get_prompt_logprobs_dict (#23064)
WoosukKwon Aug 18, 2025
cf0a037
[Misc] enhance static type hint (#23059)
andyxning Aug 18, 2025
0a3d765
[Bugfix] fix Qwen2.5-Omni processor output mapping (#23058)
DoubleVII Aug 18, 2025
d117d48
[Bugfix][CI] Machete kernels: deterministic ordering for more cache h…
andylolu2 Aug 18, 2025
d623acb
[Misc] refactor function name (#23029)
andyxning Aug 18, 2025
3f9a589
[Misc] Fix backward compatibility from #23030 (#23070)
ywang96 Aug 18, 2025
f562f66
[XPU] Fix compile size for xpu (#23069)
jikunshang Aug 18, 2025
2c46786
[XPU][CI]add xpu env vars in CI scripts (#22946)
jikunshang Aug 18, 2025
445e353
[Refactor] Define MultiModalKwargsItems separate from MultiModalKwarg…
DarkLight1337 Aug 18, 2025
43de8bc
[Bugfix] fix IntermediateTensors equal method (#23027)
andyxning Aug 18, 2025
bbaa94c
[Refactor] Get prompt updates earlier (#23097)
DarkLight1337 Aug 18, 2025
beecdf8
chore: remove unnecessary patch_padding_side for the chatglm model (#…
carlory Aug 18, 2025
a08fb18
[Bugfix] Support compile for Transformers multimodal (#23095)
zucchini-nlp Aug 18, 2025
0bf52cb
[CI Bugfix] Pin `openai<1.100` to unblock CI (#23118)
mgoin Aug 18, 2025
9b1f185
feat: add support for cutlass fused moe for gpt-oss on sm90
djmmoss Aug 21, 2025
4d2db6f
fix: OpenAI SDK compat (ResponseTextConfig) (#23126)
h-brenoskuk Aug 18, 2025
481be6d
Use Blackwell FlashInfer MXFP4 MoE by default if available (#23008)
mgoin Aug 18, 2025
9ef6864
Install tpu_info==0.4.0 to fix core dump for TPU (#23135)
xiangxu-google Aug 18, 2025
eeeb87d
[Misc] Minor refactoring for prepare_inputs (#23116)
WoosukKwon Aug 18, 2025
a9b22c0
[Spec Decode] Make `propose_draft_token_ids` non-blocking for lower T…
WoosukKwon Aug 19, 2025
5f6b0a1
[Misc] Add @tdoublep as a maintainer of hybrid model and Triton-atten…
tdoublep Aug 19, 2025
c1b8173
[CI][V0 Deprecation] Removed V0 Only Chunked Prefill and Prefix Cachi…
robertgshaw2-redhat Aug 19, 2025
d9939e5
[V0 Deprecation] Remove V0 FlashInfer attention backend (#22776)
WoosukKwon Aug 19, 2025
e9c1adb
chore: disable enable_cpp_symbolic_shape_guards (#23048)
xiszishu Aug 19, 2025
f7ffaa3
[TPU] make ptxla not imported when using tpu_commons (#23081)
yaochengji Aug 19, 2025
9a8c210
[Hardware][IBM Z]Enable v1 for s390x and s390x dockerfile fixes (#22725)
nikheal2 Aug 19, 2025
8d737da
Migrate InternVLImagePixelInputs (in nemotron_vl.py) to TensorSchema …
bbeckca Aug 19, 2025
4c46375
[Log] Warning Once for Cutlass MLA (#23137)
yewentao256 Aug 19, 2025
5fee7b8
[Model] Support Pipeline Parallelism for moonshotai/Kimi-VL-A3B-Think…
ZJY0516 Aug 19, 2025
2030ddb
[misc] split engine_model into json file for nsys profile tool (#23117)
gracehonv Aug 19, 2025
28f315d
[Benchmark] Add flag --served-model-name to benchmark_serving_multi_t…
pliops-daniels Aug 19, 2025
4858a70
Fix GLM-4.5V-FP8 numerical issue (#22949)
zixi-qi Aug 19, 2025
bb34190
[Misc] Add request_id into benchmark_serve.py (#23065)
hustxiayang Aug 19, 2025
1cd3c15
[Bugfix] Fix broken Minimax-01-VL model (#22116)
Isotr0py Aug 19, 2025
647d69b
[bug fix] Fix llama4 spec decoding (#22691)
zixi-qi Aug 19, 2025
321dcd6
[Misc] Avoid accessing req_ids inside a loop (#23159)
WoosukKwon Aug 19, 2025
299b096
[Doc] use power of 2 (#23172)
Tialo Aug 19, 2025
32cb16c
[Misc] Fix seq_lens for graph capture (#23175)
WoosukKwon Aug 19, 2025
56fa841
[NVIDIA] Support Flashinfer TRTLLM FP8-q/kv/out Attention Kernel (#21…
elvischenv Aug 19, 2025
f41be30
[Model] Add multi_label_classification support (#23173)
noooop Aug 19, 2025
4862644
[Model] support new model ovis2.5 (#23084)
myselvess Aug 19, 2025
f189ec0
[Bugfix] Fix benchmark_moe.py (#23177)
jeejeelee Aug 19, 2025
cbaba9d
[FEAT] [Performance] Enable DP for ViT in Qwen2.5VL (#22742)
tjtanaa Aug 19, 2025
f4a7919
[Model] Removes redundant all-reduce operation in Qwen3MoeSparseMoeBl…
yiz-liu Aug 19, 2025
c1a3d12
Add return_token_ids parameter to OpenAI API endpoints (#22587)
ultmaster Aug 19, 2025
3b7e373
Migrate LlavaOnevisionMultiInputs to TensorSchema (#21844)
bbeckca Aug 19, 2025
6697ac7
[CI/Build] Update transformers to v4.55.2 (#23093)
Isotr0py Aug 19, 2025
c8a53e5
[Misc] Fix the benchmark's README and improve the error messages for …
tanruixiang Aug 19, 2025
b6f3b11
[Frontend] Add `/collective_rpc` API endpoint (#23075)
22quinn Aug 19, 2025
2d3c47f
[Misc] Enable yapf for FlashInfer backend (#23193)
WoosukKwon Aug 19, 2025
ef1fa1d
[Bugfix] Fix accuracy issue when using flashinfer cutlass moe, TP=1 a…
bnellnm Aug 19, 2025
40a6d44
fix: use cache_salt for gpt-oss (#23186)
dr75 Aug 19, 2025
09a6735
[Misc] Minor refactoring for FlashInfer backend (#23147)
WoosukKwon Aug 19, 2025
b51df6a
[CI/Build] Add support for Python 3.13 (#13164)
mgoin Aug 19, 2025
0f68e55
[NVIDIA] Add SM100 Flashinfer Cutlass MoE fp8 backend (#22357)
amirkl94 Aug 19, 2025
9a62d10
[CI/Build] Replace lm-eval gsm8k tests with faster implementation (#2…
mgoin Aug 19, 2025
5870cad
[BugFix] fix CUTLASS MLA full cudagraph (#23200)
LucasWilkinson Aug 19, 2025
d63fd65
[Benchmarks] Add video inputs to ShareGPTDataset. (#23199)
huachenheli Aug 19, 2025
ec89a52
[Quantization] Bump Compressed Tensors Version (#23202)
kylesayrs Aug 20, 2025
82061bc
[Core] Optimize scheduler request removal for single completions (#21…
chi2liu Aug 20, 2025
a1cb9fb
[CI Perf] Only test bfloat16 for tests/compile/test_fusion_all_reduce…
mgoin Aug 20, 2025
c153756
[Core] Add torch profiler CPU traces for AsyncLLM. (#21794)
huachenheli Aug 20, 2025
e71f229
[Doc] Update V1 status of various pooling models (#23189)
DarkLight1337 Aug 20, 2025
dd532ae
[Attention] Optimize make_local_attention_virtual_batches for Flash A…
linzebing Aug 20, 2025
b515118
Fix a performance comparison issue in Benchmark Suite (#23047)
louie-tsai Aug 20, 2025
648cdaf
chore: support pytorch format in lora (#22790)
KilJaeeun Aug 20, 2025
e85b346
[CI/Build] Also check DP in benchmarks throughput script (#23038)
zhewenl Aug 20, 2025
c43ca52
[CI/Build] Sync multimodal tests (#23181)
DarkLight1337 Aug 20, 2025
44862d8
[BugFix] Fix stuck stats/metrics after requests are aborted (#22995)
njhill Aug 20, 2025
536b4a2
fix cuda graph (#22721)
fsx950223 Aug 20, 2025
b2fd7cc
[Model] use autoWeightsLoader for gptoss (#22446)
calvin0327 Aug 20, 2025
3cfcd13
Fix missing quotes (#23242)
wzshiming Aug 20, 2025
3e32704
[Model] Support deepseek with eagle (#21086)
xyang16 Aug 20, 2025
bd65d52
[Bugfix] Ensure correctness of Cohere2Vision processing (#23245)
DarkLight1337 Aug 20, 2025
ee0dd04
Update to flashinfer-python==0.2.12 and disable AOT compile for non-r…
mgoin Aug 20, 2025
843e77b
[Model][V1] Support Ernie MTP (#22169)
xyxinyang Aug 20, 2025
b96ca94
[Model] Improve olmo and olmo2 (#23228)
jeejeelee Aug 20, 2025
29f58a0
[Fix] fix offline env use local mode path (#22526)
lengrongfu Aug 20, 2025
7bc51c2
[Bugfix] Ensure correctness of HCXVision processing (#23254)
DarkLight1337 Aug 20, 2025
b15629b
[Kernel] CUTLASS MoE FP8: Integrate cuda moe permute/unpermute (#23045)
shixianc Aug 20, 2025
2de3c7b
[CLI][Doc] Formalize `--mm-encoder-tp-mode` (#23190)
DarkLight1337 Aug 20, 2025
240e099
[Misc] Add max_seq_len to CommonAttentionMetadata (#23216)
WoosukKwon Aug 20, 2025
35b1c74
[FIXBUG ] Allow disabling rocm_aiter_fa backend for ROCm GPUs not com…
JartX Aug 20, 2025
285cd2b
[torch.compile] Support conditional torch.compile per module (#22269)
sarckk Aug 20, 2025
c826d11
Migrate Mistral3ImagePixelInputs to TensorSchema (#21945)
bbeckca Aug 20, 2025
58afbd2
Limit HTTP header count and size (#23267)
russellb Aug 20, 2025
582c727
Small fix for Command-A-Vision (#23268)
dongluw Aug 20, 2025
c0eb3d7
[Kernel/Quant] Remove the original marlin format and qqq (#23204)
mgoin Aug 20, 2025
c68cadb
[Fix] correct tool_id for kimi-k2 when use tool_choice=required (#21259)
MoyanZitto Aug 20, 2025
fa40ad3
[Frontend] improve error logging of chat completion (#22957)
heheda12345 Aug 20, 2025
8396597
[Perf] Speed up function `_convert_tokens_to_string_with_added_encode…
misrasaurabh1 Aug 20, 2025
410423e
Do not use eval() to convert unknown types (#23266)
russellb Aug 20, 2025
e0be5ba
[Feature] use --eplb_config to set eplb param (#20562)
lengrongfu Aug 20, 2025
649fcea
[misc] fix multiple arch wheels for the nightly index (#23110)
youkaichao Aug 20, 2025
f54d68b
Remove chunked_prefill_enabled flag in V1 MLA (#23183)
MatthewBonanni Aug 20, 2025
a473c5b
Feature/mla tests (#23195)
MatthewBonanni Aug 20, 2025
b1602a8
[Fix] remove is_marlin param in benchmark_moe (#23286)
shixianc Aug 20, 2025
8c26b47
[EP] Add logging for experts map (#22685)
22quinn Aug 20, 2025
97b7516
Remove duplicate entry in vllm.attention.__all__ (#23296)
russellb Aug 21, 2025
c300639
[CI Bugfix] Fix CI by fully removing --enable-prompt-adapter (#23284)
mgoin Aug 21, 2025
24c8bb6
[Optimization] Make new_block_ids None if empty (#23262)
WoosukKwon Aug 21, 2025
a0c60ea
[CPU] Refactor CPU W8A8 scaled_mm (#23071)
bigPYJ1151 Aug 21, 2025
0382521
[CI/Build] Split out mm processor tests (#23260)
DarkLight1337 Aug 21, 2025
93c5489
[V1][Mamba1] - Full CUDA and Piecewise CUDA Graphs Support (#23035)
Josephasafg Aug 21, 2025
83fb982
[Compile] Fix Compile Warning SM100 Cutlass MLA (#23287)
yewentao256 Aug 21, 2025
048330f
[Model][VLM] Support R-4B Model (#23246)
yannqi Aug 21, 2025
6ae6cf1
[CI] Delete images older than 24h. (#23291)
QiliangCui Aug 21, 2025
453d898
[CI] Block the cu126 wheel build while broken (#23285)
mgoin Aug 21, 2025
5833876
[Sampler] Support returning final logprobs (#22387)
22quinn Aug 21, 2025
352d13e
[Bugfix] Fix extra whitespace in strings caused by newline (#23272)
DarkLight1337 Aug 21, 2025
857da6c
[BugFix] Fix Python 3.9 Support (#23306)
jaredoconnell Aug 21, 2025
56dd418
[Model] Add LFM2 architecture (#22845)
paulpak58 Aug 21, 2025
b4d2a4a
[Refactor] Simplify code for MM budget (#23310)
DarkLight1337 Aug 21, 2025
66a8d24
[Doc] Fix batch-level DP example (#23325)
DarkLight1337 Aug 21, 2025
1dc73ba
[Performance] V1 Pooling Models E2E Performance Optimization (#23162)
noooop Aug 21, 2025
ac51913
[V1] Remove unnecessary check for main thread (#23298)
robertgshaw2-redhat Aug 21, 2025
c5e2aee
[Bugfix] set system_message in phi4mini chat template (#23309)
zhuangqh Aug 21, 2025
15195dc
[Multimodal] Always enable hashing mm data (#23308)
ywang96 Aug 21, 2025
3a0ee9f
[ci/build] Fix abi tag for aarch64 (#23329)
youkaichao Aug 21, 2025
c16f981
Migrate MolmoImageInputs to TensorSchema (#22022)
bbeckca Aug 21, 2025
1542ce3
Fix nvfp4 swizzling (#23140)
yiliu30 Aug 21, 2025
a86eaa5
add tg-mxfp4-moe-test (#22540)
IwakuraRein Aug 21, 2025
d7d87dc
[Bug] Fix R1 Accuracy 0 Bug (#23294)
yewentao256 Aug 21, 2025
3d3c649
[Bugfix] Fix port conflict by obtaining a list of open ports upfront …
minosfuture Aug 21, 2025
bd792f6
[Misc] Misc code cleanup/simplification (#23304)
njhill Aug 21, 2025
f329657
[BugFix][gpt-oss] Fix Chat Completion with Multiple Output Message (#…
heheda12345 Aug 21, 2025
c98c1db
typo
djmmoss Aug 21, 2025
98fa266
updates
djmmoss Aug 21, 2025
145 changes: 141 additions & 4 deletions vllm/model_executor/layers/quantization/mxfp4.py
@@ -39,11 +39,11 @@
return envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16

# Enable by default on SM100 and SM90 if MXFP8 is not explicitly enabled
if (current_platform.is_device_capability(100) and has_flashinfer()
if ((current_platform.is_device_capability(100)
or current_platform.is_device_capability(90)) and has_flashinfer()
and not envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8")):
logger.info_once(
"Enabling FlashInfer MXFP4 BF16 backend by default for Blackwell. "
"Enabling FlashInfer MXFP4 BF16 backend by default for Blackwell and Hopper. "
"For faster performance, consider setting "

"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, "
"though this may impact accuracy.")
return True
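Note on the capability check above: Python's `and` binds tighter than `or`, so the added SM90 clause needs explicit parentheses to keep the `has_flashinfer()` and env-var guards applied on both architectures. A minimal sketch of the pitfall, with plain booleans standing in for the platform and flashinfer checks:

sm100, sm90, has_fi = True, False, False   # Blackwell GPU, flashinfer not installed
# Ungrouped: parses as sm100 or (sm90 and has_fi) -> True, backend enabled anyway
assert (sm100 or sm90 and has_fi) is True
# Grouped: (sm100 or sm90) and has_fi -> False, correctly falls through
assert ((sm100 or sm90) and has_fi) is False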
@@ -113,6 +113,7 @@
self.topk_indices_dtype = None
self.moe = moe
self.use_marlin = self._should_use_marlin()
self.flashinfer_autotune = True

if current_platform.is_device_capability(100) and not has_flashinfer():
logger.warning_once(
@@ -171,14 +172,17 @@
layer.hidden_size = hidden_size
layer.intermediate_size_per_partition = \
intermediate_size_per_partition_after_pad
elif should_use_flashinfer_mxfp4():
elif should_use_flashinfer_mxfp4() and current_platform.is_device_capability(100):
# pad the intermediate size to be a multiple of 2 * mxfp4_block
# to hold non-uniformly sharded tensors as well as for swizzling;
# the extra padding also increases performance
intermediate_size_per_partition_after_pad = round_up(
intermediate_size_per_partition, 256)
hidden_size = round_up(hidden_size, 256)
elif _should_use_flashinfer_mxfp4_bf16() and current_platform.is_device_capability(90):
intermediate_size_per_partition_after_pad = round_up(
intermediate_size_per_partition, 128)
elif current_platform.is_rocm():

intermediate_size_per_partition_after_pad = round_up(
intermediate_size_per_partition, 128)
else:
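For reference, the padding above rounds the per-partition intermediate size up to the next multiple of 256 on the SM100 trtllm path and 128 on the SM90 and ROCm paths. A minimal sketch, assuming the usual semantics of vLLM's `round_up` helper and an illustrative intermediate size:

def round_up(x: int, multiple: int) -> int:
    # round x up to the nearest multiple (mirrors vllm.utils.round_up)
    return ((x + multiple - 1) // multiple) * multiple

assert round_up(2880, 256) == 3072   # SM100 trtllm path
assert round_up(2880, 128) == 2944   # SM90 cutlass path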
@@ -384,6 +388,96 @@
layer.w2_bias = Parameter(torch.stack(gemm2_bias_shuffled).reshape(
self.num_experts, -1),
requires_grad=False)
elif (_should_use_flashinfer_mxfp4_bf16()
and current_platform.is_device_capability(90)):
assert layer.w13_weight.dtype == torch.uint8, (
f"layer.w13_weight.dtype: {layer.w13_weight.dtype}, "
f"expected: {torch.uint8}")
assert layer.w2_weight.dtype == torch.uint8, (
f"layer.w2_weight.dtype: {layer.w2_weight.dtype}, "
f"expected: {torch.uint8}")
assert layer.w13_weight_scale.dtype == torch.uint8, (
f"layer.w13_weight_scale.dtype: {layer.w13_weight_scale.dtype}, "
f"expected: {torch.uint8}")
assert layer.w2_weight_scale.dtype == torch.uint8, (
f"layer.w2_weight_scale.dtype: {layer.w2_weight_scale.dtype}, "
f"expected: {torch.uint8}")
assert layer.w13_bias.dtype == torch.bfloat16, (
f"layer.w13_bias.dtype: {layer.w13_bias.dtype}, "
f"expected: {torch.bfloat16}")
assert layer.w2_bias.dtype == torch.bfloat16, (
f"layer.w2_bias.dtype: {layer.w2_bias.dtype}, "
f"expected: {torch.bfloat16}")
layer.gemm1_alpha = Parameter(torch.tensor(
[1.702] * self.num_experts, dtype=torch.float32).cuda(),
requires_grad=False)
layer.gemm1_beta = Parameter(torch.tensor(
[1.0] * self.num_experts, dtype=torch.float32).cuda(),
requires_grad=False)
layer.gemm1_clamp_limit = Parameter(torch.tensor(
[7.0] * self.num_experts, dtype=torch.float32).cuda(),
requires_grad=False)
sf_block_size = 32 # mxfp4 block size

assert (layer.w13_weight.dim() == 3
and layer.w13_weight.shape[0] == self.num_experts
and layer.w13_weight.shape[1] == self.intermediate_size * 2
and layer.w13_weight.shape[2] == self.hidden_size // 2)
assert (layer.w13_weight_scale.dim() == 3
and layer.w13_weight_scale.shape[0] == self.num_experts
and layer.w13_weight_scale.shape[1]
== self.intermediate_size * 2
and layer.w13_weight_scale.shape[2]
== self.hidden_size // sf_block_size)
assert (layer.w2_weight.dim() == 3
and layer.w2_weight.shape[0] == self.num_experts
and layer.w2_weight.shape[1] == self.hidden_size and
layer.w2_weight.shape[2] == self.intermediate_size // 2)
assert (layer.w2_weight_scale.dim() == 3
and layer.w2_weight_scale.shape[1] == self.hidden_size
and layer.w2_weight_scale.shape[2]
== self.intermediate_size // sf_block_size)
assert (layer.w13_bias.dim() == 2
and layer.w13_bias.shape[0] == self.num_experts
and layer.w13_bias.shape[1] == self.intermediate_size * 2)
assert (layer.w2_bias.dim() == 2
and layer.w2_bias.shape[0] == self.num_experts
and layer.w2_bias.shape[1] == self.hidden_size)



Check failure on line 436 in vllm/model_executor/layers/quantization/mxfp4.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/model_executor/layers/quantization/mxfp4.py:436:81: E501 Line too long (83 > 80)
# De-interleave weights, scales, and biases for gate and up projections
w13_weight_data = layer.w13_weight.data
gate_w, up_w = w13_weight_data[:, ::2, :], w13_weight_data[:, 1::2, :]
deinterleaved_w13_weight = torch.cat([gate_w, up_w], dim=1)
w1_weight, w3_weight = torch.chunk(deinterleaved_w13_weight, 2, dim=1)
layer.w13_weight = torch.nn.Parameter(torch.cat([w3_weight, w1_weight], dim=1).cuda(), requires_grad=False)

w13_bias_data = layer.w13_bias.data.to(torch.float32)
gate_b, up_b = w13_bias_data[:, ::2], w13_bias_data[:, 1::2]
deinterleaved_w13_bias = torch.cat([gate_b, up_b], dim=1)
b1, b3 = torch.chunk(deinterleaved_w13_bias, 2, dim=-1)
b = torch.cat([b3, b1], dim=-1)
layer.w13_bias = torch.nn.Parameter(b.to(torch.bfloat16).cuda(), requires_grad=False)

# Scale
w13_scale_data = layer.w13_weight_scale.data
gate_s, up_s = w13_scale_data[:, ::2, :], w13_scale_data[:, 1::2, :]
deinterleaved_w13_scale = torch.cat([gate_s, up_s], dim=1)
w1_weight_scale, w3_weight_scale = torch.chunk(deinterleaved_w13_scale, 2, dim=1)
all_w31_scales = torch.cat([w3_weight_scale, w1_weight_scale], dim=1)

w31_scales = all_w31_scales.to(torch.uint8).view(torch.uint8)
w31_s_shape = w31_scales.shape
w31_scales_interleaved = w31_scales.reshape(
w31_s_shape[0], w31_s_shape[1],
(w31_s_shape[2] // 4), 4)
w31_scales_interleaved = w31_scales_interleaved.permute(0, 2, 1, 3)
w31_scales_interleaved = w31_scales_interleaved.reshape(
w31_s_shape[0], w31_s_shape[2] // 4, w31_s_shape[1] * 4)

layer.w13_weight_scale = torch.nn.Parameter(w31_scales_interleaved.cuda(), requires_grad=False)

w2_weight_scale = layer.w2_weight_scale.data
w2_scales = w2_weight_scale.to(torch.uint8).view(torch.uint8)
w2_s_shape = w2_scales.shape
w2_scales_interleaved = w2_scales.reshape(
w2_s_shape[0], w2_s_shape[1],
(w2_s_shape[2] // 4), 4)
w2_scales_interleaved = w2_scales_interleaved.permute(0, 2, 1, 3)
w2_scales_interleaved = w2_scales_interleaved.reshape(
w2_s_shape[0], w2_s_shape[2] // 4, w2_s_shape[1] * 4)

layer.w2_weight_scale = torch.nn.Parameter(w2_scales_interleaved.cuda(), requires_grad=False)

else:
from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig

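The 4-wide scale interleaving in this hunk only reshapes and permutes the last two axes of the block-scale tensors ([E, N, K/32] -> [E, K/128, N*4]). A standalone sketch with dummy shapes, not the real expert tensors:

import torch

E, N, K = 2, 8, 128                # experts, rows, hidden size (illustrative)
S = K // 32                        # one uint8 scale per 32-element mxfp4 block
scales = torch.arange(E * N * S, dtype=torch.uint8).reshape(E, N, S)
out = (scales.reshape(E, N, S // 4, 4)
             .permute(0, 2, 1, 3)
             .reshape(E, S // 4, N * 4))
print(out.shape)                   # torch.Size([2, 1, 32])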
@@ -510,7 +604,7 @@
logical_replica_count), (
"MXFP4 are not supported with this configuration.")

if should_use_flashinfer_mxfp4():
if should_use_flashinfer_mxfp4() and current_platform.is_device_capability(100):
from flashinfer import mxfp8_quantize, trtllm_fp4_block_scale_moe
assert not self.moe.use_ep, (
"EP is not supported for flashinfer mxfp4 moe backend yet.")
@@ -551,6 +645,49 @@
True, # do finalize
)[0]
return trtllm_gen_output
elif (_should_use_flashinfer_mxfp4_bf16()
and current_platform.is_device_capability(90)):
# NOTE: assumed import locations for flashinfer's cutlass MoE path; the
# original revision left `autotune` undefined (Ruff F821).
from flashinfer.autotuner import autotune
from flashinfer.fused_moe import cutlass_fused_moe

assert x.dtype == torch.bfloat16

quant_scales = [
layer.w13_weight_scale,
layer.w2_weight_scale,
]

topk_weights, topk_ids = FusedMoE.select_experts(
hidden_states=x,
router_logits=router_logits,
use_grouped_topk=use_grouped_topk,
top_k=top_k,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias,
)

output = torch.zeros_like(x)

with torch.inference_mode(), autotune(self.flashinfer_autotune):
_ = cutlass_fused_moe(
input=x,
token_selected_experts=topk_ids,
token_final_scales=topk_weights,
fc1_expert_weights=layer.w13_weight,
fc2_expert_weights=layer.w2_weight,
output_dtype=torch.bfloat16,
quant_scales=quant_scales,
fc1_expert_biases=layer.w13_bias,
fc2_expert_biases=layer.w2_bias,
swiglu_alpha=layer.gemm1_alpha,

swiglu_beta=layer.gemm1_beta,
swiglu_limit=layer.gemm1_clamp_limit,
use_w4_group_scaling=True,
output=output,
)
self.flashinfer_autotune = False
return output
else:
return triton_kernel_moe_forward(
hidden_states=x,
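A minimal end-to-end sketch of exercising the new SM90 path (assumes a Hopper GPU with flashinfer installed and an MXFP4 gpt-oss checkpoint; the model name is illustrative, and the env var is the one referenced at the top of this diff):

import os
os.environ["VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"] = "1"   # opt into the bf16 x mxfp4 backend

from vllm import LLM, SamplingParams

llm = LLM(model="openai/gpt-oss-20b")                     # MXFP4 MoE checkpoint (illustrative)
outputs = llm.generate(["Hello from Hopper"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)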