Commit b513f86

fix: separate out fp4 lib into sm90 and sm100 versions, add oob checking in fused moe (#1565)
## 📌 Description

This fixes an OOB issue in the fused MoE and creates separate sm90 and sm100 paths for fp4 quantization.

## 🔍 Related Issues

Fix required for vllm-project/vllm#23369.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---------

Signed-off-by: Duncan Moss <[email protected]>
1 parent 4b30a91 commit b513f86

File tree: 4 files changed, +58 −17 lines

- csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
- flashinfer/aot.py
- flashinfer/fp4_quantization.py
- tests/test_fp4_quantize.py

csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh

Lines changed: 14 additions & 3 deletions

```diff
@@ -1755,7 +1755,6 @@ __global__ void finalizeMoeRoutingKernel(
     ScaleBiasType const* bias, float const* scales, int const* unpermuted_row_to_permuted_row,
     int const* token_selected_experts, int64_t const orig_cols, int64_t const experts_per_token,
     int const num_experts_per_node, int const start_expert_id) {
-  assert(orig_cols % 4 == 0);
   int64_t const original_row = blockIdx.x;
   int64_t const num_rows = gridDim.x;
   auto const offset = original_row * orig_cols;
@@ -1765,6 +1764,8 @@
   constexpr int64_t FINALIZE_ELEM_PER_THREAD =
       128 / std::min(sizeof_bits<OutputType>::value, sizeof_bits<GemmOutputType>::value);
 
+  assert(orig_cols % FINALIZE_ELEM_PER_THREAD == 0);
+
   int64_t const start_offset = threadIdx.x;
   int64_t const stride = FINALIZE_THREADS_PER_BLOCK;
   int64_t const num_elems_in_col = orig_cols / FINALIZE_ELEM_PER_THREAD;
@@ -1795,6 +1796,11 @@
     int64_t const expanded_original_row = original_row + k_idx * num_rows;
     int64_t const expanded_permuted_row = unpermuted_row_to_permuted_row[expanded_original_row];
 
+    int64_t expanded_rows = num_rows * experts_per_token;
+    if (expanded_permuted_row < 0 || expanded_permuted_row >= expanded_rows) {
+      continue;
+    }
+
     float const row_scale = (SCALE_MODE == ScaleMode::NO_SCALE) ? 1.f : scales[k_offset];
 
     auto const* expanded_permuted_rows_row_ptr =
@@ -1828,8 +1834,6 @@ __global__ void finalizeMoeRoutingNoFillingKernel(
     int const* permuted_row_to_unpermuted_row, int const* token_selected_experts,
     int64_t const* expert_first_token_offset, int64_t const num_rows, int64_t const orig_cols,
     int64_t const experts_per_token, int const num_experts_per_node, int const start_expert_id) {
-  assert(orig_cols % 4 == 0);
-
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
   asm volatile("griddepcontrol.wait;");
 #endif
@@ -1864,6 +1868,8 @@
   constexpr int64_t FINALIZE_ELEM_PER_THREAD =
       128 / std::min(sizeof_bits<OutputType>::value, sizeof_bits<GemmOutputType>::value);
 
+  assert(orig_cols % FINALIZE_ELEM_PER_THREAD == 0);
+
   int64_t const start_offset = threadIdx.x;
   int64_t const stride = FINALIZE_THREADS_PER_BLOCK;
   int64_t const num_elems_in_col = orig_cols / FINALIZE_ELEM_PER_THREAD;
@@ -1889,6 +1895,11 @@
 
     int64_t const expanded_permuted_row_from_k_idx =
         unpermuted_row_to_permuted_row[source_row + k_idx * num_rows];
+    int64_t valid_tokens = expert_first_token_offset[num_experts_per_node];
+    if (expanded_permuted_row_from_k_idx < 0 ||
+        expanded_permuted_row_from_k_idx >= valid_tokens) {
+      continue;
+    }
 
     float const row_scale = (SCALE_MODE == ScaleMode::NO_SCALE) ? 1.f : scales[k_offset];
 
```
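These hunks replace the stale `orig_cols % 4` assertion with one that matches the actual vector width (`FINALIZE_ELEM_PER_THREAD`), and range-check each entry of the row map before it is used to index the expanded GEMM output: `finalizeMoeRoutingKernel` bounds against `num_rows * experts_per_token`, while the no-filling variant bounds against `expert_first_token_offset[num_experts_per_node]`, the count of rows actually written. Below is a minimal CPU sketch of the guarded finalize step, under simplified shapes and with a hypothetical `finalize_moe_routing_reference` name; it illustrates the guard, it is not the kernel itself:

```python
import numpy as np

def finalize_moe_routing_reference(
    expanded_out: np.ndarray,   # [num_rows * experts_per_token, cols], permuted expert outputs
    scales: np.ndarray,         # [num_rows, experts_per_token], per-(token, expert) weights
    unpermuted_row_to_permuted_row: np.ndarray,  # [num_rows * experts_per_token] row map
    experts_per_token: int,
) -> np.ndarray:
    num_rows = scales.shape[0]
    expanded_rows = num_rows * experts_per_token
    out = np.zeros((num_rows, expanded_out.shape[1]), dtype=np.float32)
    for row in range(num_rows):
        for k in range(experts_per_token):
            permuted_row = int(unpermuted_row_to_permuted_row[row + k * num_rows])
            # OOB guard mirroring the kernel change: skip map entries that
            # point outside the expanded output (e.g. dropped or unwritten
            # rows) instead of indexing past the end of the buffer.
            if permuted_row < 0 or permuted_row >= expanded_rows:
                continue
            out[row] += float(scales[row, k]) * expanded_out[permuted_row].astype(np.float32)
    return out
```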

flashinfer/aot.py

Lines changed: 6 additions & 2 deletions

```diff
@@ -11,7 +11,10 @@
 from .activation import act_func_def_str, gen_act_and_mul_module
 from .cascade import gen_cascade_module
-from .fp4_quantization import gen_fp4_quantization_module
+from .fp4_quantization import (
+    gen_fp4_quantization_sm100_module,
+    gen_fp4_quantization_sm90_module,
+)
 from .fused_moe import (
     gen_cutlass_fused_moe_sm100_module,
     gen_cutlass_fused_moe_sm90_module,
@@ -332,11 +335,12 @@ def gen_all_modules(
 
     if add_moe:
         jit_specs.append(gen_gemm_module())
-        jit_specs.append(gen_fp4_quantization_module())
         if has_sm90:
             jit_specs.append(gen_gemm_sm90_module())
+            jit_specs.append(gen_fp4_quantization_sm90_module())
             jit_specs.append(gen_cutlass_fused_moe_sm90_module())
         if has_sm100:
+            jit_specs.append(gen_fp4_quantization_sm100_module())
            jit_specs.append(gen_cutlass_fused_moe_sm100_module())
            jit_specs.append(gen_gemm_sm100_module())
 
```
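With this change the fp4 quantization module is no longer built unconditionally: each arch-specific `JitSpec` is generated only under the matching `has_sm90` / `has_sm100` flag. The sketch below shows one way such flags could be derived from the visible GPUs; `detect_sm_targets` is a hypothetical helper, and `aot.py` may compute the flags differently (e.g. from a user-supplied arch list):

```python
import torch

def detect_sm_targets() -> set:
    """Hypothetical helper: collect compute capabilities of all visible GPUs."""
    caps = set()
    for i in range(torch.cuda.device_count()):
        caps.add(torch.cuda.get_device_capability(i))
    return caps

caps = detect_sm_targets()
has_sm90 = any(major == 9 for major, _ in caps)    # Hopper-class devices
has_sm100 = any(major == 10 for major, _ in caps)  # Blackwell-class devices
```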

flashinfer/fp4_quantization.py

Lines changed: 34 additions & 11 deletions

```diff
@@ -17,13 +17,13 @@
 import functools
 from enum import Enum
 from types import SimpleNamespace
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 import torch
 
 from .jit import JitSpec
 from .jit import env as jit_env
-from .jit import gen_jit_spec, sm100a_nvcc_flags
+from .jit import gen_jit_spec, sm100a_nvcc_flags, sm90a_nvcc_flags
 from .utils import (
     device_support_pdl,
     get_shuffle_matrix_a_row_indices,
@@ -62,9 +62,17 @@ def _pad_scale_factors(
     ).contiguous()
 
 
-def gen_fp4_quantization_module() -> JitSpec:
+def gen_fp4_quantization_sm100_module() -> JitSpec:
+    return gen_fp4_quantization_module(sm100a_nvcc_flags, "100")
+
+
+def gen_fp4_quantization_sm90_module() -> JitSpec:
+    return gen_fp4_quantization_module(sm90a_nvcc_flags, "90")
+
+
+def gen_fp4_quantization_module(nvcc_flags: List[str], device_arch: str) -> JitSpec:
     return gen_jit_spec(
-        "fp4_quantization",
+        f"fp4_quantization_{device_arch}",
         [
             jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/thop/fp4Quantize.cpp",
@@ -75,7 +83,7 @@ def gen_fp4_quantization_module() -> JitSpec:
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/stringUtils.cpp",
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/tllmException.cpp",
         ],
-        extra_cuda_cflags=sm100a_nvcc_flags
+        extra_cuda_cflags=nvcc_flags
         + [
             "-DENABLE_BF16",
             "-DENABLE_FP8",
@@ -94,8 +102,13 @@
 
 
 @functools.cache
-def get_fp4_quantization_module():
-    module = gen_fp4_quantization_module().build_and_load()
+def get_fp4_quantization_module(backend: str = "100"):
+    if backend == "100":
+        module = gen_fp4_quantization_sm100_module().build_and_load()
+    elif backend == "90":
+        module = gen_fp4_quantization_sm90_module().build_and_load()
+    else:
+        raise ValueError(f"Invalid backend: {backend}")
 
     @register_custom_op(
         "flashinfer::fp4_quantize_sm100",
@@ -310,7 +323,7 @@ def fp4_quantize(
     assert input.shape[-1] % sf_vec_size == 0
     if enable_pdl is None:
         enable_pdl = device_support_pdl(input.device)
-    x_q, sf = get_fp4_quantization_module().fp4_quantize_sm100(
+    x_q, sf = get_fp4_quantization_module("100").fp4_quantize_sm100(
         input,
         global_scale,
         sf_vec_size,
@@ -346,7 +359,11 @@ def block_scale_interleave(unswizzled_sf: torch.Tensor) -> torch.Tensor:
     assert unswizzled_sf.dtype == torch.uint8, (
         f"Input dtype must be uint8, got {unswizzled_sf.dtype}"
     )
-    return get_fp4_quantization_module().block_scale_interleave_sm100(
+
+    major, minor = torch.cuda.get_device_capability()
+    device_arch = f"{major * 10 + minor}"
+
+    return get_fp4_quantization_module(device_arch).block_scale_interleave_sm100(
         unswizzled_sf,
     )
 
@@ -380,7 +397,11 @@ def e2m1_and_ufp8sf_scale_to_float(
         torch.Tensor: Dequantized float tensor of shape [M, K] with dtype float32.
 
     """
-    return get_fp4_quantization_module().e2m1_and_ufp8sf_scale_to_float_sm100(
+    major, minor = torch.cuda.get_device_capability()
+    device_arch = f"{major * 10 + minor}"
+    return get_fp4_quantization_module(
+        device_arch
+    ).e2m1_and_ufp8sf_scale_to_float_sm100(
         e2m1_tensor,
         ufp8_scale_tensor,
         global_scale_tensor,
@@ -547,7 +568,9 @@ def mxfp4_dequantize_host(
     Returns:
         torch.Tensor: Dequantized tensor of shape [M, K] with dtype float.
     """
-    return get_fp4_quantization_module().mxfp4_dequantize_host(
+    major, minor = torch.cuda.get_device_capability()
+    device_arch = f"{major * 10 + minor}"
+    return get_fp4_quantization_module(device_arch).mxfp4_dequantize_host(
         weight,
         scale,
         group_size,
```
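The host-side wrappers now derive the backend from the current device's compute capability, so the same Python entry points transparently build and load either the sm90 or the sm100 module. A minimal sketch of that dispatch, using a hypothetical `_device_backend` helper:

```python
import torch

def _device_backend() -> str:
    """Derive the backend string the same way the wrappers above do."""
    major, minor = torch.cuda.get_device_capability()
    return f"{major * 10 + minor}"  # "90" on SM 9.0, "100" on SM 10.0

# get_fp4_quantization_module("90")  -> builds/loads fp4_quantization_90 (sm90a flags)
# get_fp4_quantization_module("100") -> builds/loads fp4_quantization_100 (sm100a flags)
backend = _device_backend()
```

Note that only `"90"` and `"100"` are accepted: any other capability string (for example `"89"` on an SM 8.9 device) falls through to the `ValueError` branch of `get_fp4_quantization_module`.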

tests/test_fp4_quantize.py

Lines changed: 4 additions & 1 deletion

```diff
@@ -295,7 +295,10 @@ def test_e2m1_dequantization(
     )
 
 
-def test_mxfp4_quantize_roundtrip():
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_mxfp4_quantize_roundtrip(device: str):
+    if not is_sm100a_supported(torch.device(device)):
+        pytest.skip("Nvfp4 Requires compute capability of 10 or above")
     x = torch.randn((128, 64), device="cuda", dtype=torch.bfloat16) / 10
 
     quant_a, sfs = mxfp4_quantize(x)
```
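The test now parametrizes over the available devices and skips on hardware below SM 10.0, since this path is only compiled for sm100a. A minimal sketch of the same gating pattern, with a hypothetical `_supports_sm100` check standing in for the test suite's `is_sm100a_supported` helper:

```python
import pytest
import torch

DEVICES = [f"cuda:{i}" for i in range(torch.cuda.device_count())]

def _supports_sm100(device: str) -> bool:
    # Hypothetical stand-in: gate on compute capability major version.
    major, _ = torch.cuda.get_device_capability(torch.device(device))
    return major >= 10

@pytest.mark.parametrize("device", DEVICES)
def test_sm100_only_path(device: str):
    if not _supports_sm100(device):
        pytest.skip("requires compute capability 10.0 or above")
    x = torch.randn((128, 64), device=device, dtype=torch.bfloat16)
    assert x.shape == (128, 64)  # placeholder for the real kernel check
```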
