
Commit 5ad2323

feat: add support of fp4_batched_quantize (#1633)
1 parent 17978a3 commit 5ad2323

5 files changed, +218 -1 lines changed

csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp

Lines changed: 77 additions & 1 deletion
@@ -142,6 +142,82 @@ std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,

   return {valueE2M1, scaleFP8SF};
 }
+
+// self: [B, M, K], fp16/bf16/fp8_quantized
+// globalScale: [1] float, = (448 * 6) / self.abs().max()
+// nvfp4: sfVecSize = 16, sfUseUE8M0 = false
+// mxfp4: sfVecSize = 32 (not supported yet), sfUseUE8M0 = true
+// alignment: sfVecSize
+// returns self_fp4, self_block_scale_factors
+// self_fp4: [B, M, K / 2], FLOAT4_E2M1X2
+// self_block_scale_factors:
+//   [B, ceil(M / 128) * 128 * ceil(K / sfVecSize / 4) * 4], SF_DTYPE (UE4M3 or UE8M0)
+std::tuple<at::Tensor, at::Tensor> fp4_batched_quantize(at::Tensor const& self,
+                                                        at::Tensor const& globalScale,
+                                                        int64_t sfVecSize, bool sfUseUE8M0) {
+  CHECK_TH_CUDA(self);
+  CHECK_CONTIGUOUS(self);
+  CHECK_INPUT_TYPE(globalScale, c10::ScalarType::Float);
+  TORCH_CHECK(sfVecSize == 16, "sfVecSize can only be 16");
+
+  auto const& inputShape = self.sizes();
+  auto const& rank = inputShape.size();
+
+  TORCH_CHECK(rank == 3, "Input should be 3D tensor.");
+
+  int64_t b = inputShape[0];
+  int64_t m = inputShape[1];
+  int64_t k = inputShape[2];
+
+  TORCH_CHECK(k % sfVecSize == 0);
+
+  std::vector<int64_t> outputShape(inputShape.begin(), inputShape.end());
+  outputShape[rank - 1] = k / 2;
+
+  at::Tensor valueE2M1 =
+      at::detail::empty_cuda(outputShape, FLOAT4_E2M1X2, self.device(), /* stride */ std::nullopt);
+  at::Tensor scaleFP8SF =
+      at::detail::empty_cuda({b, tensorrt_llm::computeSwizzledLayoutSFSize(m, k / sfVecSize)},
+                             SF_DTYPE, self.device(), /* stride */ std::nullopt);  // 2D tensor
+
+  const thread_local int mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
+  auto layout = tensorrt_llm::QuantizationSFLayout::SWIZZLED_128x4;
+
+#define LAUNCH_FP4_QUANTIZE_KERNEL(T, SF_VEC_SIZE)                                                 \
+  tensorrt_llm::kernels::invokeFP4Quantization<T, SF_VEC_SIZE>(                                    \
+      b, m, k, reinterpret_cast<T*>(self.data_ptr()), globalScale.data_ptr<float>(),               \
+      reinterpret_cast<int64_t*>(valueE2M1.data_ptr()),                                            \
+      reinterpret_cast<int32_t*>(scaleFP8SF.data_ptr()), sfUseUE8M0, layout, mMultiProcessorCount, \
+      at::cuda::getCurrentCUDAStream(self.get_device()));
+
+  if (self.scalar_type() == at::ScalarType::Half) {
+    LAUNCH_FP4_QUANTIZE_KERNEL(half, 16)
+  } else if (self.scalar_type() == at::ScalarType::BFloat16) {
+#ifdef ENABLE_BF16
+    LAUNCH_FP4_QUANTIZE_KERNEL(__nv_bfloat16, 16)
+#else
+    C10_THROW_ERROR(NotImplementedError,
+                    "BFloat16 must be enabled to quantize an bf16 tensor to fp4.");
+#endif
+  } else if (self.scalar_type() == at::ScalarType::Float8_e4m3fn) {
+#ifdef ENABLE_FP8
+    LAUNCH_FP4_QUANTIZE_KERNEL(__nv_fp8_e4m3, 16)
+#else
+    C10_THROW_ERROR(NotImplementedError, "FP8 must be enabled to quantize an fp8 tensor to fp4.");
+#endif
+  } else {
+    C10_THROW_ERROR(NotImplementedError,
+                    "fp4_quantize only supports input tensor with dtypes fp16/bf16/e4m3.");
+  }
+
+#undef LAUNCH_FP4_QUANTIZE_KERNEL
+
+  return {valueE2M1, scaleFP8SF};
+}
+
 }  // namespace torch_ext

-TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) { m.def("fp4_quantize", &torch_ext::fp4_quantize); }
+TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
+  m.def("fp4_quantize", &torch_ext::fp4_quantize);
+  m.def("fp4_batched_quantize", &torch_ext::fp4_batched_quantize);
+}
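The block-scale buffer allocated above follows the swizzled 128x4 layout described in the comment: per batch entry, M is padded up to a multiple of 128 and K / sfVecSize is padded up to a multiple of 4. A minimal Python sketch of that sizing follows; the helper name is illustrative only and simply assumes tensorrt_llm::computeSwizzledLayoutSFSize applies exactly the padding stated in the comment.

import math

def swizzled_sf_size(m: int, k: int, sf_vec_size: int = 16) -> int:
    # Mirrors the [ceil(M / 128) * 128 * ceil(K / sfVecSize / 4) * 4] comment above.
    padded_m = math.ceil(m / 128) * 128
    padded_groups = math.ceil(k / sf_vec_size / 4) * 4
    return padded_m * padded_groups

# Example: M = 120, K = 64, sf_vec_size = 16 -> 128 * 4 = 512 scale-factor bytes per batch entry.
print(swizzled_sf_size(120, 64))  # 512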

csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h

Lines changed: 4 additions & 0 deletions
@@ -28,4 +28,8 @@ std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
                                                 int64_t sfVecSize, bool sfUseUE8M0,
                                                 bool isSfSwizzledLayout, bool isSf8x4Layout,
                                                 bool enable_pdl);
+
+std::tuple<at::Tensor, at::Tensor> fp4_batched_quantize(at::Tensor const& self,
+                                                        at::Tensor const& globalScale,
+                                                        int64_t sfVecSize, bool sfUseUE8M0);
 }  // namespace torch_ext

flashinfer/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@
     mxfp4_dequantize,
     mxfp4_quantize,
     nvfp4_quantize,
+    nvfp4_batched_quantize,
     shuffle_matrix_a,
     shuffle_matrix_sf_a,
 )

flashinfer/fp4_quantization.py

Lines changed: 93 additions & 0 deletions
@@ -251,6 +251,71 @@ def _fake_block_scale_interleave_sm100(
             [unswizzled_sf.shape[0] * unswizzled_sf.shape[1] // 16], dtype=torch.uint8
         )

+    @register_custom_op(
+        "flashinfer::fp4_batched_quantize_sm100",
+        mutates_args=("",),
+    )
+    def fp4_batched_quantize_sm100(
+        input: torch.Tensor,
+        global_scale: Optional[torch.Tensor] = None,
+        sf_vec_size: int = 16,
+        sf_use_ue8m0: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Quantize a batched tensor to FP4 (E2M1x2) with per-block scale factors.
+
+        This function converts a float/bfloat16 (or FP8-quantized) input tensor into a
+        packed FP4 tensor using the E2M1 format (two 4-bit values per byte), along with
+        per-block scale factors. Scale factors are encoded as UE4M3 by default, or UE8M0
+        when requested, and an optional global scale can be applied.
+
+        Args:
+            input (torch.Tensor): Input tensor of shape [B, M, K] with dtype torch.float16,
+                torch.bfloat16, or an FP8-quantized dtype supported by the kernel.
+            global_scale (torch.Tensor, optional): Global scale factor of shape [1] and
+                dtype float32.
+            sf_vec_size (int, optional): Scale-factor vector size and alignment unit along K.
+                Supported/expected values:
+                - 16 (NVFP4 path; supported)
+                - 32 (MXFP4 path; not supported yet)
+                Defaults to 16.
+            sf_use_ue8m0 (bool, optional): Scale-factor encoding type.
+                False → UE4M3 (default), True → UE8M0.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]:
+                - self_fp4 (torch.Tensor): Packed FP4 tensor in E2M1x2 format of shape
+                  [B, M, K // 2] with dtype torch.uint8 (two FP4 lanes per byte).
+                - self_block_scale_factors (torch.Tensor): Block scale factors with dtype
+                  uint8 (UE4M3 or UE8M0), laid out as a flat buffer of shape
+                  [B, ceil(M / 128) * 128 * ceil(K / sf_vec_size / 4) * 4].
+
+        Notes:
+            - K must be even (because outputs pack two FP4 values per byte).
+            - For best performance, K should be a multiple of sf_vec_size; the scale-factor
+              buffer is aligned to sf_vec_size along K, pads M to multiples of 128, and
+              rounds (K / sf_vec_size) up to a multiple of 4 for storage.
+            - The batch dimension B is preserved for both outputs.
+        """
+        return module.fp4_batched_quantize(
+            input,
+            global_scale,
+            sf_vec_size,
+            sf_use_ue8m0,
+        )
+
+    @register_fake_op("flashinfer::fp4_batched_quantize_sm100")
+    def _fp4_batched_quantize_sm100(
+        input: torch.Tensor,
+        global_scale: Optional[torch.Tensor] = None,
+        sf_vec_size: int = 16,
+        sf_use_ue8m0: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        b, m, k = input.shape
+        return (
+            input.new_empty([b, m, k // 2], dtype=torch.int64),  # float4_e2m1_x2
+            input.new_empty([b, m * k // sf_vec_size], dtype=torch.int32),  # Scale factors
+        )
+
     @register_custom_op(
         "flashinfer::e2m1_and_ufp8sf_scale_to_float_sm100",
         mutates_args=(""),
@@ -307,6 +372,7 @@ def _fake_e2m1_and_ufp8sf_scale_to_float_sm100(
         block_scale_interleave_sm100=block_scale_interleave_sm100,
         e2m1_and_ufp8sf_scale_to_float_sm100=e2m1_and_ufp8sf_scale_to_float_sm100,
         mxfp4_dequantize_host=mxfp4_dequantize_host,
+        fp4_batched_quantize_sm100=fp4_batched_quantize_sm100,
     )


@@ -610,3 +676,30 @@ def mxfp4_dequantize_host(
         scale,
         group_size,
     )
+
+
+def nvfp4_batched_quantize(
+    a,
+    a_global_sf,
+    sf_vec_size=16,
+):
+    """
+    Quantize batched input tensor to NVFP4 format.
+
+    Parameters:
+        a (torch.Tensor): Input tensor of shape [B, M, K] with dtype fp16/bf16.
+        a_global_sf (torch.Tensor): Global scale factor of shape [1] with dtype float32.
+        sf_vec_size (int, optional): Scale factor vector size. Defaults to 16.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+            - Quantized tensor of shape [B, M, K/2] with dtype FLOAT4_E2M1X2
+            - Scale factors tensor with shape determined by layout and sf_vec_size
+    """
+    a_fp4, a_sf = get_fp4_quantization_module().fp4_batched_quantize_sm100(
+        a,
+        a_global_sf,
+        sf_vec_size,
+        False,
+    )
+    return a_fp4, a_sf
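For reference, a minimal usage sketch of the new nvfp4_batched_quantize wrapper, modeled on the test added below. It requires a GPU supported by the NVFP4 kernel, and the constants FLOAT8_E4M3_MAX = 448 and FLOAT4_E2M1_MAX = 6 are assumptions here, matching the (448 * 6) / self.abs().max() global-scale comment in fp4Quantize.cpp.

import torch
from flashinfer import nvfp4_batched_quantize

FLOAT8_E4M3_MAX = 448.0  # assumed constant, per the global-scale comment
FLOAT4_E2M1_MAX = 6.0    # assumed constant, per the global-scale comment

# [B, M, K] input; K must be a multiple of sf_vec_size (16 for NVFP4).
x = torch.randn(2, 128, 64, dtype=torch.bfloat16, device="cuda")
global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / x.abs().max().to(torch.float32)

x_fp4, x_sf = nvfp4_batched_quantize(x, global_scale)
# x_fp4: [2, 128, 32] uint8, two E2M1 values packed per byte
# x_sf:  [2, 512] uint8 block scale factors (ceil(128/128)*128 * ceil(64/16/4)*4 = 512)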

tests/test_fp4_quantize.py

Lines changed: 43 additions & 0 deletions
@@ -10,12 +10,14 @@
     fp4_quantize,
     mxfp4_quantize,
     mxfp4_dequantize,
+    nvfp4_batched_quantize,
 )
 from flashinfer.utils import is_sm100a_supported

 DTYPES = [torch.float16, torch.bfloat16]
 # The batch dimension doesn't need to be multiple of 128
 SHAPES = [(128, 64), (256, 128), (120, 64), (200, 256)]
+BATCH_SHAPES = [(2, 128, 64), (3, 256, 128), (1, 120, 64)]
 SEEDS = [42]
 CUDA_DEVICES = ["cuda:0"]

@@ -310,5 +312,46 @@ def test_mxfp4_quantize_roundtrip(device: str):
     )


+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("batch_shape", BATCH_SHAPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_nvfp4_batched_quantize(
+    dtype: torch.dtype,
+    batch_shape: tuple[int, int, int],
+    seed: int,
+    device: str,
+) -> None:
+    """Test nvfp4_batched_quantize function."""
+    if not is_sm100a_supported(torch.device(device)):
+        pytest.skip("Nvfp4 Requires compute capability of 10 or above")
+    torch.set_default_device(device)
+    torch.manual_seed(seed)
+
+    b, m, n = batch_shape
+    x = torch.randn(batch_shape, dtype=dtype)
+    tensor_amax = torch.abs(x).max().to(torch.float32)
+    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+
+    # Test the batched quantization
+    out, out_scale = nvfp4_batched_quantize(x, global_scale)
+
+    # Basic shape checks
+    assert out.shape == (b, m, n // 2), (
+        f"Expected shape {(b, m, n // 2)}, got {out.shape}"
+    )
+    assert out.dtype == torch.uint8, f"Expected uint8, got {out.dtype}"
+    assert out_scale.dtype == torch.uint8, f"Expected uint8, got {out_scale.dtype}"
+
+    # Compare with single tensor quantization for each batch
+    for i in range(b):
+        single_out, single_scale = fp4_quantize(x[i], global_scale, 16, False, True)
+        torch.testing.assert_close(out[i], single_out, rtol=1e-5, atol=1e-5)
+        torch.testing.assert_close(
+            out_scale[i], single_scale.flatten(), rtol=1e-5, atol=1e-5
+        )
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
