
Commit f765a2a

[Quantization] Add per-expert global scaling factor for fp4 batched quantize (#1835)
## 📌 Description

Add per-expert global scaling factor for fp4 batched quantize.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).
1 parent 40df947 commit f765a2a
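
Context for the change: before this commit, the batched NVFP4 quantize kernel read a single global scale factor (`SFScale[0]`) and applied it to every batch entry, so all experts shared one scale. With this commit, `SFScale` is indexed per batch (`SFScale[batchIdx]`), and the test computes one global scale per expert. Below is a minimal sketch of the per-expert scale computation, mirroring the updated test; the concrete constants 448.0 and 6.0 are the usual e4m3/e2m1 maxima and are an assumption here (the test imports them as `FLOAT8_E4M3_MAX` and `FLOAT4_E2M1_MAX`).

```python
# Sketch: per-expert global scale for NVFP4 batched quantization.
# Assumes x has shape (num_experts, m, n).
import torch

FLOAT8_E4M3_MAX = 448.0  # largest finite float8 e4m3 value (assumed constant)
FLOAT4_E2M1_MAX = 6.0    # largest finite float4 e2m1 value (assumed constant)

x = torch.randn(3, 256, 128, dtype=torch.bfloat16)

# Before this change: one scale for the whole batched tensor.
tensor_amax_global = torch.abs(x).max().to(torch.float32)
global_scale_shared = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax_global

# After this change: one scale per expert (per batch entry).
tensor_amax_per_expert = torch.abs(x).amax(dim=(1, 2)).to(torch.float32)
global_scale_per_expert = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax_per_expert
print(global_scale_per_expert.shape)  # torch.Size([3]) -- one entry per expert
```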

File tree

2 files changed: +5 −5 lines changed

csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh

Lines changed: 1 addition & 1 deletion

@@ -795,7 +795,6 @@ __device__ inline void quantize_with_block_size_impl(int32_t numbatches, int32_t
   static constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / ELTS_PER_THREAD;
   static_assert(sizeof(PackedVec) == sizeof(Type) * ELTS_PER_THREAD, "Vec size is not matched.");

-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
   bool isSfSwizzledLayout = layout == QuantizationSFLayout::SWIZZLED_128x4 ||
                             layout == QuantizationSFLayout::SWIZZLED_8x4;
   int rowTile = (layout == QuantizationSFLayout::SWIZZLED_128x4) ? 128 : 8;
@@ -810,6 +809,7 @@ __device__ inline void quantize_with_block_size_impl(int32_t numbatches, int32_t
   asm volatile("griddepcontrol.wait;");
   for (int rowIdx = blockIdx.x; rowIdx < numPaddedRowsForSf; rowIdx += gridDim.x) {
     for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) {
+      float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[batchIdx];
       for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x) {
         std::optional<int> optionalBatchIdx = batchIdx;
         std::optional<int> optionalNumRows = numRows;
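
In effect, the `SFScaleVal` read moves from outside the loops (where `SFScale[0]` was reused for every batch) into the per-batch loop, so each expert's scale factors are derived from its own global scale. A rough Python rendering of that selection logic follows, as a conceptual sketch only; the real per-block FP4 conversion in the CUDA kernel is elided behind a stand-in `quantize_rows` callable.

```python
# Conceptual sketch of the indexing change (not the actual CUDA kernel).
# sf_scale is the optional per-expert global scale array passed to the kernel.
def select_sf_scale(sf_scale, batch_idx):
    # Old behaviour: sf_scale[0] reused for every batch.
    # New behaviour: one value per batch/expert.
    return 1.0 if sf_scale is None else float(sf_scale[batch_idx])

def quantize_batched(x, sf_scale, quantize_rows):
    """x: list of per-expert row blocks; quantize_rows: stand-in for the
    real per-block FP4 conversion, which is elided here."""
    out = []
    for batch_idx, rows in enumerate(x):
        sf_scale_val = select_sf_scale(sf_scale, batch_idx)  # was sf_scale[0]
        out.append(quantize_rows(rows, sf_scale_val))
    return out
```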

tests/utils/test_fp4_quantize.py

Lines changed: 4 additions & 4 deletions

@@ -18,7 +18,7 @@
 DTYPES = [torch.float16, torch.bfloat16]
 # The batch dimension doesn't need to be multiple of 128
 SHAPES = [(128, 64), (256, 128), (120, 64), (200, 256)]
-BATCH_SHAPES = [(2, 128, 64), (3, 256, 128), (1, 120, 64)]
+BATCH_SHAPES = [(1, 256, 128), (2, 128, 64), (3, 256, 128), (1, 120, 64)]
 SEEDS = [42]
 CUDA_DEVICES = ["cuda:0"]
@@ -334,7 +334,7 @@ def test_nvfp4_batched_quantize(

     b, m, n = batch_shape
     x = torch.randn(batch_shape, dtype=dtype)
-    tensor_amax = torch.abs(x).max().to(torch.float32)
+    tensor_amax = torch.abs(x).amax(dim=(1, 2)).to(torch.float32)
     global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
     mask = None
     # Test the batched quantization
@@ -357,7 +357,7 @@ def test_nvfp4_batched_quantize(

     # Compare with single tensor quantization for each batch
     for i in range(b):
-        single_out, single_scale = fp4_quantize(x[i], global_scale, 16, False, True)
+        single_out, single_scale = fp4_quantize(x[i], global_scale[i], 16, False, True)
         if use_mask:
             torch.testing.assert_close(
                 out[i][: mask[i]], single_out[: mask[i]], rtol=1e-5, atol=1e-5
@@ -414,7 +414,7 @@ def test_silu_and_mul_nvfp4_batched_quantize(
     for i in range(b):
         x_silu_mul = silu_and_mul(x[i])
         single_out, single_scale = fp4_quantize(
-            x_silu_mul, global_scale, 16, False, True
+            x_silu_mul, global_scale[i], 16, False, True
         )
         torch.testing.assert_close(
             out[i][: mask[i]], single_out[: mask[i]], rtol=1e-5, atol=1e-5
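
The reference checks in both tests follow the same per-expert convention: each slice `x[i]` is quantized on its own with `global_scale[i]` and compared against the corresponding slice of the batched output. Below is a hedged sketch of that check; `quantize_fn` stands in for the test's `fp4_quantize(x, scale, 16, False, True)` call, and the batched result `out` is assumed to come from the library's batched quantize entry point, which is not shown in this diff.

```python
import torch

def check_against_single_tensor(x, out, global_scale, quantize_fn, mask=None):
    """Compare a batched quantization result against per-expert reference calls.

    x: (b, m, n) input, out: batched quantized output,
    global_scale: per-expert scale vector of length b,
    quantize_fn: stand-in for the test's fp4_quantize(x_i, scale_i, 16, False, True).
    """
    b = x.shape[0]
    for i in range(b):
        # Quantize one expert at a time with its own global scale.
        single_out, single_scale = quantize_fn(x[i], global_scale[i])
        rows = x.shape[1] if mask is None else mask[i]
        torch.testing.assert_close(
            out[i][:rows], single_out[:rows], rtol=1e-5, atol=1e-5
        )
```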
