Commit e569912
feat: Add alignment in MxFP8Quantization (#1445)
## 📌 Description

Most of the code is from trtllm.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent 160e4b8 commit e569912

File tree: 7 files changed, +157 −43 lines

csrc/nv_internal/cpp/kernels/quantization.cu

Lines changed: 11 additions & 10 deletions
@@ -74,15 +74,15 @@ template void invokeQuantization<__nv_bfloat16>(int8_t* dst, __nv_bfloat16 const
 // MXFP8 Quantization

 template <typename T>
-void invokeMxFP8Quantization(int b, int m, int n, T const* input, int64_t* output, int32_t* SFOuput,
-                             FP4QuantizationSFLayout layout, int multiProcessorCount,
-                             cudaStream_t stream) {
+void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output,
+                             int32_t* SFOuput, FP4QuantizationSFLayout layout,
+                             int multiProcessorCount, cudaStream_t stream) {
   // Fixed SF_VEC_SIZE as 32
   static constexpr int SF_VEC_SIZE = 32;

   // Grid, Block size.
   // Each thread converts 8 values.
-  dim3 block(std::min(int(n / CVT_FP4_ELTS_PER_THREAD), 512));
+  dim3 block(std::min(int(padded_n / CVT_FP4_ELTS_PER_THREAD), 512));
   // Get number of blocks per SM (assume we can fully utilize the SM).
   int const numBlocksPerSM = std::max(1u, 2048u / block.x);
   dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));

@@ -101,7 +101,7 @@ void invokeMxFP8Quantization(int b, int m, int n, T const* input, int64_t* outpu
   cudaLaunchKernelEx(
       &config,
       quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_MXFP8, T, SF_VEC_SIZE, true>, b,
-      m, n, input, nullptr, reinterpret_cast<uint32_t*>(output),
+      m, n, padded_n, input, nullptr, reinterpret_cast<uint32_t*>(output),
       reinterpret_cast<uint32_t*>(SFOuput), layout);
 }

@@ -163,7 +163,7 @@ INSTANTIATE_INVOKE_PER_TOKEN_QUANTIZATION(__nv_bfloat16, __nv_fp8_e4m3);
 #endif

 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// FP4 Quantization
+// FP4/MXFP8 Quantization

 template <typename T, int SF_VEC_SIZE>
 void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, int64_t* output,

@@ -355,9 +355,10 @@ template void invokeBatchedFP4Quantization<half, 16>(
 template void invokeBatchedFP4Quantization<half, 32>(
     int b, int m, int n, half const* input, float const* SFScale, int64_t* output, int32_t* SFOuput,
     bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
-template void invokeMxFP8Quantization<half>(int b, int m, int n, half const* input, int64_t* output,
-                                            int32_t* SFOuput, FP4QuantizationSFLayout layout,
-                                            int multiProcessorCount, cudaStream_t stream);
+template void invokeMxFP8Quantization<half>(int b, int m, int n, int padded_n, half const* input,
+                                            int64_t* output, int32_t* SFOuput,
+                                            FP4QuantizationSFLayout layout, int multiProcessorCount,
+                                            cudaStream_t stream);
 #ifdef ENABLE_BF16
 template void invokeFP4Quantization<__nv_bfloat16, 16>(int m, int n, __nv_bfloat16 const* input,
                                                        float const* SFScale, int64_t* output,

@@ -379,7 +380,7 @@ template void invokeBatchedFP4Quantization<__nv_bfloat16, 32>(
     int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
     int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
     cudaStream_t stream);
-template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n,
+template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n, int padded_n,
                                                      __nv_bfloat16 const* input, int64_t* output,
                                                      int32_t* SFOuput,
                                                      FP4QuantizationSFLayout layout,
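For reference, the launch shape derived from the new `padded_n` parameter is a few lines of integer arithmetic. The sketch below is not part of the commit; it assumes `CVT_FP4_ELTS_PER_THREAD` is 8 (the comment above says each thread converts 8 values) and uses the same 2048-threads-per-SM occupancy assumption as the launcher.

```python
# Sketch of the grid/block sizing in invokeMxFP8Quantization (assumptions noted above).
def mxfp8_launch_shape(m: int, padded_n: int, multi_processor_count: int):
    elts_per_thread = 8  # assumed value of CVT_FP4_ELTS_PER_THREAD
    block_x = min(padded_n // elts_per_thread, 512)
    num_blocks_per_sm = max(1, 2048 // block_x)  # "assume we can fully utilize the SM"
    grid_x = min(m, multi_processor_count * num_blocks_per_sm)
    return grid_x, block_x

# Example: a [1024, 1568] input padded to 1664 columns on a hypothetical 132-SM GPU.
print(mxfp8_launch_shape(1024, 1664, 132))  # (1024, 208)
```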

csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh

Lines changed: 34 additions & 21 deletions
@@ -924,8 +924,8 @@ __launch_bounds__(512, 4) quantize_with_block_size(
 #else
 quantize_with_block_size(
 #endif
-    int32_t numbatches, int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout, FP4QuantizationSFLayout layout) {
+    int32_t numbatches, int32_t numRows, int32_t numCols, int32_t numPaddedCols, Type const* in,
+    float const* SFScale, uint32_t* out, uint32_t* SFout, FP4QuantizationSFLayout layout) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)

   // The elements per thread.

@@ -941,46 +941,59 @@ quantize_with_block_size(
   // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
   float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];

-  int numPaddedRows = numRows;
-  int numPaddedCols = numCols;
-  if (layout == FP4QuantizationSFLayout::SWIZZLED_128x4) {
-    // The number of padded rows considering 128x4 SF layout.
-    numPaddedRows = PadUpFn(numRows, 128);
-    numPaddedCols = PadUpFn(numCols, 4 * SF_VEC_SIZE);
-  } else if (layout == FP4QuantizationSFLayout::SWIZZLED_8x4) {
-    // The number of padded rows considering 8x4 SF layout.
-    numPaddedRows = PadUpFn(numRows, 8);
-    numPaddedCols = PadUpFn(numCols, 4 * SF_VEC_SIZE);
-  }
+  // Is it swizzled layout?
+  bool isSfSwizzledLayout = layout == FP4QuantizationSFLayout::SWIZZLED_128x4 ||
+                            layout == FP4QuantizationSFLayout::SWIZZLED_8x4;
+
+  // The number of padded rows considering 128x4 SF layout.
+  int numPaddedRowsForSf = isSfSwizzledLayout ? PadUpFn(numRows, 128) : numRows;
+  int numColsForSf = isSfSwizzledLayout ? PadUpFn(numPaddedCols, 4 * SF_VEC_SIZE) : numPaddedCols;

-  // The number of threads in the column dimension
+  // The number of threads in the column dimension.
+  // Note that numCols/numPaddedCols/numColsForSf are guaranteed to be multiples of ELTS_PER_THREAD.
   int numColThreads = numCols / ELTS_PER_THREAD;
   int numPaddedColThreads = numPaddedCols / ELTS_PER_THREAD;
+  int numColThreadsForSf = numColsForSf / ELTS_PER_THREAD;

   asm volatile("griddepcontrol.wait;");
   // Input tensor batch/row/col loops.
-  for (int rowIdx = blockIdx.x; rowIdx < numPaddedRows; rowIdx += gridDim.x) {
+  for (int rowIdx = blockIdx.x; rowIdx < numPaddedRowsForSf; rowIdx += gridDim.x) {
     for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) {
-      for (int colIdx = threadIdx.x; colIdx < numPaddedColThreads; colIdx += blockDim.x) {
+      for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x) {
         std::optional<int> optionalBatchIdx = batchIdx;
         std::optional<int> optionalNumRows = numRows;

         // The SF output pointer.
         auto sf_out =
             cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_NUM_THREADS_PER_SF, SF_VEC_SIZE>(
-                optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numCols, SFout, layout);
+                optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numPaddedCols, SFout, layout);
+
+        // The input tensor offset.
+        int64_t inOffset =
+            static_cast<int64_t>(batchIdx * numRows + rowIdx) * numColThreads + colIdx;
+        int64_t outOffset =
+            static_cast<int64_t>(batchIdx * numRows + rowIdx) * numPaddedColThreads + colIdx;
+
+        // Set the values to 0 of those are padded columns.
+        if (rowIdx < numRows && colIdx >= numColThreads && colIdx < numPaddedColThreads) {
+          // Dispatch the quantization kernel.
+          if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) {
+            reinterpret_cast<uint32_t*>(out)[outOffset] = 0u;
+          } else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4 ||
+                               quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8) {
+            reinterpret_cast<uint64_t*>(out)[outOffset] = 0ull;
+          }
+        }

         // Set the SF padding to 0.
         if (rowIdx >= numRows || colIdx >= numColThreads) {
+          // Set the SF padding to 0.
           if (sf_out != nullptr) {
            sf_out[0] = 0x00;
           }
         } else {
-          int64_t inOffset =
-              static_cast<int64_t>(batchIdx * numRows + rowIdx) * numColThreads + colIdx;
+          // Load the input vector.
           PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-          // Get the output tensor offset as a packed vector.
-          int64_t outOffset = inOffset;

           // Dispatch the quantization kernel.
           if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) {
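The net effect of the kernel changes above: the loops now run over the scale-factor-padded extents (rows padded to a multiple of 128 for the swizzled layouts, columns to a multiple of 4 * SF_VEC_SIZE), the alignment-padded output columns are written as zeros, and the padded scale-factor entries are zeroed. The extent arithmetic can be sketched in a few lines; this is an illustration, not code from the commit.

```python
SF_VEC_SIZE = 32  # fixed for MXFP8 (see quantization.cu above)

def pad_up(x: int, multiple: int) -> int:
    return (x + multiple - 1) // multiple * multiple

def sf_loop_extents(num_rows: int, num_padded_cols: int, is_sf_swizzled: bool):
    # Mirrors numPaddedRowsForSf / numColsForSf computed by the kernel.
    rows_for_sf = pad_up(num_rows, 128) if is_sf_swizzled else num_rows
    cols_for_sf = pad_up(num_padded_cols, 4 * SF_VEC_SIZE) if is_sf_swizzled else num_padded_cols
    return rows_for_sf, cols_for_sf

# Example: 1000 rows, 1664 alignment-padded columns, swizzled SF layout.
print(sf_loop_extents(1000, 1664, True))  # (1024, 1664)
```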

csrc/nv_internal/tensorrt_llm/kernels/quantization.h

Lines changed: 3 additions & 3 deletions
@@ -94,9 +94,9 @@ void invokeNVFP4BlockScaleInterleaveReverse(int b, int m, int n, uint8_t const*
                                             cudaStream_t stream = 0);

 template <typename T>
-void invokeMxFP8Quantization(int b, int m, int n, T const* input, int64_t* output, int32_t* SFOuput,
-                             FP4QuantizationSFLayout layout, int multiProcessorCount,
-                             cudaStream_t stream = 0);
+void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output,
+                             int32_t* SFOuput, FP4QuantizationSFLayout layout,
+                             int multiProcessorCount, cudaStream_t stream = 0);

 }  // namespace kernels
 }  // namespace tensorrt_llm

csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp

Lines changed: 10 additions & 5 deletions
@@ -18,6 +18,8 @@

 #include <ATen/cuda/EmptyTensor.h>

+#include <cstdint>
+
 #include "cutlass/numeric_types.h"
 #include "pytorch_extension_utils.h"
 #include "tensorrt_llm/thop/thUtils.h"

@@ -27,8 +29,10 @@ namespace torch_ext {
 // input: [M, K], fp32/fp16/bf16/fp8_quantized
 // isSfSwizzledLayout: bool, if true, the scale factors are stored in swizzled layout, otherwise in
 // linear layout. See FP4QuantizationSFLayout enum for more details about the two layouts.
+// alignment: sfVecSize
 // returns
-std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwizzledLayout) {
+std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwizzledLayout,
+                                                  int64_t alignment) {
   CHECK_TH_CUDA(input);
   CHECK_CONTIGUOUS(input);

@@ -43,17 +47,18 @@ std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwi
   auto const k = inputShape[rank - 1];
   int32_t const sfVecSize = 32;
   TORCH_CHECK(k % sfVecSize == 0);
+  auto const padded_k = ((k + alignment - 1) / alignment) * alignment;

   std::vector<int64_t> outputShape(inputShape.begin(), inputShape.end());
-  outputShape[rank - 1] = k;
+  outputShape[rank - 1] = padded_k;

   at::Tensor valueFP8 =
       at::detail::empty_cuda(outputShape, at::ScalarType::Float8_e4m3fn, input.device(),
                              /* stride */ std::nullopt);

   int64_t SFSize = isSfSwizzledLayout
-                       ? tensorrt_llm::computeFP4SwizzledLayoutSFSize(m, k / sfVecSize)
-                       : tensorrt_llm::computeFP4LinearLayoutSFSize(m, k / sfVecSize);
+                       ? tensorrt_llm::computeFP4SwizzledLayoutSFSize(m, padded_k / sfVecSize)
+                       : tensorrt_llm::computeFP4LinearLayoutSFSize(m, padded_k / sfVecSize);

   at::Tensor scaleFP8SF = at::detail::empty_cuda({SFSize}, SF_DTYPE, input.device(),
                                                  /* stride */ std::nullopt);  // 1D tensor

@@ -65,7 +70,7 @@ std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwi

 #define LAUNCH_MXFP8_QUANTIZE_KERNEL(T)                                                    \
   tensorrt_llm::kernels::invokeMxFP8Quantization<T>(                                       \
-      1, m, k, reinterpret_cast<T*>(input.data_ptr()),                                     \
+      1, m, k, padded_k, reinterpret_cast<T*>(input.data_ptr()),                           \
       reinterpret_cast<int64_t*>(valueFP8.data_ptr()),                                     \
       reinterpret_cast<int32_t*>(scaleFP8SF.data_ptr()), layout, mMultiProcessorCount,     \
       at::cuda::getCurrentCUDAStream(input.get_device()));
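On the host side, `padded_k` and the scale-factor buffer size follow from `alignment` and the fixed `sfVecSize` of 32. The sketch below reproduces that arithmetic; the `padded_k` formula is taken from the diff, while the swizzled SF size (rows padded to 128, SF columns to a multiple of 4) is an assumption based on the 128x4 layout handling in quantization.cuh, not a transcription of `computeFP4SwizzledLayoutSFSize`.

```python
def ceil_to(x: int, multiple: int) -> int:
    return (x + multiple - 1) // multiple * multiple

def mxfp8_output_shapes(m: int, k: int, alignment: int, is_sf_swizzled: bool):
    sf_vec_size = 32
    assert k % sf_vec_size == 0
    padded_k = ceil_to(k, alignment)   # last dim of the FP8 output tensor
    sf_cols = padded_k // sf_vec_size  # one scale factor per 32 elements
    if is_sf_swizzled:
        sf_size = ceil_to(m, 128) * ceil_to(sf_cols, 4)  # assumed 128x4 swizzled layout
    else:
        sf_size = m * sf_cols                            # linear layout
    return (m, padded_k), sf_size

# Example: a [1024, 1568] input with alignment=128.
print(mxfp8_output_shapes(1024, 1568, 128, True))  # ((1024, 1664), 53248)
```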

csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.h

Lines changed: 3 additions & 1 deletion
@@ -62,9 +62,11 @@ inline int computeSFIndex(int rowIdx, int colIdx, int totalRow, int totalColumn,
 // input: [M, K], fp16/bf16_quantized
 // isSfSwizzledLayout: bool, if true, the scale factors are stored in swizzled layout, otherwise in
 // linear layout. See FP4QuantizationSFLayout enum for more details about the two layouts.
+// alignment: sfVecSize
 // returns fp8_quantized and block_scale_factors.
 std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input,
-                                                  bool is_sf_swizzled_layout = true);
+                                                  bool is_sf_swizzled_layout = true,
+                                                  int64_t alignment = 32);

 // x_fp32: [M, K], fp32_quantized (on the host)
 // isSfSwizzledLayout: bool, if true, the scale factors are stored in swizzled layout, otherwise in

flashinfer/fp8_quantization.py

Lines changed: 6 additions & 3 deletions
@@ -49,13 +49,14 @@ def get_mxfp8_quantization_sm100_module():
     def mxfp8_quantize_sm100(
         input: torch.Tensor,
         is_sf_swizzled_layout: bool = True,
+        alignment: int = 32,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Quantize input tensor to MxFP8 format.

         Args:
             input (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16/fp8_quantized.
             is_sf_swizzled_layout (bool, optional): Whether to use swizzled layout for scale factors. Defaults to True.
-
+            alignment (int, optional): sfVecSize. Defaults to 32. Note that alignment is not used in the host kernel.
         Returns:
             Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
                 - Quantized tensor of shape [M, K] with dtype FLOAT8_E4M3

@@ -70,6 +71,7 @@ def mxfp8_quantize_sm100(
         return module.mxfp8_quantize(
             input,
             is_sf_swizzled_layout,
+            alignment,
         )

     @register_fake_op("flashinfer::mxfp8_quantize_sm100")

@@ -126,6 +128,7 @@ def _fake_mxfp8_dequantize_host_sm100(
 def mxfp8_quantize(
     input: torch.Tensor,
     is_sf_swizzled_layout: bool = True,
+    alignment: int = 32,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Quantize input tensor to MxFP8 format.

@@ -135,7 +138,7 @@ def mxfp8_quantize(
     Args:
         input (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16/fp8_quantized.
        is_sf_swizzled_layout (bool, optional): Whether to use swizzled layout for scale factors. Defaults to True.
-
+        alignment (int, optional): sfVecSize. Defaults to 32.
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
             - Quantized tensor of shape [M, K] with dtype FLOAT8_E4M3

@@ -147,8 +150,8 @@ def mxfp8_quantize(
     x_q, sf = get_mxfp8_quantization_sm100_module().mxfp8_quantize_sm100(
         input,
         is_sf_swizzled_layout,
+        alignment,
     )
-    sf = sf.reshape((-1, input.shape[-1] // sf_vec_size))
     return x_q, sf
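From the Python side, the new parameter is exposed as a third argument to `mxfp8_quantize`. A usage sketch (assuming a CUDA build of FlashInfer and that `mxfp8_quantize` is imported from `flashinfer.fp8_quantization`, the file shown above):

```python
import torch
from flashinfer.fp8_quantization import mxfp8_quantize

x = torch.randn(16, 1568, dtype=torch.bfloat16, device="cuda")
# Pad the quantized output's last dimension up to a multiple of `alignment`;
# padded columns and their scale factors are zero-filled by the device kernel.
x_fp8, sf = mxfp8_quantize(x, is_sf_swizzled_layout=True, alignment=128)
print(x_fp8.shape)  # torch.Size([16, 1664]), dtype torch.float8_e4m3fn
```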

tests/test_fp8_quantize.py

Lines changed: 90 additions & 0 deletions
@@ -50,5 +50,95 @@ def check_accuracy(a, b, atol, rtol, percent):
     check_accuracy(a_pt, a, 8, 0, 0.999)


+def mxfp8_quantize_check_accuracy(a, b, atol, rtol, percent):
+    if torch.any(torch.isnan(a)):
+        raise Exception("NaN in a")
+    if torch.any(torch.isnan(b)):
+        raise Exception("NaN in b")
+    assert a.shape == b.shape
+    left = torch.abs(a - b)
+    right = atol + rtol * torch.abs(b)
+    count = torch.sum(left > right)
+    mismatch_percent = count / a.numel()
+    if mismatch_percent > 1 - percent:
+        raise Exception(
+            "Mismatch percentage is %f for rtol %f" % (mismatch_percent, rtol)
+        )
+
+
+@pytest.mark.parametrize("m", [1, 2, 16, 1024])
+@pytest.mark.parametrize("k", [512, 1024])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@pytest.mark.parametrize("is_sf_swizzled_layout", [True, False])
+def test_mxfp8_quantize_torch_host(m, k, dtype, is_sf_swizzled_layout):
+    torch.random.manual_seed(0)
+    a = (torch.randn([m, k], dtype=torch.float) * 16).cpu().contiguous()
+
+    a_fp8, a_sf = mxfp8_quantize(a, is_sf_swizzled_layout)
+
+    a_pt = mxfp8_dequantize_host(
+        a_fp8.view(torch.uint8), a_sf.view(torch.uint8), is_sf_swizzled_layout
+    )
+
+    torch.cuda.synchronize()
+
+    mxfp8_quantize_check_accuracy(a_pt, a, 8, 0, 0.999)
+
+
+@pytest.mark.parametrize("m", [1, 2, 16, 1024])
+@pytest.mark.parametrize("k", [512, 1024])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@pytest.mark.parametrize("is_sf_swizzled_layout", [True, False])
+def test_mxfp8_quantize_torch_device(m, k, dtype, is_sf_swizzled_layout):
+    torch.random.manual_seed(0)
+    a = (torch.randn([m, k], dtype=torch.float) * 16).to(dtype).cuda().contiguous()
+
+    a_fp8, a_sf = mxfp8_quantize(a, is_sf_swizzled_layout, 32)
+    a_pt = mxfp8_dequantize_host(
+        a_fp8.cpu().view(torch.uint8),
+        a_sf.cpu().view(torch.uint8),
+        is_sf_swizzled_layout,
+    )
+
+    torch.cuda.synchronize()
+    mxfp8_quantize_check_accuracy(
+        a_pt.cpu().to(torch.float32), a.cpu().to(torch.float32), 8, 0, 0.999
+    )
+
+
+@pytest.mark.parametrize("m", [1, 2, 16, 1024])
+@pytest.mark.parametrize("k", [1568])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@pytest.mark.parametrize("is_sf_swizzled_layout", [True, False])
+@pytest.mark.parametrize("alignment", [64, 128])
+def test_mxfp8_quantize_alignment_torch_device(
+    m, k, dtype, is_sf_swizzled_layout, alignment
+):
+    torch.random.manual_seed(0)
+    a = (torch.randn([m, k], dtype=torch.float) * 16).to(dtype).cuda().contiguous()
+    padded_k = ((k + alignment - 1) // alignment) * alignment
+
+    # Quantize it on device.
+    a_fp8, a_sf = mxfp8_quantize(a, is_sf_swizzled_layout, alignment)
+    assert a_fp8.shape[1] == padded_k
+
+    # Dequantize it on host.
+    a_pt = mxfp8_dequantize_host(
+        a_fp8.cpu().view(torch.uint8),
+        a_sf.cpu().view(torch.uint8),
+        is_sf_swizzled_layout,
+    )
+
+    # Check if the bits of paddings are zero.
+    paddings = a_fp8.view(torch.int8)[:, k:]
+    assert torch.all(paddings == 0), "Paddings should be zero"
+
+    torch.cuda.synchronize()
+
+    mxfp8_quantize_check_accuracy(
+        a_pt[:, :k].cpu().to(torch.float32), a.cpu().to(torch.float32), 8, 0, 0.999
+    )
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
