Commit 34bcee8

Fixed FP8 fake quantization to use fp32 amax scaling
Signed-off-by: realAsma <[email protected]>
1 parent ad091e8 commit 34bcee8
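
For context, fake E4M3 quantization with amax scaling maps the observed absolute maximum onto the E4M3 max-normal value 448, rounds through FP8, and rescales back; this commit moves that scale computation to fp32. Below is a minimal reference sketch of the intended numerics (illustration only, not the CUDA extension; it assumes a PyTorch build that provides torch.float8_e4m3fn):

import torch

def fake_e4m3_reference(x: torch.Tensor, amax: torch.Tensor) -> torch.Tensor:
    # Scale computed in fp32 so that amax maps onto the E4M3 max-normal value 448.
    scale = 448.0 / amax.float()
    # Round-trip through FP8 E4M3, then undo the scaling.
    xq = (x.float() * scale).to(torch.float8_e4m3fn)
    return (xq.float() / scale).to(x.dtype)

x = torch.randn(4, 8)
y = fake_e4m3_reference(x, x.abs().amax())  # per-tensor amax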

File tree: 5 files changed (+66 / -115 lines)

  modelopt/torch/quantization/src/tensor_quant.h
  modelopt/torch/quantization/src/tensor_quant_fp8.cpp
  modelopt/torch/quantization/src/tensor_quant_gpu_fp8.cu
  modelopt/torch/quantization/tensor_quant.py
  tests/gpu/torch/quantization/test_tensor_quant_cuda.py

modelopt/torch/quantization/src/tensor_quant.h

Lines changed: 0 additions & 1 deletion
@@ -22,7 +22,6 @@ void fake_tensor_quant_cuda_inplace(at::Tensor, at::Tensor, int, bool, bool);
 at::Tensor fake_tensor_quant_cuda(at::Tensor, at::Tensor, int, bool, bool);
 at::Tensor fake_tensor_quant_with_axis_cuda(at::Tensor, at::Tensor, int, int, bool, bool);
 float bits_to_bound(int, int);
-at::Tensor fake_e4m3fy_cuda(at::Tensor inputs);
 
 // Dequantizes data using NF4 quantization scheme and per-block scaling factors.
 //

modelopt/torch/quantization/src/tensor_quant_fp8.cpp

Lines changed: 18 additions & 11 deletions
@@ -19,30 +19,37 @@
 #include <cuda_fp8.h>
 #include <torch/extension.h>
 
-at::Tensor fake_e4m3fy_cuda(at::Tensor inputs);
-at::Tensor fused_fake_e4m3fy_cuda(at::Tensor inputs, at::Tensor amax, const float zero_threshold);
+at::Tensor fake_e4m3fy_cuda(at::Tensor inputs, at::Tensor amax);
+at::Tensor fake_e4m3fy_cuda_with_axis(at::Tensor inputs, at::Tensor amax, int axis);
 
-at::Tensor fake_e4m3fy(at::Tensor inputs) {
+at::Tensor fake_e4m3fy(at::Tensor inputs, at::Tensor amax) {
+  TORCH_CHECK(amax.numel(), 1);
+  inputs = inputs.contiguous();
+  auto amax_view = amax.view(-1).to(at::kFloat);
   if (inputs.is_cuda()) {
-    return fake_e4m3fy_cuda(inputs.contiguous());
+    return fake_e4m3fy_cuda(inputs, amax_view);
   } else {
     TORCH_CHECK(inputs.dtype() == at::ScalarType::Float);
-    TORCH_CHECK(inputs.is_contiguous());
+    float scale = 448.f / amax_view[0].item<float>();
+    float inv_scale = 1.f / scale;
     auto out = at::zeros_like(inputs);
     for (int i = 0; i < inputs.numel(); ++i) {
       out.data_ptr<float>()[i] =
-          static_cast<float>(static_cast<__nv_fp8_e4m3>(inputs.data_ptr<float>()[i]));
+          static_cast<float>(static_cast<__nv_fp8_e4m3>(inputs.data_ptr<float>()[i] * scale)) *
+          inv_scale;
     }
     return out;
   }
 }
 
-at::Tensor fused_fake_e4m3fy(at::Tensor inputs, at::Tensor amax, const float zero_threshold) {
-  return fused_fake_e4m3fy_cuda(inputs.contiguous(), amax, zero_threshold);
+at::Tensor fake_e4m3fy_with_axis(at::Tensor inputs, at::Tensor amax, int axis) {
+  TORCH_CHECK(inputs.is_cuda());
+  return fake_e4m3fy_cuda_with_axis(inputs.contiguous(), amax.contiguous().to(at::kFloat), axis);
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("fake_e4m3fy", &fake_e4m3fy, "Reduce precision to E4M3", py::arg("inputs"));
-  m.def("fused_fake_e4m3fy", &fused_fake_e4m3fy, "Reduce precision to E4M3 (fused)",
-        py::arg("inputs"), py::arg("amax"), py::arg("zero_threshold"));
+  m.def("fake_e4m3fy", &fake_e4m3fy, "Reduce precision to E4M3", py::arg("inputs"),
+        py::arg("amax"));
+  m.def("fake_e4m3fy_with_axis", &fake_e4m3fy_with_axis, "Reduce precision to E4M3 (fused)",
+        py::arg("inputs"), py::arg("amax"), py::arg("axis"));
 }
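
Both rebound entry points now take an explicit amax. A rough usage sketch from Python (assuming the extension is loaded through the repo's get_cuda_ext_fp8 helper, as tensor_quant.py does below):

import torch
from modelopt.torch.quantization.extensions import get_cuda_ext_fp8

cuda_ext_fp8 = get_cuda_ext_fp8(raise_if_failed=True)
x = torch.randn(16, 32).cuda()

# Per-tensor: amax has a single element.
y = cuda_ext_fp8.fake_e4m3fy(x, x.abs().amax())

# Per-channel along dim 1: a 1-D amax plus the axis index (CUDA inputs only).
amax = x.abs().amax(dim=0)
y_axis = cuda_ext_fp8.fake_e4m3fy_with_axis(x, amax, 1)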

modelopt/torch/quantization/src/tensor_quant_gpu_fp8.cu

Lines changed: 27 additions & 54 deletions
@@ -31,92 +31,65 @@
 #define AT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 
-template <typename T> __global__ void fake_e4m3fy_kernel(const T *inputs, size_t n, T *outputs) {
+template <typename T>
+__global__ void fake_e4m3fy_kernel(const T *inputs, size_t n, const float *scale,
+                                   const float *inv_scale, T *outputs) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int idx = 4 * tid; idx < 4 * (tid + 1) && idx < n; ++idx) {
     outputs[idx] = static_cast<T>(
-        static_cast<float>(static_cast<__nv_fp8_e4m3>(static_cast<float>(inputs[idx]))));
+        static_cast<float>(static_cast<__nv_fp8_e4m3>(static_cast<float>(inputs[idx]) * scale[0])) *
+        inv_scale[0]);
   }
 }
 
 template <typename T>
-__global__ void fused_fake_e4m3fy_kernel(const T *inputs, size_t n, float *amax,
-                                         bool per_block_scaling_factor, size_t blocksize,
-                                         float zero_threshold, T *outputs) {
+__global__ void fake_e4m3fy_with_axis_cuda_kernel(const T *inputs, size_t n, const float *scale,
+                                                  const float *inv_scale, int axis_size,
+                                                  int outer_size, T *outputs) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
   for (int idx = 4 * tid; idx < 4 * (tid + 1) && idx < n; ++idx) {
     float x = static_cast<float>(inputs[idx]);
 
-    // generate mask for zeroing tiny values
-    float x_abs = fabsf(x);
-    bool zero_mask = x_abs < zero_threshold;
-
-    // grab the global scaling factor
-    size_t amax_idx = (per_block_scaling_factor) ? (idx / blocksize) : 0;
-
-    // compute scale and inverse-scales
-    float scale = 448.f / (amax[amax_idx]);
-    float inv_scale = 1.f / scale;
+    int axis_id = (idx / outer_size) % axis_size;
 
     // compute the output
-    float output = static_cast<float>(static_cast<__nv_fp8_e4m3>(scale * x)) * inv_scale;
-
-    // zero out small values
-    if (zero_mask) {
-      output = 0.f;
-    }
+    float output =
+        static_cast<float>(static_cast<__nv_fp8_e4m3>(scale[axis_id] * x)) * inv_scale[axis_id];
 
     outputs[idx] = output;
   }
 }
 
-at::Tensor fused_fake_e4m3fy_cuda(at::Tensor inputs, at::Tensor amax, const float zero_threshold) {
-  size_t numel = inputs.numel();
+at::Tensor fake_e4m3fy_cuda_with_axis(at::Tensor inputs, at::Tensor amax, int axis) {
   auto outputs = torch::empty_like(inputs);
+  size_t numel = inputs.numel();
+  int axis_size = inputs.size(axis);
+  int outer_size = inputs.stride(axis);
 
-  bool per_block_scaling_factor = false;
-  size_t blocksize = numel;
-
-  int amax_ndim = amax.dim();
-  int input_ndim = inputs.dim();
-
-  // 3 options:
-  // 1.
-  //    inputs[numel], amax[1] -> per-tensor scaling
-  // 2.
-  //    inputs[numel], amax[numel/num_cols] -> per-row / per-channel scaling
-  // 3.
-  //    inputs[numel/bs, bs], amax[numel/bs, 1] -> blockwise scaling
-  if (amax.numel() == 1) {
-    // case 1.
-    per_block_scaling_factor = false;
-  } else if (amax.numel() > 1 && (amax_ndim > 1 && (amax.size(-1) == amax.numel()))) {
-    // case 2.
-    per_block_scaling_factor = true;
-    blocksize = numel / amax.numel();
-  } else {
-    throw std::runtime_error("invalid combination of inputs and amax shapes/sizes");
-  }
+  auto scale = 448.f / amax;
+  auto inv_scale = 1.f / scale;
 
-  auto stream = c10::cuda::getCurrentCUDAStream();
-
-  AT_DISPATCH_FLOATING_TYPES(inputs.type().scalarType(), "fused_fake_e4m3fy_cuda", [&] {
-    fused_fake_e4m3fy_kernel<<<numel / (BLOCK_SIZE * 4) + 1, BLOCK_SIZE, 0, stream>>>(
-        inputs.data_ptr<scalar_t>(), numel, amax.data_ptr<float>(), per_block_scaling_factor,
-        blocksize, zero_threshold, outputs.data_ptr<scalar_t>());
+  AT_DISPATCH_FLOATING_TYPES(inputs.type().scalarType(), "fake_e4m3fy_with_axis_cuda", [&] {
+    fake_e4m3fy_with_axis_cuda_kernel<<<numel / (BLOCK_SIZE * 4) + 1, BLOCK_SIZE>>>(
+        inputs.data_ptr<scalar_t>(), numel, scale.data_ptr<float>(), inv_scale.data_ptr<float>(),
+        axis_size, outer_size, outputs.data_ptr<scalar_t>());
   });
+
   return outputs;
 }
 
-at::Tensor fake_e4m3fy_cuda(at::Tensor inputs) {
+at::Tensor fake_e4m3fy_cuda(at::Tensor inputs, at::Tensor amax) {
   size_t numel = inputs.numel();
+  auto scale = 448.f / amax;
+  auto inv_scale = 1.f / scale;
   auto outputs = torch::empty_like(inputs);
   auto stream = c10::cuda::getCurrentCUDAStream();
   AT_DISPATCH_FLOATING_TYPES(inputs.type().scalarType(), "fake_e4m3fy_cuda", [&] {
     fake_e4m3fy_kernel<<<numel / (BLOCK_SIZE * 4) + 1, BLOCK_SIZE, 0, stream>>>(
-        inputs.data_ptr<scalar_t>(), numel, outputs.data_ptr<scalar_t>());
+        inputs.data_ptr<scalar_t>(), numel, scale.data_ptr<float>(), inv_scale.data_ptr<float>(),
+        outputs.data_ptr<scalar_t>());
   });
   return outputs;
 }
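
The per-axis kernel recovers each element's channel index from its flat offset as (idx / stride(axis)) % size(axis), which is why the C++ wrapper calls .contiguous() and passes inputs.stride(axis) as outer_size. A small Python check of that index arithmetic (illustration only, for a contiguous tensor):

import numpy as np
import torch

x = torch.arange(2 * 3 * 4).reshape(2, 3, 4)  # contiguous, strides (12, 4, 1)
axis = 1
axis_size = x.size(axis)     # 3
outer_size = x.stride(axis)  # 4

for idx in range(x.numel()):
    axis_id = (idx // outer_size) % axis_size  # same arithmetic as the kernel
    assert axis_id == np.unravel_index(idx, tuple(x.shape))[axis]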

modelopt/torch/quantization/tensor_quant.py

Lines changed: 12 additions & 34 deletions
@@ -43,9 +43,8 @@
 
 
 def scaled_e4m3_impl(
-    inputs: torch.Tensor,  # TODO: check support for multiple inputs
-    amax: torch.Tensor,
-    disable_fused_kernel=True,
+    inputs: torch.Tensor,
+    amax: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """Implementation of fake quantizing input to FP8.
 
@@ -58,41 +57,20 @@ def scaled_e4m3_impl(
     """
     cuda_ext_fp8 = get_cuda_ext_fp8(raise_if_failed=True)
 
-    def is_fusable():
-        # ignore no scaling and shape([]) cases
-        if amax is None or len(amax.shape) == 0:
-            return False
-        else:
-            # can't have amax.shape = [1, 1, 4, 1] and the like
-            amax_last_dim_only = amax.numel() == amax.shape[-1]
-            # must be cuda
-            all_cuda = inputs.is_cuda and amax.is_cuda
-
-            # also check explicit disable.
-            return amax_last_dim_only and all_cuda and (not disable_fused_kernel)
-
     with torch.cuda.device(
         None if inputs.device.index == torch.cuda.current_device() else inputs.device.index
     ):
-        # differentiate between fused & unfused cases
-        if is_fusable():
-            zero_threshold = 1.0 / (1 << 24)
-            outputs = cuda_ext_fp8.fused_fake_e4m3fy(inputs, amax.float(), zero_threshold)
+        if amax is None:
+            amax = torch.tensor(448.0, device=inputs.device, dtype=inputs.dtype)
+        if amax.numel() == 1:
+            outputs = cuda_ext_fp8.fake_e4m3fy(inputs, amax)
         else:
-            zero_mask = inputs.abs() < 1.0 / (1 << 24)
-
-            if amax is None:
-                outputs = cuda_ext_fp8.fake_e4m3fy(inputs)
-            else:
-                scale = 448.0 / amax
-                outputs = cuda_ext_fp8.fake_e4m3fy(inputs * scale) / scale
-
-            # Zero out values that are tiny.
-            # Tiny values could lead to tiny amax and then large scale which cause overflow/saturation
-            # and won't go back to normal value after dividing by scale. The right behavior is to mark them
-            # as zero which also get rid of inf/nan
-            outputs[zero_mask] = 0.0
-
+            if amax.squeeze().ndim > 1:
+                raise NotImplementedError(
+                    "Fused E4M3 kernel does not support multiaxis quantization."
+                )
+            axis = amax.shape.index(amax.numel())
+            outputs = cuda_ext_fp8.fake_e4m3fy_with_axis(inputs, amax.squeeze(), axis)
     return outputs
 
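The new dispatch selects the kernel from the amax shape: a single-element amax uses the per-tensor path, while an amax with exactly one non-unit dimension (e.g. shape (1, C, 1)) maps to fake_e4m3fy_with_axis with axis = amax.shape.index(amax.numel()). A standalone sketch of that shape-to-axis logic (pick_axis is a hypothetical helper mirroring the branch above, not part of the module):

import torch

def pick_axis(amax: torch.Tensor) -> int | None:
    """Return None for a per-tensor amax, else the single non-unit axis."""
    if amax.numel() == 1:
        return None
    if amax.squeeze().ndim > 1:
        raise NotImplementedError("multi-axis amax is not supported")
    return amax.shape.index(amax.numel())

x = torch.randn(8, 16, 4)
assert pick_axis(x.abs().amax()) is None                       # per-tensor
assert pick_axis(x.abs().amax(dim=(0, 2), keepdim=True)) == 1  # shape (1, 16, 1) -> axis 1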

tests/gpu/torch/quantization/test_tensor_quant_cuda.py

Lines changed: 9 additions & 15 deletions
@@ -27,7 +27,7 @@
 import modelopt.torch.quantization.triton as triton_kernel
 import modelopt.torch.quantization.utils as quant_utils
 from modelopt.torch.quantization import tensor_quant
-from modelopt.torch.quantization.extensions import get_cuda_ext, get_cuda_ext_fp8, get_cuda_ext_mx
+from modelopt.torch.quantization.extensions import get_cuda_ext, get_cuda_ext_mx
 from modelopt.torch.quantization.tensor_quant import mx_format_map
 
 
@@ -187,20 +187,14 @@ def test_non_current_gpu(self, need_2_gpus):
         quant_x = tensor_quant.scaled_e4m3(x, x.amax(), None, 4, 3)
         assert torch.allclose(quant_x, quant_x_ref.cuda(device))
 
-    def test_fused_e4m3_kernel(self):
-        cuda_ext_fp8 = get_cuda_ext_fp8()
-        x = torch.tensor(TestScaledE4M3.x).cuda()
-        xq_ref = torch.tensor(TestScaledE4M3.xq_scaled).cuda()
-        amax = torch.ones(1, x.shape[-1]).cuda() * x.abs().amax()
-        e4m3_x = cuda_ext_fp8.fused_fake_e4m3fy(x, amax.float(), 1.0 / (1 << 24))
-        assert torch.allclose(e4m3_x, xq_ref, atol=1e-4, rtol=1e-4)
-
-    def test_e4m3_kernel_non_last_axis(self):
-        x = torch.tensor(TestScaledE4M3.x).cuda()
-        xq_ref = torch.tensor(TestScaledE4M3.xq_scaled).cuda()
-        amax = torch.ones(x.shape[0], 1).cuda() * x.abs().amax()
-        e4m3_x = tensor_quant.scaled_e4m3(x, amax, None, 4, 3)
-        assert torch.allclose(e4m3_x, xq_ref, atol=1e-4, rtol=1e-4)
+    @pytest.mark.parametrize("axis", [0, 1, 2])
+    def test_e4m3_per_channel(self, axis):
+        x = torch.randn(4, 4, 4, dtype=torch.float32).cuda()
+        amax = x.abs().amax(dim=[ax for ax in range(x.ndim) if ax != axis], keepdim=True)
+        scale = 448.0 / amax
+        xq_ref = tensor_quant.scaled_e4m3(x * scale, None, None, 4, 3) / scale
+        xq_test = tensor_quant.scaled_e4m3(x, amax.float(), None, 4, 3)
+        assert torch.allclose(xq_test, xq_ref)
 
 
 class Testfp4:
