
Commit 97dc2ef

Use fp8_eager for cpu or if amax is None
1 parent ddbd66d commit 97dc2ef

File tree

4 files changed (+39, -81 lines):

modelopt/torch/quantization/extensions.py
modelopt/torch/quantization/src/tensor_quant_fp8.cpp (deleted)
modelopt/torch/quantization/src/tensor_quant_gpu_fp8.cu
modelopt/torch/quantization/tensor_quant.py


modelopt/torch/quantization/extensions.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ def get_cuda_ext_fp8(raise_if_failed: bool = False):
     if not hasattr(get_cuda_ext_fp8, "extension"):
         get_cuda_ext_fp8.extension = load_cpp_extension(  # type:ignore[attr-defined]
             name="modelopt_cuda_ext_fp8",
-            sources=[path / "src/tensor_quant_fp8.cpp", path / "src/tensor_quant_gpu_fp8.cu"],
+            sources=[path / "src/tensor_quant_gpu_fp8.cu"],
             cuda_version_specifiers=">=11.8",
             fail_msg=(
                 "CUDA extension for FP8 quantization could not be built and loaded, FP8 simulated"

modelopt/torch/quantization/src/tensor_quant_fp8.cpp

Lines changed: 0 additions & 60 deletions
This file was deleted.

modelopt/torch/quantization/src/tensor_quant_gpu_fp8.cu

Lines changed: 16 additions & 11 deletions
@@ -18,8 +18,8 @@
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDAStream.h>
 #include <cuda_fp8.h>
-#include <torch/extension.h>
 #include <optional>
+#include <torch/extension.h>
 
 #define BLOCK_SIZE 128
 
@@ -63,7 +63,9 @@ __global__ void fake_e4m3fy_with_axis_cuda_kernel(const T *inputs, size_t n, con
   }
 }
 
-at::Tensor fake_e4m3fy_cuda_with_axis(at::Tensor inputs, at::Tensor amax, int axis) {
+at::Tensor fake_e4m3fy_with_axis(at::Tensor inputs, at::Tensor amax, int axis) {
+  inputs = inputs.contiguous();
+  amax = amax.contiguous();
   auto outputs = torch::empty_like(inputs);
   size_t numel = inputs.numel();
   int axis_size = inputs.size(axis);
@@ -73,7 +75,7 @@ at::Tensor fake_e4m3fy_cuda_with_axis(at::Tensor inputs, at::Tensor amax, int ax
   auto inv_scale = 1.f / scale;
 
   auto stream = c10::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_FLOATING_TYPES(inputs.type().scalarType(), "fake_e4m3fy_with_axis_cuda", [&] {
+  AT_DISPATCH_FLOATING_TYPES(inputs.type().scalarType(), "fake_e4m3fy_with_axis", [&] {
     fake_e4m3fy_with_axis_cuda_kernel<<<numel / (BLOCK_SIZE * 4) + 1, BLOCK_SIZE, 0, stream>>>(
         inputs.data_ptr<scalar_t>(), numel, scale.data_ptr<float>(), inv_scale.data_ptr<float>(),
         axis_size, outer_size, outputs.data_ptr<scalar_t>());
@@ -82,21 +84,24 @@ at::Tensor fake_e4m3fy_cuda_with_axis(at::Tensor inputs, at::Tensor amax, int ax
   return outputs;
 }
 
-at::Tensor fake_e4m3fy_cuda(at::Tensor inputs, std::optional<at::Tensor> amax_opt) {
+at::Tensor fake_e4m3fy(at::Tensor inputs, at::Tensor amax) {
+  inputs = inputs.contiguous();
   size_t numel = inputs.numel();
-  at::Tensor scale;
-  if (amax_opt.has_value()) {
-    scale = 448.f / amax_opt.value();
-  } else {
-    scale = at::ones({1}, inputs.options().dtype(at::kFloat));
-  }
+  at::Tensor scale = 448.f / amax;
   auto inv_scale = 1.f / scale;
   auto outputs = torch::empty_like(inputs);
   auto stream = c10::cuda::getCurrentCUDAStream();
-  AT_DISPATCH_FLOATING_TYPES(inputs.type().scalarType(), "fake_e4m3fy_cuda", [&] {
+  AT_DISPATCH_FLOATING_TYPES(inputs.type().scalarType(), "fake_e4m3fy", [&] {
     fake_e4m3fy_kernel<<<numel / (BLOCK_SIZE * 4) + 1, BLOCK_SIZE, 0, stream>>>(
         inputs.data_ptr<scalar_t>(), numel, scale.data_ptr<float>(), inv_scale.data_ptr<float>(),
         outputs.data_ptr<scalar_t>());
   });
   return outputs;
 }
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("fake_e4m3fy", &fake_e4m3fy, "Reduce precision to E4M3", py::arg("inputs"),
+        py::arg("amax"));
+  m.def("fake_e4m3fy_with_axis", &fake_e4m3fy_with_axis, "Reduce precision to E4M3 (fused)",
+        py::arg("inputs"), py::arg("amax"), py::arg("axis"));
+}
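For readers without a CUDA toolchain, the following is roughly the math these two kernels implement, sketched with plain torch ops. The names fake_e4m3fy_reference and fake_e4m3fy_with_axis_reference are hypothetical reference helpers, not part of the extension; the real kernels do the same work in a single fused elementwise pass on the GPU, and 448 is the largest normal E4M3 value.

import torch


def fake_e4m3fy_reference(inputs: torch.Tensor, amax: torch.Tensor) -> torch.Tensor:
    """Per-tensor fake quantization: scale into E4M3 range, round-trip, scale back."""
    scale = 448.0 / amax.to(torch.float32)
    x = (inputs.to(torch.float32) * scale).to(torch.float8_e4m3fn)
    return (x.to(torch.float32) / scale).to(inputs.dtype)


def fake_e4m3fy_with_axis_reference(inputs: torch.Tensor, amax: torch.Tensor, axis: int) -> torch.Tensor:
    """Per-channel variant: broadcast one amax value per slice along `axis`."""
    shape = [1] * inputs.dim()
    shape[axis] = -1
    return fake_e4m3fy_reference(inputs, amax.reshape(shape))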

modelopt/torch/quantization/tensor_quant.py

Lines changed: 22 additions & 9 deletions
@@ -41,24 +41,32 @@
 
 DISABLE_TRITON_KERNEL = False
 
-def _fp8_eager(x, amax):
+
+def _fp8_eager(x, amax=None):
     dtype = x.dtype
-    x = x.to(torch.float32)
-    scale = 448.0 / (amax.to(torch.float32))
-    scale_inv = 1 / scale
-    x = (x*scale).to(torch.float8_e4m3fn).to(torch.float32)*scale_inv
+    if amax is not None:
+        scale = 448.0 / (amax.to(torch.float32))
+        scale_inv = 1 / scale
+        x = x.to(torch.float32) * scale
+    x = x.to(torch.float8_e4m3fn)
+    if amax is not None:
+        x = x.to(torch.float32) * scale_inv
     return x.to(dtype)
 
+
 @torch.compile(dynamic=True)
 def _fp8_triton(x, amax):
     return _fp8_eager(x, amax)
 
+
 def fp8_eager(x, amax):
+    """Eager mode implementation of FP8 quantization."""
     if triton_kernel.IS_AVAILABLE and not DISABLE_TRITON_KERNEL:
         return _fp8_triton(x, amax)
     else:
         return _fp8_eager(x, amax)
 
+
 def scaled_e4m3_impl(
     inputs: torch.Tensor,
     amax: torch.Tensor | None = None,
@@ -72,16 +80,21 @@ def scaled_e4m3_impl(
     Returns:
         Input tensors faked quantized to FP8.
     """
-    cuda_ext_fp8 = get_cuda_ext_fp8(raise_if_failed=True)
+    if inputs.is_cpu:
+        return fp8_eager(inputs, amax)
+
+    cuda_ext_fp8 = get_cuda_ext_fp8(raise_if_failed=False)
+    if cuda_ext_fp8 is None or amax is None:
+        return fp8_eager(inputs, amax)
 
     with torch.cuda.device(
         None if inputs.device.index == torch.cuda.current_device() else inputs.device.index
     ):
-        if amax is None or amax.numel() == 1:
+        if amax.numel() == 1:
             outputs = cuda_ext_fp8.fake_e4m3fy(inputs, amax)
         elif amax.squeeze().ndim == 1:
-            axis = amax.shape.index(amax.numel())
-            outputs = cuda_ext_fp8.fake_e4m3fy_with_axis(inputs, amax.squeeze(), axis)
+            axis = amax.shape.index(amax.numel())
+            outputs = cuda_ext_fp8.fake_e4m3fy_with_axis(inputs, amax.squeeze(), axis)
         else:
             outputs = fp8_eager(inputs, amax)
     return outputs
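With this change, scaled_e4m3_impl no longer requires the CUDA extension: CPU tensors, a missing amax, or a failed extension build all route through fp8_eager. A rough usage sketch is shown below; the tensor shapes and amax values are made up for illustration, and it assumes the remaining keyword arguments of scaled_e4m3_impl keep their defaults.

import torch
from modelopt.torch.quantization.tensor_quant import scaled_e4m3_impl

x_cpu = torch.randn(4, 16)                         # CPU input -> fp8_eager path
y_cpu = scaled_e4m3_impl(x_cpu, amax=x_cpu.abs().amax())

y_noamax = scaled_e4m3_impl(x_cpu, amax=None)      # no amax -> plain cast to E4M3 and back

if torch.cuda.is_available():
    x_gpu = torch.randn(4, 16, device="cuda")
    amax = x_gpu.abs().amax(dim=0, keepdim=True)   # shape (1, 16) -> per-axis CUDA kernel
    y_gpu = scaled_e4m3_impl(x_gpu, amax=amax)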
