
Commit ee03094

more cleanups; make amax float32 by default
Signed-off-by: realAsma <[email protected]>
1 parent c80416a commit ee03094

File tree

7 files changed: +67 −367 lines


modelopt/torch/quantization/nn/modules/tensor_quantizer.py

Lines changed: 8 additions & 11 deletions
@@ -241,8 +241,8 @@ def amax(self, value):
 
         if not isinstance(value, torch.Tensor):
             value = torch.tensor(value)
-
-        if not hasattr(self, "_amax"):
+        value = value.to(torch.float32)
+        if not hasattr(self, "_amax") or self._amax.dtype != torch.float32:
             self.register_buffer("_amax", value.clone().detach())
         else:
             if self._amax.shape != value.shape:
@@ -265,7 +265,7 @@ def reset_bias(self):
     @property
     def step_size(self):
         """Return step size for integer quantization."""
-        if not hasattr(self, "_amax"):
+        if self.amax is None:
             warnings.warn("step_size is undefined under dynamic amax mode!")
             return None
         assert isinstance(self._num_bits, int), (
@@ -516,10 +516,7 @@ def load_calib_amax(self, *args, **kwargs):
                 err_msg
                 + " Passing 'strict=False' to `load_calib_amax()` will ignore the error."
             )
-        if not hasattr(self, "_amax"):
-            self.register_buffer("_amax", calib_amax.clone().detach())
-        else:
-            self._amax.data.copy_(calib_amax.clone().detach())
+        self.amax = calib_amax
 
     def load_calib_bias(self, *args, **kwargs):
         """Load affine bias for quantization."""
@@ -537,8 +534,10 @@ def load_calib_bias(self, *args, **kwargs):
 
     def _get_amax(self, inputs):
         """Get amax from buffer or compute it dynamically."""
-        if hasattr(self, "_amax"):
-            amax = self._amax
+        if self.amax is not None:
+            amax = self.amax
+            if amax.dtype != torch.float32:
+                self.amax = amax.to(torch.float32)
         else:
             reduce_axis = quant_utils.convert_quantization_axis_to_reduce_axis(inputs, self._axis)
             amax = quant_utils.reduce_amax(inputs, axis=reduce_axis, keepdims=True).detach()
@@ -988,8 +987,6 @@ def _short_amax(self, fmt=".4f"):
             return "None"
         if not hasattr(self, "_amax"):
             return "dynamic"
-        if self._amax is None:
-            return "None"
         if self._amax.is_meta:
             return "meta"
         if self._amax.numel() == 1:
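
The net effect of these hunks is that amax is stored and served as a float32 buffer, even when calibration produced a half-precision tensor. A minimal standalone sketch of that setter contract (a toy class mirroring the diff above, not the actual TensorQuantizer):

import torch

class _AmaxHolder:
    """Toy stand-in that mirrors the amax setter semantics from the diff above."""

    def __init__(self):
        self._amax = None

    @property
    def amax(self):
        return self._amax

    @amax.setter
    def amax(self, value):
        if not isinstance(value, torch.Tensor):
            value = torch.tensor(value)
        value = value.to(torch.float32)  # amax is float32 by default
        self._amax = value.clone().detach()

q = _AmaxHolder()
q.amax = torch.tensor([0.5], dtype=torch.float16)
assert q.amax.dtype == torch.float32  # stored as float32 regardless of the input dtype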

modelopt/torch/quantization/src/tensor_quant_fp8.cpp

Lines changed: 12 additions & 7 deletions
@@ -18,19 +18,24 @@
 #include <ATen/ATen.h>
 #include <cuda_fp8.h>
 #include <torch/extension.h>
+#include <optional>
 
-at::Tensor fake_e4m3fy_cuda(at::Tensor inputs, at::Tensor amax);
+at::Tensor fake_e4m3fy_cuda(at::Tensor inputs, std::optional<at::Tensor> amax);
 at::Tensor fake_e4m3fy_cuda_with_axis(at::Tensor inputs, at::Tensor amax, int axis);
 
-at::Tensor fake_e4m3fy(at::Tensor inputs, at::Tensor amax) {
-  TORCH_CHECK(amax.numel(), 1);
+at::Tensor fake_e4m3fy(at::Tensor inputs, std::optional<at::Tensor> amax) {
   inputs = inputs.contiguous();
-  auto amax_view = amax.view(-1).to(at::kFloat);
+  if (amax.has_value()) {
+    amax = amax.value().view(-1).to(at::kFloat);
+  }
   if (inputs.is_cuda()) {
-    return fake_e4m3fy_cuda(inputs, amax_view);
+    return fake_e4m3fy_cuda(inputs, amax);
   } else {
     TORCH_CHECK(inputs.dtype() == at::ScalarType::Float);
-    float scale = 448.f / amax_view[0].item<float>();
+    float scale = 1.f;
+    if (amax.has_value()) {
+      scale = 448.f / amax.value()[0].item<float>();
+    }
     float inv_scale = 1.f / scale;
     auto out = at::zeros_like(inputs);
     for (int i = 0; i < inputs.numel(); ++i) {
@@ -49,7 +54,7 @@ at::Tensor fake_e4m3fy_with_axis(at::Tensor inputs, at::Tensor amax, int axis) {
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("fake_e4m3fy", &fake_e4m3fy, "Reduce precision to E4M3", py::arg("inputs"),
-        py::arg("amax"));
+        py::arg("amax") = py::none());
   m.def("fake_e4m3fy_with_axis", &fake_e4m3fy_with_axis, "Reduce precision to E4M3 (fused)",
         py::arg("inputs"), py::arg("amax"), py::arg("axis"));
 }
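
With py::arg("amax") = py::none(), the binding can now be called without an amax, and the kernel falls back to scale = 1, i.e. a plain E4M3 round-trip. For reference, a rough CPU-side emulation in plain PyTorch (an approximation built on torch.float8_e4m3fn, PyTorch 2.1+, not the compiled extension itself):

from typing import Optional

import torch

def fake_e4m3fy_cpu_sketch(inputs: torch.Tensor, amax: Optional[torch.Tensor] = None) -> torch.Tensor:
    # With amax: map [-amax, amax] onto the E4M3 range via scale = 448 / amax.
    # Without amax: scale = 1, i.e. just round-trip through float8_e4m3fn.
    scale = 448.0 / amax.view(-1)[0].item() if amax is not None else 1.0
    return (inputs * scale).to(torch.float8_e4m3fn).to(torch.float32) / scale

x = torch.randn(8)
y_scaled = fake_e4m3fy_cpu_sketch(x, x.abs().amax())
y_unscaled = fake_e4m3fy_cpu_sketch(x)  # amax omitted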

modelopt/torch/quantization/src/tensor_quant_gpu_fp8.cu

Lines changed: 8 additions & 2 deletions
@@ -19,6 +19,7 @@
 #include <c10/cuda/CUDAStream.h>
 #include <cuda_fp8.h>
 #include <torch/extension.h>
+#include <optional>
 
 #define BLOCK_SIZE 128
 
@@ -80,9 +81,14 @@ at::Tensor fake_e4m3fy_cuda_with_axis(at::Tensor inputs, at::Tensor amax, int ax
   return outputs;
 }
 
-at::Tensor fake_e4m3fy_cuda(at::Tensor inputs, at::Tensor amax) {
+at::Tensor fake_e4m3fy_cuda(at::Tensor inputs, std::optional<at::Tensor> amax_opt) {
   size_t numel = inputs.numel();
-  auto scale = 448.f / amax;
+  at::Tensor scale;
+  if (amax_opt.has_value()) {
+    scale = 448.f / amax_opt.value();
+  } else {
+    scale = at::ones({1}, inputs.options().dtype(at::kFloat));
+  }
   auto inv_scale = 1.f / scale;
   auto outputs = torch::empty_like(inputs);
   auto stream = c10::cuda::getCurrentCUDAStream();
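
The CUDA wrapper mirrors the same fallback: 448.0 is the largest finite E4M3 magnitude, so scale = 448 / amax maps [-amax, amax] onto the representable range, and a missing amax degenerates to scale = 1. A small sketch of just that scale selection (a hypothetical helper matching the branch above):

from typing import Optional

import torch

def e4m3_scale(amax: Optional[torch.Tensor] = None) -> torch.Tensor:
    # 448.0 is the max finite E4M3 value; no amax means no rescaling.
    if amax is not None:
        return 448.0 / amax
    return torch.ones(1, dtype=torch.float32)

print(e4m3_scale(torch.tensor([2.0])))  # tensor([224.])
print(e4m3_scale())                     # tensor([1.])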

modelopt/torch/quantization/tensor_quant.py

Lines changed: 4 additions & 201 deletions
@@ -60,10 +60,7 @@ def scaled_e4m3_impl(
     with torch.cuda.device(
         None if inputs.device.index == torch.cuda.current_device() else inputs.device.index
     ):
-        if amax is None:
-            # This adds overhead; however this is not a common use case.
-            amax = torch.tensor(448.0, device=inputs.device, dtype=inputs.dtype)
-        if amax.numel() == 1:
+        if amax is None or amax.numel() == 1:
             outputs = cuda_ext_fp8.fake_e4m3fy(inputs, amax)
         else:
             if amax.squeeze().ndim > 1:
@@ -556,136 +553,6 @@ def backward(ctx, grad_outputs):
         return _fake_quant_backward_function(ctx, grad_outputs, num_args=9)
 
 
-class TensorQuantFunction(Function):
-    """A universal tensor quantization function.
-
-    Take an input tensor, output an quantized tensor. The granularity of scale can be interpreted from the
-    shape of amax.
-    output_dtype indicates whether the quantized value will be stored in integer or float. The reason we want to store
-    it in float is the pytorch function takes the quantized value may not accept integer input, e.g. Conv2D.
-
-    It uses 2^num_bits -1 values instead of 2^num_bits. e.g., for num_bits=8, it uses [-127, 127] instead of [-128, 127]
-    """
-
-    @staticmethod
-    @symbolic_helper.parse_args("v", "t", "t", "i", "b", "b", "s")
-    def symbolic(
-        g,
-        inputs,
-        amax,
-        bias=None,
-        num_bits=8,
-        unsigned=False,
-        narrow_range=True,
-        trt_high_precision_dtype=None,
-    ):
-        """ONNX symbolic function."""
-        from .export_onnx import export_int8
-
-        return export_int8(
-            g, inputs, amax, num_bits, unsigned, narrow_range, trt_high_precision_dtype
-        )
-
-    @staticmethod
-    def forward(
-        ctx,
-        inputs,
-        amax,
-        bias=None,
-        num_bits=8,
-        unsigned=False,
-        narrow_range=True,
-        trt_high_precision_dtype=None,
-    ):
-        """Forward method.
-
-        Follow tensorflow convention, max value is passed in and used to decide scale, instead of inputting scale
-        directly. Though inputting scale directly may be more natural to use.
-
-        Args:
-            ctx: A Context object to store tensors for backward.
-            inputs: A Tensor of type float32.
-            amax: A Tensor of type float32. Inputs will be quantized within range [-amax, amax]
-                amax will be broadcasted to inputs tensor.
-            num_bits: A integer used to calculate scaling factor, scale = (2^(num_bits-1) - 1) / max
-                Effectively, it indicates how many integer bits is used to represent the value. Default 8.
-            output_dtype: A type of Tensor. torch.int32 or torch.float32.
-            unsigned: A boolean. Use unsigned integer range. E.g. [0, 255] for num_bits=8. Default False.
-            narrow_range: A boolean. Use symmetric integer range for signed quantization
-                E.g. [-127,127] instead of [-128,127] for num_bits=8. Default True.
-
-        Returns:
-            outputs: A Tensor of type output_dtype.
-            scale: A Tensor of type float32. outputs / scale will dequantize outputs tensor.
-
-        Raises:
-            ValueError:
-        """
-        if bias is not None:
-            inputs = inputs - bias
-
-        ctx.save_for_backward(inputs, amax)
-
-        outputs, scale = _tensor_quant(inputs, amax, num_bits, unsigned, narrow_range)
-        # Check if scale overflows FP16
-        if outputs.dtype == torch.half and scale.max() > 65504:
-            raise ValueError(f"scale is too large for FP16 with amax={amax}")
-
-        if bias is not None:
-            outputs = outputs + bias
-
-        return outputs, scale.to(inputs.dtype)
-
-    @staticmethod
-    def backward(ctx, grad_outputs, grad_scale):
-        """Implements straight through estimation with clipping.
-
-        For -amax <= input <= amax the gradient passes straight through, otherwise the gradient is zero.
-
-        Args:
-            ctx: A Context object with saved tensors from forward.
-            grad_outputs: A tensor of gradient of outputs.
-            grad_scale: A tensor of gradient of scale.
-
-        Returns:
-            grad_inputs: A tensor of gradient.
-        """
-        inputs, amax = ctx.saved_tensors
-        zero = grad_outputs.new_zeros(1)  # create a zero tensor with the same type and device
-        grad_inputs = torch.where(inputs.abs() <= amax, grad_outputs, zero)
-        return grad_inputs, None, None, None, None, None, None
-
-
-class LegacyFakeTensorQuantFunction(Function):
-    """Fake version of TensorQuantFunction.
-
-    See comments of TensorQuantFunction, arguments are the same.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, amax, bias, num_bits=8, unsigned=False, narrow_range=True):
-        """Forward method."""
-        if bias is not None:
-            inputs = inputs - bias
-
-        ctx.save_for_backward(inputs, amax)
-
-        outputs, scale = _tensor_quant(inputs, amax, num_bits, unsigned, narrow_range)
-
-        if bias is not None:
-            outputs = outputs + bias
-
-        return outputs / scale.to(inputs.dtype)
-
-    @staticmethod
-    def backward(ctx, grad_outputs):
-        """Implements straight through estimation."""
-        inputs, amax = ctx.saved_tensors
-        zero = grad_outputs.new_zeros(1)
-        grad_inputs = torch.where(inputs.abs() <= amax, grad_outputs, zero)
-        return grad_inputs, None, None, None, None, None
-
-
 def _tensor_quant(inputs, amax, num_bits=8, unsigned=False, narrow_range=True):
     """Shared function body between TensorQuantFunction and FakeTensorQuantFunction."""
     # Fine scale, per channel scale will be handled by broadcasting, which could be tricky. Pop a warning.
@@ -694,10 +561,8 @@ def _tensor_quant(inputs, amax, num_bits=8, unsigned=False, narrow_range=True):
 
     # Computation can be done in FP32 to prevent potential over flow.
     input_dtype = inputs.dtype
-    if inputs.dtype == torch.half:
-        inputs = inputs.float()
-    if amax.dtype == torch.half:
-        amax = amax.float()
+    inputs = inputs.float()
+    amax = amax.float()
 
     min_amax = amax.min()
     if min_amax < 0:
@@ -724,72 +589,10 @@ def _tensor_quant(inputs, amax, num_bits=8, unsigned=False, narrow_range=True):
             1.0  # Return 1 makes more sense for values quantized to 0 with amax=0
         )
 
-    if input_dtype == torch.half:
-        outputs = outputs.half()
-
+    outputs = outputs.to(input_dtype)
     return outputs, scale
 
 
-class FakeAffineTensorQuantFunction(Function):
-    """Fake version of affine quantization.
-
-    gemmlowp style scale+shift quantization. See more details in
-    https://github.com/google/gemmlowp/blob/master/doc/quantization.md.
-
-    We DO NOT recommend affine quantization on weights for performance reason. There might be value to affine quantize
-    activation as it can be cancelled by bias and comes with no performance penalty. This functionality is only added
-    for experimental purpose.
-    """
-
-    @staticmethod
-    def forward(ctx, inputs, min_range, max_range, num_bits=8):
-        """As it will be only applied on activation with per tensor granularity, broadcast is not needed.
-
-        Args:
-            ctx: Pytorch convention.
-            inputs: A Tensor of type float32.
-            min_range: A float.
-            max_range: A float.
-            num_bits: An integer
-
-        Returns:
-            outputs: A Tensor of type output_dtype
-        """
-        ctx.save_for_backward(inputs, min_range, max_range)
-
-        step_size = (max_range - min_range) / (2.0**num_bits - 1)
-
-        min_bound = -(2.0 ** (num_bits - 1))
-        max_bound = 2.0 ** (num_bits - 1) - 1
-
-        quant_zero = torch.round(min_range / step_size) - min_bound
-        quantized = torch.round(inputs / step_size) - quant_zero
-        quantized = torch.clamp(quantized, min_bound, max_bound)
-
-        outputs = (quantized + quant_zero) * step_size
-
-        return outputs
-
-    @staticmethod
-    def backward(ctx, grad_outputs):
-        """Implements straight through estimation with clipping.
-
-        Args:
-            ctx: Pytorch convention.
-            grad_output: A tensor of gradient of outputs.
-
-        Returns:
-            grad_inputs: A tensor of gradient
-        """
-        inputs, min_range, max_range = ctx.saved_tensors
-        zero = grad_outputs.new_zeros(1)
-        grad_inputs = torch.where((inputs <= max_range) * (inputs >= min_range), grad_outputs, zero)
-        return grad_inputs, None, None, None
-
-
-tensor_quant = TensorQuantFunction.apply
-legacy_fake_tensor_quant = LegacyFakeTensorQuantFunction.apply
 fake_tensor_quant = FakeTensorQuantFunction.apply
-fake_affine_tensor_quant = FakeAffineTensorQuantFunction.apply
 scaled_e4m3 = ScaledE4M3Function.apply
 dynamic_block_quant = DynamicBlockQuantizationFunction.apply
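
With TensorQuantFunction, LegacyFakeTensorQuantFunction, and FakeAffineTensorQuantFunction gone, fake_tensor_quant remains the module-level entry point, and _tensor_quant now always computes in float32 and casts the result back to the caller's dtype. A simplified fake-quant sketch of that dtype pattern (a hypothetical helper illustrating the idea, not the actual _tensor_quant, which also returns the scale):

import torch

def _fake_int8_sketch(inputs: torch.Tensor, amax: torch.Tensor, num_bits: int = 8) -> torch.Tensor:
    # Compute in float32 regardless of half/bfloat16 inputs, then cast back.
    input_dtype = inputs.dtype
    inputs = inputs.float()
    amax = amax.float()
    bound = 2.0 ** (num_bits - 1) - 1.0  # narrow symmetric range, e.g. [-127, 127]
    scale = bound / amax
    outputs = torch.clamp(torch.round(inputs * scale), -bound, bound) / scale
    return outputs.to(input_dtype)

x = torch.randn(8, dtype=torch.float16)
y = _fake_int8_sketch(x, x.abs().amax())
assert y.dtype == torch.float16  # result is cast back to the input dtype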
