Commit 5fe4ebd

Misc fixes to prepare for adding Float8Tensor (#2603)
Summary:
* Moved some float8-related util functions to torchao.float8.inference
* Renamed _choose_qparams_affine_float8 to _choose_scale_float8
* Added hp_value_lb and hp_value_ub to _choose_scale_float8
* Added `__all__` to torchao/core/config.py

Test Plan: pytest test/dtypes/test_affine_quantized_float.py -k test_choose_scale_float8_bounds
1 parent bdf4598 commit 5fe4ebd
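
For quick orientation, a minimal usage sketch of the renamed helper and the new bounds (not part of the commit; the bound values are arbitrary, and the call shapes follow the hunks in torchao/quantization/quant_primitives.py below):

    import torch
    from torchao.quantization.quant_primitives import (
        _choose_scale_float8,
        _quantize_affine_float8,
    )

    x = torch.randn(8, 64, dtype=torch.float32)

    # An empty block_size selects the tensorwise branch; the observed abs-max is
    # clamped into [hp_value_lb, hp_value_ub] before the scale is derived
    # (both bounds are optional keyword arguments).
    scale = _choose_scale_float8(
        x,
        block_size=(),
        float8_dtype=torch.float8_e4m3fn,
        hp_value_lb=1e-12,
        hp_value_ub=1200.0,
    )
    x_fp8 = _quantize_affine_float8(x, scale, torch.float8_e4m3fn)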

File tree: 7 files changed (+155, -82 lines)

test/dtypes/test_affine_quantized_float.py

Lines changed: 46 additions & 3 deletions
@@ -42,7 +42,7 @@
 )
 from torchao.quantization.quant_primitives import (
     MappingType,
-    _choose_qparams_affine_float8,
+    _choose_scale_float8,
     _dequantize_affine_float8,
     _quantize_affine_float8,
     choose_qparams_affine,
@@ -350,6 +350,49 @@ def test_mm_float8dq_per_row(
         error = compute_error(ref_output, quant_output)
         assert error > 20, f"Quantization error is too high got a SQNR of {error}"

+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(
+        not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
+    )
+    @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+    @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16])
+    def test_choose_scale_float8_bounds(self, float8_dtype, output_dtype):
+        block_size = ()
+        device = "cuda"
+        input_tensor = torch.randn(8, 64, device=device, dtype=torch.float32)
+
+        # testing upper bounds
+        input_tensor[0][0] = 2000
+        scale_ref = _choose_scale_float8(
+            input_tensor, float8_dtype=float8_dtype, block_size=block_size
+        )
+
+        hp_value_ub = 1200
+        scale_with_ub = _choose_scale_float8(
+            input_tensor,
+            float8_dtype=float8_dtype,
+            block_size=block_size,
+            hp_value_ub=hp_value_ub,
+        )
+        # since scale = abs_max / quant_max, larger abs_max means scale is larger
+        self.assertTrue(scale_ref > scale_with_ub)
+
+        # testing lower bound settings
+        # making sure that abs is on the scale of 1e-20, so hp_value_lb can take effect
+        input_tensor = torch.randn(8, 64, device=device, dtype=torch.float32) * 1e-20
+        scale_ref = _choose_scale_float8(
+            input_tensor, float8_dtype=float8_dtype, block_size=block_size
+        )
+        hp_value_lb = 1e-12
+        scale_with_lb = _choose_scale_float8(
+            input_tensor,
+            float8_dtype=float8_dtype,
+            block_size=block_size,
+            hp_value_lb=hp_value_lb,
+        )
+        # since scale = abs_max / quant_max, larger abs_max means scale is larger
+        self.assertTrue(scale_ref < scale_with_lb)
+
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
@@ -364,7 +407,7 @@ def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size):
         input_tensor = torch.randn(8, 64, device=device, dtype=torch.float32)

         # Choose quantization parameters
-        scale = _choose_qparams_affine_float8(
+        scale = _choose_scale_float8(
             input_tensor, float8_dtype=float8_dtype, block_size=block_size
         )

@@ -395,7 +438,7 @@ def test_dequantize_affine_float8_scale_broadcasting(self):
         block_size = (2, 16) # 2x2 blocks in first dim, 2x16 blocks in second dim

         # Choose quantization parameters
-        scale = _choose_qparams_affine_float8(
+        scale = _choose_scale_float8(
             input_tensor, float8_dtype=torch.float8_e4m3fn, block_size=block_size
         )

test/integration/test_integration.py

Lines changed: 1 addition & 1 deletion
@@ -2102,7 +2102,7 @@ def forward(self, x):
         ep = torch.export.export(model, (inp,))
         print(ep)
         FileCheck().check_count(
-            "torch.ops.torchao.choose_qparams_affine_float8.default", 1, exactly=True
+            "torch.ops.torchao.choose_scale_float8.default", 1, exactly=True
         ).run(str(ep.graph))

torchao/core/config.py

Lines changed: 8 additions & 0 deletions
@@ -12,6 +12,14 @@

 import torch

+__all__ = [
+    "AOBaseConfig",
+    "VersionMismatchError",
+    "config_from_dict",
+    "config_to_dict",
+    "ALLOWED_AO_MODULES",
+]
+

 class AOBaseConfig(abc.ABC):
     """

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 2 additions & 2 deletions
@@ -19,10 +19,10 @@
     MappingType,
     ZeroPointDomain,
     _choose_qparams_affine_dont_preserve_zero,
-    _choose_qparams_affine_float8,
     _choose_qparams_affine_floatx,
     _choose_qparams_affine_tinygemm,
     _choose_qparams_and_quantize_affine_hqq,
+    _choose_scale_float8,
     _dequantize_affine_float8,
     _dequantize_affine_floatx,
     _dequantize_affine_no_zero_point,
@@ -462,7 +462,7 @@ def from_hp_to_floatx(
         if target_dtype in FP8_TYPES:
             original_shape = input_float.shape
             input_float = _layout.pre_process(input_float)
-            scale = _choose_qparams_affine_float8(
+            scale = _choose_scale_float8(
                 input_float, float8_dtype=target_dtype, block_size=block_size
             )
             data = _quantize_affine_float8(input_float, scale, target_dtype)

torchao/dtypes/floatx/float8_layout.py

Lines changed: 2 additions & 68 deletions
@@ -20,8 +20,10 @@
 from torchao.float8.inference import (
     Float8MMConfig,
     _is_rowwise_scaled,
+    _slice_scale_for_dimension,
     addmm_float8_unwrapped_inference,
     preprocess_data,
+    preprocess_scale,
 )
 from torchao.utils import _is_float8_type, fill_defaults

@@ -299,56 +301,6 @@ def _(func, types, args, kwargs):
     )


-def _slice_scale_for_dimension(
-    scale: torch.Tensor,
-    data_shape: List[int],
-    dim: int,
-    start: int,
-    end: int,
-    step: int,
-) -> torch.Tensor:
-    """
-    Slice the scale tensor appropriately based on the data tensor slicing.
-
-    This function calculates how the scale should be sliced when the data tensor
-    is sliced along a given dimension, taking into account the block structure.
-    """
-    # Unsupported case for now, this would be 1 scale per data element
-    if scale.shape == data_shape:
-        return aten.slice.Tensor(scale, dim, start, end, step)
-
-    # Reconstruct block sizes based on data shape and scale shape
-    block_sizes = tuple(data_shape[i] // scale.shape[i] for i in range(len(data_shape)))
-
-    if dim >= len(block_sizes):
-        # Slicing beyond the dimensions we care about
-        return scale
-
-    block_size_for_dim = block_sizes[dim]
-
-    if block_size_for_dim == 1:
-        # Scale is per-element along this dimension
-        # Slice away as normal
-        return aten.slice.Tensor(scale, dim, start, end, step)
-    else:
-        # There is blocking in this dimension
-        # Calculate which scale elements correspond to the sliced data
-        scale_start = start // block_size_for_dim if start is not None else None
-        scale_end = (
-            (end + block_size_for_dim - 1) // block_size_for_dim
-            if end is not None
-            else None
-        )
-
-        # Error on Step > 1
-        if step > 1:
-            raise NotImplementedError(
-                "Slicing with step > 1 is not implemented for scale tensors."
-            )
-
-        return aten.slice.Tensor(scale, dim, scale_start, scale_end, 1)
-
-
 ##########################
 # Float8 Dispatch Kernels
 ##########################
@@ -370,24 +322,6 @@ def check_aqt(aqt: Union[torch.Tensor, AffineQuantizedTensor]) -> bool:
     return check_aqt(input_tensor) and check_aqt(weight_tensor)


-def preprocess_scale(input_scale: torch.Tensor, input_shape: Tuple[int, ...]):
-    """Ensures input tensor is correctly formatted for _scaled_mm"""
-
-    # For PerTensor quantization, scale should be a scalar or have shape [1]
-    if input_scale.numel() == 1:
-        # Already a scalar, ensure it has the right shape for _scaled_mm
-        return input_scale.reshape(1, 1)
-
-    # For per-row/block quantization, we need to handle the reshaping
-    input_scale = input_scale.unsqueeze(-1)
-
-    # Match: #input_data.reshape(-1, input_data.shape[-1])
-    if input_scale.dim() > 2:
-        input_scale = input_scale.reshape(-1, input_scale.shape[-1])
-
-    return input_scale
-
-
 def _linear_fp8_act_fp8_weight_impl(
     input_tensor: "AffineQuantizedTensor",
     weight_tensor: "AffineQuantizedTensor",
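
Note that the two helpers removed above are relocated, not deleted: they now live in torchao.float8.inference (next file), which is why this file's import block gains them. A sketch of the updated import path:

    from torchao.float8.inference import (
        _slice_scale_for_dimension,
        preprocess_scale,
    )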

torchao/float8/inference.py

Lines changed: 86 additions & 5 deletions
@@ -7,7 +7,7 @@
 Defines an nn module designed to be used during inference
 """

-from typing import NamedTuple, Optional, Tuple, Union
+from typing import List, NamedTuple, Optional, Tuple, Union

 import torch

@@ -67,6 +67,24 @@ def preprocess_data(
     return a_data, b_data


+def preprocess_scale(input_scale: torch.Tensor, input_shape: Tuple[int, ...]):
+    """Ensures input tensor is correctly formatted for _scaled_mm"""
+
+    # For PerTensor quantization, scale should be a scalar or have shape [1]
+    if input_scale.numel() == 1:
+        # Already a scalar, ensure it has the right shape for _scaled_mm
+        return input_scale.reshape(1, 1)
+
+    # For per-row/block quantization, we need to handle the reshaping
+    input_scale = input_scale.unsqueeze(-1)
+
+    # Match: #input_data.reshape(-1, input_data.shape[-1])
+    if input_scale.dim() > 2:
+        input_scale = input_scale.reshape(-1, input_scale.shape[-1])
+
+    return input_scale
+
+
 def addmm_float8_unwrapped_inference(
     a_data: Tensor,
     a_scale: Tensor,
@@ -107,12 +125,75 @@ def addmm_float8_unwrapped_inference(
     )


-def _is_rowwise_scaled(x) -> bool:
-    """Checks if an AQT tensor is rowwise scaled
+def _slice_scale_for_dimension(
+    scale: torch.Tensor,
+    data_shape: List[int],
+    dim: int,
+    start: int,
+    end: int,
+    step: int,
+) -> torch.Tensor:
+    """
+    Slice the scale tensor appropriately based on the data tensor slicing.
+    This function calculates how the scale should be sliced when the data tensor
+    is sliced along a given dimension, taking into account the block structure.
+    """
+    aten = torch.ops.aten
+
+    # Unsupported case for now, this would be 1 scale per data element
+    if scale.shape == data_shape:
+        return aten.slice.Tensor(scale, dim, start, end, step)
+
+    # Reconstruct block sizes based on data shape and scale shape
+    block_sizes = tuple(data_shape[i] // scale.shape[i] for i in range(len(data_shape)))
+
+    if dim >= len(block_sizes):
+        # Slicing beyond the dimensions we care about
+        return scale
+
+    block_size_for_dim = block_sizes[dim]
+
+    if block_size_for_dim == 1:
+        # Scale is per-element along this dimension
+        # Slice away as normal
+        return aten.slice.Tensor(scale, dim, start, end, step)
+    else:
+        # There is blocking in this dimension
+        # Calculate which scale elements correspond to the sliced data
+        scale_start = start // block_size_for_dim if start is not None else None
+        scale_end = (
+            (end + block_size_for_dim - 1) // block_size_for_dim
+            if end is not None
+            else None
+        )
+
+        # Error on Step > 1
+        if step > 1:
+            raise NotImplementedError(
+                "Slicing with step > 1 is not implemented for scale tensors."
+            )
+
+        return aten.slice.Tensor(scale, dim, scale_start, scale_end, 1)
+
+
+def _is_rowwise_scaled(x: torch.Tensor) -> bool:
+    """Checks if a quantized tensor is rowwise scaled
+    Args:
+        x: quantized tensor (should have `block_size` attribute)
+    """
+    assert hasattr(x, "block_size"), "Expecting input to have `block_size` attribute"
+    return tuple(x.block_size) == (1,) * (x.dim() - 1) + (x.shape[-1],)
+
+
+def _is_tensorwise_scaled(x: torch.Tensor) -> bool:
+    """Checks if a quantized tensor is tensorwise scaled
     Args:
-        x: AffineQuantizedTensor tensor
+        x: quantized tensor (should have `block_size` attribute)
     """
-    return x.block_size == (1,) * (x.dim() - 1) + (x.shape[-1],)
+    assert hasattr(x, "block_size"), "Expecting input to have `block_size` attribute"
+    return all(
+        x.block_size[i] == -1 or x.block_size[i] == x.shape[i] for i in range(x.ndim)
+    )


 def _normalize_granularity(
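
A small self-contained sketch of how the relocated helpers behave (not from the commit; it assumes the definitions added above are importable from torchao.float8.inference):

    import torch

    from torchao.float8.inference import (
        _slice_scale_for_dimension,
        preprocess_scale,
    )

    # Per-tensor scale: numel() == 1, so it is reshaped to (1, 1) for _scaled_mm.
    print(preprocess_scale(torch.tensor(0.5), (16, 32)).shape)   # torch.Size([1, 1])

    # Per-row scale for a (4, 8, 32) activation: unsqueeze(-1), then flatten leading dims.
    print(preprocess_scale(torch.rand(4, 8), (4, 8, 32)).shape)  # torch.Size([32, 1])

    # Block-wise scale: data (8, 64) with scale (4, 64) implies a block size of 2 along
    # dim 0, so slicing data rows 2:6 maps to scale rows 1:3.
    scale = torch.rand(4, 64)
    sliced = _slice_scale_for_dimension(scale, [8, 64], dim=0, start=2, end=6, step=1)
    print(sliced.shape)  # torch.Size([2, 64])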

torchao/quantization/quant_primitives.py

Lines changed: 10 additions & 3 deletions
@@ -36,7 +36,7 @@
     "_choose_qparams_affine_floatx",
     "_choose_qparams_and_quantize_affine_hqq",
     "_choose_qparams_and_quantize_affine_qqq",
-    "_choose_qparams_affine_float8",
+    "_choose_scale_float8",
     "_choose_qparams_gguf",
     "_quantize_affine_no_zero_point",
     "_quantize_affine_tinygemm",
@@ -2180,11 +2180,13 @@ def _dequantize_affine_floatx(


 @register_custom_op
-def _choose_qparams_affine_float8(
+def _choose_scale_float8(
     tensor: torch.Tensor,
     block_size: List[int],
     float8_dtype: torch.dtype = torch.float8_e4m3fn,
     scale_dtype: torch.dtype = torch.float32,
+    hp_value_lb: Optional[float] = None,
+    hp_value_ub: Optional[float] = None,
 ) -> torch.Tensor:
     """
     Calculates float8 scaling factor for the given high precision tensor, using tensorwise granularity.
@@ -2194,19 +2196,24 @@ def _choose_qparams_affine_float8(
         float8_dtype (torch.dtype): Data type of the quantized tensor (e.g., torch.float8_e4m3fn, torch.float8_e5m2).
         scale_dtype (torch.dtype): Data type of the scaling factor (e.g., torch.float32).
         block_size (Optional[Tuple[int, ...]]): Block size for block-wise quantization. If None, tensorwise quantization is used.
+        hp_value_lb (Optional[float]): the lower bound for the high precision value used to calculate the scale
+        hp_value_ub (Optional[float]): the upper bound for the high precision value used to calculate the scale
     """
     quant_max = torch.finfo(float8_dtype).max
     # only tensorwise scaling is supported for now:
     if len(block_size) == 0:
         max_abs = tensor.abs().max()
+        if hp_value_lb is not None or hp_value_ub is not None:
+            max_abs = torch.clamp(max_abs, min=hp_value_lb, max=hp_value_ub)
         scale = max_abs / quant_max
     else:
         shape_for_reduction, reduction_dims = _get_reduction_params(
             block_size, tensor.shape
         )
         tensor_reshaped = tensor.view(shape_for_reduction)
         max_abs = tensor_reshaped.abs().amax(dim=reduction_dims, keepdim=True)
-
+        if hp_value_lb is not None or hp_value_ub is not None:
+            max_abs = torch.clamp(max_abs, min=hp_value_lb, max=hp_value_ub)
         scale = max_abs / quant_max
         # Reshape scale back to match the expected output shape
         # The scale tensor should have the same shape as the input divided by block_size
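
For intuition about the new bounds (illustrative numbers, not from the commit): torch.float8_e4m3fn has quant_max = 448, so an outlier-driven abs-max of 2000 gives scale ~4.46, while clamping with hp_value_ub = 1200 gives scale ~2.68, which is the inequality asserted by test_choose_scale_float8_bounds above.

    import torch

    quant_max = torch.finfo(torch.float8_e4m3fn).max               # 448.0
    abs_max = torch.tensor(2000.0)
    scale_unclamped = abs_max / quant_max                          # ~4.46
    scale_clamped = torch.clamp(abs_max, max=1200.0) / quant_max   # ~2.68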
