 
 @impl(m, "quantize_per_tensor")
 def quantize_per_tensor(
-    input: torch.Tensor,
+    input_tensor: torch.Tensor,
     scale: float,
     zero_point: int,
     quant_min: int,
@@ -35,10 +35,10 @@ def quantize_per_tensor(
     Quantizes a floating-point tensor to an integral tensor.
 
     Args:
-        - input (Tensor): input tensor
-        - scale (float): Quantization scale. Derived from the ratio
+        - input_tensor (Tensor): input tensor
+        - scale (float): Inverse of quantization scale. Derived from the ratio
             between the min/max of the floating-point tensor and the
-            min/max of the quantized range.
+            min/max of the quantized range, and then inverted.
         - zero_point (int): The point which represents 0 in the quantized
             range. For example, consider the floating point range [-1., 2.] and
             quantized integer range [-7, 7]. In this case, 0 is 1/3 of way from
@@ -61,7 +61,11 @@ def quantize_per_tensor(
         raise ValueError(
             f"Unsupported dtype to quantize to. Supported dtypes must be one of {supported_quant_types}"
         )
-    return torch.round(input / scale + zero_point).to(dtype)
+    tmp = torch.round(input_tensor * scale + zero_point).to(dtype)
+    return torch.max(
+        torch.min(tmp, torch.tensor(torch.iinfo(dtype).max)),
+        torch.tensor(torch.iinfo(dtype).min),
+    )
 
 
 @impl(m, "dequantize_per_tensor")
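For context, here is a minimal standalone sketch (not the registered `@impl` op; the helper name, example values, and assumed zero point of -2 are illustrative) of what the patched `quantize_per_tensor` computes: the `scale` argument is now treated as the inverse of the quantization scale, and the rounded result is saturated to the integer range of `dtype`. The worked numbers reuse the docstring's example range.

```python
import torch


def quantize_per_tensor_sketch(input_tensor, scale_inv, zero_point, dtype):
    # scale_inv is the *inverse* of the quantization scale, matching the
    # patched op: multiply instead of divide, then saturate to dtype's range.
    tmp = torch.round(input_tensor * scale_inv + zero_point).to(dtype)
    info = torch.iinfo(dtype)
    return torch.max(
        torch.min(tmp, torch.tensor(info.max, dtype=dtype)),
        torch.tensor(info.min, dtype=dtype),
    )


# Docstring example: float range [-1., 2.] mapped to the integer range [-7, 7].
# Quantization scale = (2.0 - (-1.0)) / (7 - (-7)) = 3/14, so its inverse is 14/3;
# 0.0 sits a third of the way into the float range, i.e. near -2.33 in [-7, 7],
# so assume zero_point = -2.
x = torch.tensor([-1.0, 0.0, 2.0])
print(quantize_per_tensor_sketch(x, 14.0 / 3.0, -2, torch.int8))
# -> tensor([-7, -2,  7], dtype=torch.int8)
```

Taking the reciprocal as an argument lets callers precompute a single division instead of dividing per element, which is the pattern the call sites below adopt.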
@@ -173,9 +177,11 @@ def quantized_add(
     dequant_X = X_scale * (X - X_zero_point)
     dequant_Y = Y_scale * (Y - Y_zero_point)
 
+    out_scale_inv = 1 / out_scale
+
     # q_min/q_max are unused args
     return quantize_per_tensor(
-        dequant_X + dequant_Y, out_scale, out_zero_point, -128, 127, dtype
+        dequant_X + dequant_Y, out_scale_inv, out_zero_point, -128, 127, dtype
     )
 
 
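The call-site change follows from the new contract: `quantize_per_tensor` now multiplies by its `scale` argument, so a caller holding the real output scale passes its reciprocal. A small sketch with made-up numbers showing the two forms agree after rounding:

```python
import torch

# Hypothetical values, only to illustrate the call-site change in quantized_add:
# the requantization x / out_scale is now written as x * (1 / out_scale).
out_scale = 0.05
out_scale_inv = 1 / out_scale

x = torch.tensor([0.1, -0.3, 1.25])
print(torch.round(x / out_scale))      # what the old call computed
print(torch.round(x * out_scale_inv))  # what the patched call computes
# both: tensor([ 2., -6., 25.])
```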
@@ -206,6 +212,7 @@ def quantized_linear(
         - offset (Tensor): The offset tensor
     """
     out_scale = -out_multiplier * (1 / (1 << 31)) * (2 ** out_shift[0])
+    out_scale_inv = 1 / out_scale
 
     N, K = weight.shape
 
@@ -228,7 +235,7 @@ def quantized_linear(
                 src[m] - in_zero_point, weight[n] - weight_zero_point
             )
             out[m][n] = quantize_per_tensor(
-                dot, out_scale, out_zero_point, -128, 127, torch.int8
+                dot, out_scale_inv, out_zero_point, -128, 127, torch.int8
             )
 
     return out.reshape(*leading_dims, N)
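`quantized_linear` derives its requantization scale from a 32-bit fixed-point multiplier and a shift before inverting it. A sketch mirroring the formula in the hunk above, with illustrative (not real) multiplier and shift values:

```python
import torch

# Illustrative values only: a 32-bit fixed-point multiplier and a shift, as in
# the quantized_linear hunk above. out_scale reconstructs the floating-point
# requantization factor; its reciprocal is what quantize_per_tensor now expects.
out_multiplier = torch.tensor([-1431655765])  # roughly -(2/3) * 2**31
out_shift = torch.tensor([0])

out_scale = -out_multiplier * (1 / (1 << 31)) * (2 ** out_shift[0])
out_scale_inv = 1 / out_scale

print(out_scale)      # ~0.6667
print(out_scale_inv)  # ~1.5
```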