
Commit cf156c6

Remove the clamp op when we do symmetric quantization on a tensor (#9465)
1 parent 52569ec · commit cf156c6

File tree: 3 files changed, 7 additions and 10 deletions


test/quantized_ops/test_quantized_matmul.py

Lines changed: 5 additions & 4 deletions

@@ -123,7 +123,7 @@ def test_q_linear_module_per_channel(self, quantize_activation):
     x = x.to(device)
     out_quant_xla = m(x)
     self.assertTrue(torch.allclose(out_fp, out_quant, atol=0.01))
-    self.assertTrue(torch.allclose(out_quant_xla.cpu(), out_quant))
+    self.assertTrue(torch.allclose(out_quant_xla.cpu(), out_quant, atol=2e-3))

   @parameterized.parameters([False, True])
   def test_q_linear_module_dynamo(self, quantize_activation):
@@ -139,7 +139,8 @@ def test_q_linear_module_dynamo(self, quantize_activation):
     m_dynamo = torch.compile(m, backend="openxla")
     out_quant_dynamo = m_dynamo(x.to(device))
     self.assertTrue(torch.allclose(out_fp, out_quant, atol=0.02))
-    self.assertTrue(torch.allclose(out_quant_dynamo.cpu(), out_quant))
+    self.assertTrue(
+        torch.allclose(out_quant_dynamo.cpu(), out_quant, atol=4e-3))

   @parameterized.parameters([False, True])
   def test_q_linear_hlo(self, quantize_activation):
@@ -240,7 +241,7 @@ def test_blockwise_linear_module(self):
     x = x.to(device)
     out_quant_xla = m(x)
     self.assertGreater(
-        self._calc_cosine_dist(out_quant_xla.cpu(), out_quant), 0.999999)
+        self._calc_cosine_dist(out_quant_xla.cpu(), out_quant), 0.99999)

   @parameterized.parameters([False, True])
   def test_asymmetric_per_channel(self, quantize_activation):
@@ -263,7 +264,7 @@ def test_asymmetric_per_channel(self, quantize_activation):
     x = x.to(device)
     out_quant_xla = m(x)
     self.assertGreater(
-        self._calc_cosine_dist(out_quant_xla.cpu(), out_quant), 0.999999)
+        self._calc_cosine_dist(out_quant_xla.cpu(), out_quant), 0.99999)

   def test_asymmetric_blockwise(self):
     for n_bit in [8]:
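The two cosine-similarity thresholds above are loosened from 0.999999 to 0.99999, and the `torch.allclose` checks gain explicit absolute tolerances. For context, here is a minimal sketch of such a similarity check, assuming `_calc_cosine_dist` computes the cosine similarity of the two flattened outputs (the helper's definition is not part of this diff, so this is only an illustration):

```python
import torch

def calc_cosine_dist(a: torch.Tensor, b: torch.Tensor) -> float:
  # Flatten both tensors and compute their cosine similarity; values close
  # to 1.0 mean the two outputs are nearly parallel.
  a = a.flatten().float()
  b = b.flatten().float()
  return torch.nn.functional.cosine_similarity(a, b, dim=0).item()

# Mirrors the assertion pattern in the tests above (hypothetical tensors).
out_ref = torch.randn(4, 8)
out_xla = out_ref + 1e-4 * torch.randn(4, 8)  # small numeric deviation
assert calc_cosine_dist(out_xla, out_ref) > 0.99999
```

A threshold of 0.99999 still tolerates only a tiny angular deviation between the XLA output and the reference output.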

torch_xla/experimental/pallas_kernels/quantized_matmul_kernel.py

Lines changed: 1 addition & 4 deletions

@@ -12,15 +12,12 @@ def _quantize_array(
     x_abs_max_val: jax.Array,  # [1, bs_block_size]
 ):
   n_bits = 8
-  int_min = -2**(n_bits - 1)
   int_max = 2**(n_bits - 1) - 1
   scale = (x_abs_max_val / int_max).T  # [bs_block_size, 1]
   # Need to explicitly cast to f32 because Mosaic can't directly jnp.round a
   # bf16 array.
   # It seems x/0 in Pallas generates inf/-inf instead of an exception.
-  x_int = jnp.clip(
-      jnp.round((x / scale).astype(jnp.float32)), int_min,
-      int_max).astype(jnp.int8)
+  x_int = jnp.round((x / scale).astype(jnp.float32)).astype(jnp.int8)
   return x_int, scale.astype(x.dtype)
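The kernel drops the `jnp.clip` because, with a symmetric scale of abs-max / int_max, the rounded values cannot leave the signed 8-bit range; `int_min` was only used by the clip, which is why that constant is deleted as well. Below is a minimal standalone JAX sketch of that argument (not the Pallas kernel itself, which receives the per-block abs-max as an input; it assumes the abs-max is nonzero):

```python
import jax
import jax.numpy as jnp

def symmetric_quantize(x: jax.Array, n_bits: int = 8):
  int_max = 2**(n_bits - 1) - 1  # 127 for int8
  abs_max = jnp.max(jnp.abs(x))  # assumed nonzero
  scale = abs_max / int_max
  # |x| <= abs_max implies |x / scale| <= int_max, so the rounded values are
  # already inside [-int_max, int_max] and no jnp.clip is required.
  x_int = jnp.round(x / scale).astype(jnp.int8)
  return x_int, scale

x = jax.random.normal(jax.random.PRNGKey(0), (4, 128), dtype=jnp.float32)
x_int, scale = symmetric_quantize(x)
assert int(jnp.max(jnp.abs(x_int.astype(jnp.int32)))) <= 127
```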

torch_xla/experimental/xla_quantized_matmul.py

Lines changed: 1 addition & 2 deletions

@@ -67,10 +67,9 @@ def _quantize_tensor(x: torch.Tensor, n_bits: int = 8, dim: int = -1):
     torch.Tensor: The scaling factor used for quantization. (Same dtype as x)
   """
   max_val = torch.amax(torch.abs(x), dim=dim, keepdim=True)
-  int_min = -2**(n_bits - 1)
   int_max = 2**(n_bits - 1) - 1
   scale = max_val / int_max
-  x_int = torch.clamp(torch.round(x / scale), int_min, int_max).to(torch.int8)
+  x_int = torch.round(x / scale).to(torch.int8)
   return x_int, scale.to(x.dtype)
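The same reasoning applies to the PyTorch path: `max_val / int_max` already bounds the rounded values per channel, so `torch.clamp` and the now-unused `int_min` can go. A hedged sanity-check sketch; the `quantize_tensor` helper and the random weight below are illustrative, not part of the commit:

```python
import torch

def quantize_tensor(x: torch.Tensor, n_bits: int = 8, dim: int = -1):
  # Symmetric per-channel quantization along `dim`, mirroring the updated
  # _quantize_tensor above (sketch only; argument handling simplified).
  int_max = 2**(n_bits - 1) - 1
  max_val = torch.amax(torch.abs(x), dim=dim, keepdim=True)
  scale = max_val / int_max
  # |x / scale| <= int_max by construction, so rounding cannot leave the
  # signed n-bit range and the previous torch.clamp was a no-op.
  x_int = torch.round(x / scale).to(torch.int8)
  return x_int, scale.to(x.dtype)

w = torch.randn(16, 32)
w_int, w_scale = quantize_tensor(w)
assert w_int.abs().max().item() <= 127
# Dequantized values stay within one scale step of the original weights.
assert torch.allclose(w_int.float() * w_scale, w, atol=w_scale.max().item())
```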
