@@ -878,10 +878,10 @@ def test_ragged_paged_attention_wrapper_without_dynamo(
         use_dynamo=False,
     )

+  # Compute the normalized Frobenius error.
   def _compute_rel_error(self, x, q_x):
-    return torch.mean(torch.sqrt(torch.mean(torch.square(q_x - x),
-                                            axis=1))) / torch.sqrt(
-                                                torch.mean(torch.square(x)))
+    abs_error = torch.sqrt(torch.mean(torch.square(q_x - x), axis=1))
+    return torch.mean(abs_error) / torch.sqrt(torch.mean(torch.square(x)))

   def _test_quantized_matmul_int8(
       self,
@@ -909,7 +909,9 @@ def _test_quantized_matmul_int8(
         qscheme=torch.per_channel_symmetric)
     w_int = torch.ops.quantized_decomposed.quantize_per_channel(
         w, scalar, zero_point, 0, int_min, int_max, torch.int8)
-    scalar = scalar.to(w.dtype)
+    # In an actual workload such as vLLM, the scalar is obtained
+    # offline and is usually in float32.
+    scalar = scalar.to(torch.float32)

     x_copy = x.clone()
     w_copy = w.clone()
@@ -942,7 +944,7 @@ def quantized_matmul_int8_wrapper(x, w_int, scalar, quantize_activation):
     rel_error = self._compute_rel_error(expected, actual)

     self.assertEqual(actual.shape, expected.shape)
-    self.assertEqual(actual.dtype, expected.dtype)
+    self.assertEqual(actual.dtype, x.dtype)
     self.assertTrue(rel_error < 3e-2)

   @parameterized.product(
@parameterized .product (
@@ -1020,6 +1022,28 @@ def test_quantized_matmul_int8_wrapper_key_not_exists_in_table(
         use_dynamo=use_dynamo,
     )

+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  @parameterized.product(
+      dtype=[torch.bfloat16, torch.float32],
+      use_dynamo=[True, False],
+  )
+  def test_quantized_matmul_int8_wrapper_fallback(self, dtype, use_dynamo):
+    x = torch.randn(10, 20, device='meta', dtype=dtype)
+    w = torch.randint(-128, 127, (30, 20), device='meta', dtype=torch.int8)
+    scalar = torch.randn(30, device='meta', dtype=torch.float32)
+    if use_dynamo:
+
+      def quantized_matmul_int8_wrapper(x, w_int, scalar, quantize_activation):
+        return torch.ops.xla.quantized_matmul_int8(
+            x, w_int, scalar, quantize_activation=quantize_activation)
+
+      quantized_matmul_int8 = torch.compile(
+          quantized_matmul_int8_wrapper, backend="openxla")
+    else:
+      quantized_matmul_int8 = torch.ops.xla.quantized_matmul_int8
+    res = quantized_matmul_int8(x, w, scalar, quantize_activation=True)
+    self.assertEqual(res.dtype, x.dtype)
+
   @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 4,
                    "This test only works on TPUv4+.")
   def test_paged_attention_multi_queries_wrapper(self):
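
A minimal standalone sketch of the normalized Frobenius error check that the updated _compute_rel_error performs; the shapes, the noise scale, the helper name, and the threshold usage below are illustrative only, not part of the change itself:

import torch

def compute_rel_error(x, q_x):
  # Per-row RMS of the quantization error, averaged over rows, then
  # normalized by the RMS of the reference tensor.
  abs_error = torch.sqrt(torch.mean(torch.square(q_x - x), axis=1))
  return torch.mean(abs_error) / torch.sqrt(torch.mean(torch.square(x)))

x = torch.randn(10, 20)
q_x = x + 0.01 * torch.randn(10, 20)  # simulate a small quantization error
assert compute_rel_error(x, q_x) < 3e-2  # same threshold the test asserts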