
Commit 9f98e37

jianyizh authored and pytorchmergebot committed
[Intel GPU] add tf32 support for matmul on XPU (pytorch#144240)
Support XPU TF32 matmul using torch.backends.mkldnn.allow_tf32; we will discuss in the future whether matmul needs its own control API. ~~Support xpu tf32 matmul using torch.set_float32_matmul_precision. For conv, check pytorch#137570. We decided not to follow torch.backends.cuda.matmul.allow_tf32, because that API actually calls setAllowTF32CuBLAS to set matmul_precision to "high". We also avoid other related TF32 changes (e.g. in inductor) by not introducing a new API.~~

Pull Request resolved: pytorch#144240
Approved by: https://github.com/EikanWang
1 parent ff039d3 commit 9f98e37
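In practice the change wires `torch.backends.mkldnn.allow_tf32` through to the oneDNN matmul primitive on XPU. A minimal sketch of the user-facing toggle, assuming an XPU build of PyTorch with an Intel GPU available (shapes and device string are illustrative):

    import torch

    # Assumption: this torch build has XPU support and an Intel GPU is present.
    a = torch.randn(512, 512, device="xpu")
    b = torch.randn(512, 512, device="xpu")

    torch.backends.mkldnn.allow_tf32 = True   # f32 matmul may use TF32 (fpmath_mode::tf32)
    c_tf32 = a @ b

    torch.backends.mkldnn.allow_tf32 = False  # f32 matmul stays strict FP32 (fpmath_mode::strict)
    c_fp32 = a @ b

    # TF32 keeps the FP32 exponent range but truncates the mantissa to 10 bits,
    # so a small elementwise difference between the two results is expected.
    print((c_tf32 - c_fp32).abs().max())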

File tree

2 files changed: 128 additions & 2 deletions

aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp

Lines changed: 6 additions & 1 deletion
@@ -194,7 +194,12 @@ sycl::event matmul(
   pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user);

   if (m1_dt == dnnl::memory::data_type::f32) {
-    pattr.set_fpmath_mode(dnnl::fpmath_mode::strict);
+    bool allow_tf32 = at::globalContext().allowTF32OneDNN();
+    if (allow_tf32) {
+      pattr.set_fpmath_mode(dnnl::fpmath_mode::tf32);
+    } else {
+      pattr.set_fpmath_mode(dnnl::fpmath_mode::strict);
+    }
   }

   // STEP3: create primitive
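The tests below drive this branch through the `torch.backends.mkldnn.flags(...)` context manager rather than the bare attribute, so the setting is restored on exit. A minimal sketch of that scoped form, under the same XPU assumptions as above:

    import torch

    x = torch.randn(256, 256, device="xpu")
    y = torch.randn(256, 256, device="xpu")

    # Scoped toggle: allow_tf32 reverts to its previous value when the block
    # exits, which is why the test helpers below are written as context managers.
    with torch.backends.mkldnn.flags(
        enabled=torch.backends.mkldnn.enabled,
        deterministic=torch.backends.mkldnn.deterministic,
        allow_tf32=True,
    ):
        z = x @ y  # f32 matmul eligible for fpmath_mode::tf32 on the oneDNN/XPU path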

test/xpu/test_gemm.py

Lines changed: 122 additions & 1 deletion
@@ -1,5 +1,8 @@
 # Owner(s): ["module: intel"]

+import contextlib
+import functools
+import inspect
 import itertools
 import math
 import random
@@ -23,6 +26,102 @@
 )


+@contextlib.contextmanager
+def tf32_off():
+    enabled = torch.backends.mkldnn.enabled
+    deterministic = torch.backends.mkldnn.deterministic
+    with torch.backends.mkldnn.flags(
+        enabled=enabled, deterministic=deterministic, allow_tf32=False
+    ):
+        yield
+
+
+@contextlib.contextmanager
+def tf32_on(self, tf32_precision=1e-5):
+    enabled = torch.backends.mkldnn.enabled
+    deterministic = torch.backends.mkldnn.deterministic
+    old_precision = self.precision
+    try:
+        self.precision = tf32_precision
+        with torch.backends.mkldnn.flags(
+            enabled=enabled, deterministic=deterministic, allow_tf32=True
+        ):
+            yield
+    finally:
+        self.precision = old_precision
+
+
+# This is a wrapper that runs a test twice, once with allow_tf32=True and
+# once with allow_tf32=False. When running with allow_tf32=True, it will use
+# reduced precision as specified by the argument. For example:
+#     @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128)
+#     @tf32_on_and_off(0.005)
+#     def test_matmul(self, device, dtype):
+#         a = ...; b = ...;
+#         c = torch.matmul(a, b)
+#         self.assertEqual(c, expected)
+# In the above example, when testing torch.float32, the matmul will run both
+# with TF32 mode on and with TF32 mode off; in TF32 mode, assertEqual will use
+# reduced precision to check values.
+#
+# This decorator can be used for functions with or without device/dtype, such as:
+#     @tf32_on_and_off(0.005)
+#     def test_my_op(self)
+#     @tf32_on_and_off(0.005)
+#     def test_my_op(self, device)
+#     @tf32_on_and_off(0.005)
+#     def test_my_op(self, device, dtype)
+#     @tf32_on_and_off(0.005)
+#     def test_my_op(self, dtype)
def tf32_on_and_off(tf32_precision=1e-5):
+    def with_tf32_disabled(self, function_call):
+        with tf32_off():
+            function_call()
+
+    def with_tf32_enabled(self, function_call):
+        with tf32_on(self, tf32_precision):
+            function_call()
+
+    def wrapper(f):
+        params = inspect.signature(f).parameters
+        arg_names = tuple(params.keys())
+
+        @functools.wraps(f)
+        def wrapped(*args, **kwargs):
+            kwargs.update(zip(arg_names, args))
+            cond = True
+            if "device" in kwargs:
+                cond = cond and (torch.device(kwargs["device"]).type == "xpu")
+            if "dtype" in kwargs:
+                cond = cond and (
+                    kwargs["dtype"] in {torch.float32}
+                )  # TODO: add complex64
+            if cond:
+                with_tf32_disabled(kwargs["self"], lambda: f(**kwargs))
+                with_tf32_enabled(kwargs["self"], lambda: f(**kwargs))
+            else:
+                f(**kwargs)
+
+        return wrapped
+
+    return wrapper
+
+
+# This is a wrapper that runs a test with TF32 turned off. It is designed
+# for tests that use matmul or convolutions but whose purpose is not to test
+# matmul or convolutions. Disabling TF32 forces torch.float tensors to always
+# be computed at full precision.
+def with_tf32_off(f):
+    @functools.wraps(f)
+    def wrapped(*args, **kwargs):
+        with tf32_off():
+            return f(*args, **kwargs)
+
+    return wrapped
+
+
 class TestBasicGEMM(TestCase):
     def _test_addmm_addmv(
         self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False, activation=None
@@ -133,11 +232,13 @@ def maybe_transpose(cond, m):

     @precisionOverride({torch.float: 1e-4, torch.double: 1e-6, torch.half: 1e-1})
     @dtypes(torch.float32, torch.half, torch.double)
+    @tf32_on_and_off(0.05)
     def test_addmm(self, device, dtype):
         self._test_addmm_impl(torch.addmm, None, device, dtype)

     @precisionOverride({torch.bfloat16: 1e-0, torch.half: 1e-3, torch.float: 1e-4})
     @dtypes(torch.bfloat16, torch.half, torch.float, torch.double)
+    @tf32_on_and_off(0.005)
     def test_addmv(self, device, dtype):
         # have to use torch.randn(...).to(bfloat16) instead of
         # torch.randn(..., dtype=bfloat16). randn does not support
@@ -185,6 +286,7 @@ def test_addmv(self, device, dtype):
         torch.float32,
         torch.float64,
     )
+    @tf32_on_and_off(0.05)
     def test_mm(self, device, dtype):
         def _test_mm(n, m, p, dtype, genf):
             # helper function
@@ -287,6 +389,7 @@ def genf_Half(x, y):

     @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05})
     @dtypes(torch.float32, torch.bfloat16, torch.half, torch.float64)
+    @tf32_on_and_off(0.05)
     def test_bmm(self, device, dtype):
         batch_sizes = [1, 10]
         M, N, O = 23, 15, 12
@@ -403,6 +506,7 @@ def _test_addbmm_baddbmm(self, func, b1, b2, ref, out_tensor):

     @precisionOverride({torch.half: 0.05, torch.bfloat16: 0.05})
     @dtypes(torch.float64, torch.float32, torch.bfloat16, torch.half)
+    @tf32_on_and_off(0.005)
     def test_addbmm(self, device, dtype):
         num_batches = 2
         M, N, O = 16, 17, 18
@@ -506,6 +610,7 @@ def generate_tensor():

     @precisionOverride({torch.half: 0.1, torch.bfloat16: 0.5, torch.float64: 1e-6})
     @dtypes(torch.float64, torch.float32, torch.bfloat16, torch.half)
+    @tf32_on_and_off(0.01)
     def test_baddbmm(self, device, dtype):
         num_batches = 10
         M, N, O = 12, 8, 50
@@ -568,6 +673,7 @@ def generate_tensor():
         for b1, b2, ref, out_tensor in generate_tensor():
             self._test_addbmm_baddbmm("baddbmm", b1, b2, ref, out_tensor)

+    @tf32_on_and_off(0.05)
     def test_tensordot(self, device):
         a = torch.arange(60.0, device=device).reshape(3, 4, 5)
         b = torch.arange(24.0, device=device).reshape(4, 3, 2)
@@ -604,6 +710,7 @@ def test_tensordot(self, device):

     @dtypes(torch.float, torch.double)
     @precisionOverride({torch.float32: 1e-4})
+    @tf32_on_and_off(0.005)
     def test_1_sized_with_0_strided(self, device, dtype):
         a = make_tensor((8, 1, 64), dtype=dtype, device=device)
         a_strided = torch.as_strided(a, size=[8, 1, 64], stride=[64, 0, 1])
@@ -646,6 +753,7 @@ def _select_broadcastable_dims(self, dims_full=None):
             dims_small = [ds] + dims_small
         return (dims_small, dims_large, dims_full)

+    @tf32_on_and_off(0.005)
     def test_broadcast_fused_matmul(self, device):
         fns = ["baddbmm", "addbmm", "addmm", "addmv", "addr"]

@@ -692,6 +800,7 @@ def dims_full_for_fn():
         self.assertEqual(r0, r1)

     @dtypes(torch.float32, torch.float64)
+    @tf32_on_and_off(0.005)
     def test_strided_mm_bmm(self, device, dtype):
         # Tests strided view case with stride smaller than corresponding dimension size
         x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=dtype, device=device)
@@ -706,6 +815,7 @@ def test_strided_mm_bmm(self, device, dtype):
         torch_fn = lambda x: torch.mm(x, x)  # noqa: E731
         self.compare_with_numpy(torch_fn, np_fn, sx[0])

+    @tf32_on_and_off(0.005)
     def test_mm_empty_inputs_mixed_dtype_errors(self, device):
         a = torch.randint(0, 10, [1, 10], dtype=torch.int16, device=device)
         b = torch.randn(10, 20, dtype=torch.float32, device=device)
@@ -714,6 +824,7 @@ def test_mm_empty_inputs_mixed_dtype_errors(self, device):
         ):
             torch.mm(a, b)

+    @tf32_on_and_off(0.005)
     def test_matmul_45724(self, device):
         # https://github.com/pytorch/pytorch/issues/45724
         a = torch.rand(65537, 22, 64, device=device, dtype=torch.half)
@@ -731,6 +842,7 @@ def test_matmul_45724(self, device):
         torch.float32,
         torch.float64,
     )
+    @tf32_on_and_off(0.005)
     def test_baddbmm_input_dtypes_compatibility(self, device, dtype):
         batch1 = torch.rand((1, 2, 2), dtype=torch.float32, device=device)
         batch2 = torch.rand((1, 2, 2), dtype=torch.float32, device=device)
@@ -745,6 +857,7 @@ def test_baddbmm_input_dtypes_compatibility(self, device, dtype):
         self.assertEqual(out, y_ref)

     @dtypes(torch.float)
+    @tf32_on_and_off(0.005)
     def test_baddbmm_nan_input_with_zero_beta(self, device, dtype):
         for shape in [[3, 2, 2], [2, 20, 20]]:
             mat1, mat2 = (
@@ -767,6 +880,7 @@ def test_baddbmm_nan_input_with_zero_beta(self, device, dtype):

     @precisionOverride({torch.double: 1e-6})
     @dtypes(torch.float, torch.double)
+    @tf32_on_and_off(0.005)
     def test_addmm_sizes(self, device, dtype):
         for m in [0, 1, 25]:
             for n in [0, 1, 10]:
@@ -798,6 +912,7 @@ def test_addmm_sizes(self, device, dtype):
         }
     )
     @dtypes(torch.double, torch.float32, torch.bfloat16, torch.half)
+    @tf32_on_and_off(0.05)
     def test_addmm_gelu(self, device, dtype):
         self._test_addmm_impl(torch._addmm_activation, "gelu", device, dtype)

@@ -812,10 +927,12 @@ def test_addmm_gelu(self, device, dtype):
         }
     )
     @dtypes(torch.double, torch.float32, torch.bfloat16, torch.half)
+    @tf32_on_and_off(0.05)
     def test_addmm_relu(self, device, dtype):
         self._test_addmm_impl(torch._addmm_activation, "relu", device, dtype)

-    @dtypes(torch.float, torch.bfloat16, torch.half, torch.double)
+    @dtypes(torch.float, torch.bfloat16, torch.half)
+    @tf32_on_and_off(0.005)
     def test_addmv_rowmajor_colmajor_incx_incy_lda(self, device, dtype):
         # tests (o, s)*(s). o is output size, s is summed size.
         o = 5
@@ -859,6 +976,7 @@ def _test(row_major, incx, incy, lda_tail):
         }
     )
     @dtypes(torch.double, torch.bfloat16, torch.half, torch.float32)
+    @tf32_on_and_off(0.005)
     def test_corner_cases_of_cublasltmatmul(self, device, dtype):
         # common case
         M = torch.randn(128, device=device).to(dtype)
@@ -998,6 +1116,7 @@ def call_torch_fn(*args, **kwargs):
             torch.tensor(0.0, device=device), fn(torch.dot, (0,), (0,), test_out=True)
         )

+    @tf32_on_and_off(0.005)
     def test_large_bmm_backward(self, device):
         A = torch.randn([1024, 2, 1024], device=device).mT.contiguous().mT
         B = torch.randn([1, 1024, 65536], device=device, requires_grad=True)
@@ -1006,6 +1125,7 @@ def test_large_bmm_backward(self, device):
         # Should not create an intermediary tensor of size [1024, 1024, 65536] (256GB of memory) and OOM
         (A @ B).backward(G)

+    @tf32_on_and_off(0.005)
     def test_large_bmm_mm_backward(self, device):
         A = torch.randn([1024, 2, 1024], device=device).mT.contiguous().mT
         B = torch.randn([1024, 65536], device=device, requires_grad=True)
@@ -1104,6 +1224,7 @@ def test_matmul_small_brute_force_3d_Nd(self, device, dtype):
         self.check_single_matmul(x, y)

     @dtypes(torch.float)
+    @tf32_on_and_off(0.005)
     def test_matmul_out_kernel_errors_with_autograd(self, device, dtype):
         a = torch.empty(
             (256, 512), device=device, dtype=dtype, requires_grad=True
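The `tf32_on_and_off` decorator is documented by the comment block in the diff above; note how `wrapped` first folds positional arguments into kwargs (`kwargs.update(zip(arg_names, args))`) so it can detect `device` and `dtype` regardless of how the test framework passes them. The `with_tf32_off` wrapper has no inline example; a hypothetical usage sketch, assuming the imports and TestCase machinery of test_gemm.py (the test name and body are illustrative, not part of the patch):

    class TestShapePlumbing(TestCase):
        @with_tf32_off
        def test_matmul_output_shape(self, device="xpu"):
            # matmul is incidental here, so TF32 is forced off to keep float32
            # results at full precision regardless of the global setting.
            x = torch.randn(64, 64, device=device)
            self.assertEqual((x @ x).shape, torch.Size([64, 64]))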
