1
1
# Owner(s): ["module: intel"]
2
2
3
+ import contextlib
3
4
import itertools
4
5
import math
5
6
import unittest
13
14
instantiate_device_type_tests ,
14
15
precisionOverride ,
15
16
)
16
- from torch .testing ._internal .common_dtype import floating_and_complex_types_and
17
- from torch .testing ._internal .common_mkldnn import bf32_on_and_off
17
+ from torch .testing ._internal .common_dtype import (
18
+ floating_and_complex_types_and ,
19
+ floating_types_and ,
20
+ )
21
+ from torch .testing ._internal .common_mkldnn import reduced_f32_on_and_off
18
22
from torch .testing ._internal .common_utils import (
19
23
IS_WINDOWS ,
20
24
parametrize ,
@@ -98,7 +102,7 @@ def preferred_linalg_library(self):
98
102
@precisionOverride ({torch .half : 0.05 , torch .bfloat16 : 0.05 })
99
103
@dtypes (* floating_and_complex_types_and (torch .bfloat16 , torch .half ))
100
104
@tf32_on_and_off (0.05 )
101
- @bf32_on_and_off (0.05 )
105
+ @reduced_f32_on_and_off (0.05 )
102
106
def addbmm (self , device , dtype ):
103
107
num_batches = 2
104
108
M , N , O = 16 , 17 , 18
@@ -392,6 +396,83 @@ def ck_blas_library(self):
392
396
pass
393
397
394
398
399
@precisionOverride(
    {
        torch.double: 1e-8,
        torch.float: 1e-4,
        torch.bfloat16: 5e-2,
        torch.half: 5e-2,
        torch.cfloat: 1e-4,
        torch.cdouble: 1e-8,
    }
)
@dtypes(*floating_types_and(torch.bfloat16, torch.half))
@tf32_on_and_off(0.05)
@reduced_f32_on_and_off(0.05)
def addmm_relu_tunableop_rocm(self, device, dtype):
    """Exercise the fused addmm + relu path with TunableOp enabled.

    Runs inside ``_tunableop_ctx`` so TunableOp is switched on for the
    duration of the test and all generated CSV files / env vars are cleaned
    up afterwards.  Tuning is kept cheap: a single tuning iteration and no
    rotating buffer.
    """
    with self._tunableop_ctx():
        # Keep the tuning phase minimal so the test stays fast.
        torch.xpu.tunable.set_max_tuning_iterations(1)
        torch.xpu.tunable.set_rotating_buffer_size(0)
        self._test_addmm_impl(torch._addmm_activation, "relu", device, dtype)
417
+
418
+
419
def get_tunableop_untuned_filename():
    """Return the per-device "untuned" CSV filename TunableOp writes to.

    TunableOp inserts the current device ordinal immediately before the
    ``.csv`` suffix, so a base name of ``foo.csv`` on device 0 becomes
    ``foo0.csv``.  The base name comes from the
    ``PYTORCH_TUNABLEOP_UNTUNED_FILENAME`` environment variable.
    """
    import os

    ordinal = torch.xpu.current_device()
    untuned_filename_env = os.getenv("PYTORCH_TUNABLEOP_UNTUNED_FILENAME")
    if untuned_filename_env is None:
        # Guard against the env var being unset: this function is called
        # from the cleanup (finally) path of _tunableop_ctx, and letting
        # None.rpartition raise AttributeError there would mask the test's
        # real failure.  NOTE(review): falls back to TunableOp's documented
        # default untuned filename — confirm against the TunableOp docs.
        untuned_filename_env = "tunableop_untuned.csv"
    untuned_filename_base, _, _ = untuned_filename_env.rpartition(".")
    return f"{untuned_filename_base}{ordinal}.csv"
427
+
428
+
429
@contextlib.contextmanager
def __tunableop_ctx(self):
    """Enable TunableOp around the with-block, then tear everything down.

    On exit TunableOp is disabled again, every result/untuned CSV file that
    tuning produced is deleted (across all device ordinals), and the
    environment variables a TunableOp test may have set are removed.
    """
    import glob
    import os

    self._set_tunableop_defaults()
    torch.xpu.tunable.enable(True)

    try:
        yield
    finally:
        # Disable TunableOp before cleaning up its artifacts.
        torch.xpu.tunable.enable(False)

        # Both filenames end in "<ordinal>.csv"; strip the ".csv" via
        # rpartition and the ordinal via [:-1], then glob so files from
        # every device ordinal are removed.
        for base_name in (
            torch.xpu.tunable.get_filename(),
            get_tunableop_untuned_filename(),
        ):
            stem, _, _ = base_name.rpartition(".")
            for csv_file in glob.glob(f"{stem[:-1]}*.csv"):
                # NB: the file is locked on Windows, hence PermissionError.
                with contextlib.suppress(FileNotFoundError, PermissionError):
                    os.remove(csv_file)

        # Drop any environment variable a TunableOp test might have set;
        # pop with a default is a no-op when the variable is absent.
        for env_var in (
            "PYTORCH_TUNABLEOP_BLAS_LOG",
            "PYTORCH_TUNABLEOP_NUMERICAL_CHECK",
            "PYTORCH_TUNABLEOP_UNTUNED_FILENAME",
        ):
            os.environ.pop(env_var, None)
474
+
475
+
395
476
with XPUPatchForImport (False ):
396
477
from test_linalg import TestLinalg
397
478
@@ -410,6 +491,8 @@ def ck_blas_library(self):
410
491
TestLinalg .test_matmul_small_brute_force_2d_Nd = matmul_small_brute_force_2d_Nd
411
492
TestLinalg .test_matmul_small_brute_force_3d_Nd = matmul_small_brute_force_3d_Nd
412
493
TestLinalg .test_ck_blas_library = ck_blas_library
494
+ TestLinalg .test_addmm_relu_tunableop_rocm = addmm_relu_tunableop_rocm
495
+ TestLinalg ._tunableop_ctx = __tunableop_ctx
413
496
414
497
TestLinalg ._default_dtype_check_enabled = True
415
498
instantiate_device_type_tests (TestLinalg , globals (), only_for = ("xpu" ), allow_xpu = True )
0 commit comments