test_decompose_mem_bound_mm.py tolerance increase for navi3x

iupaikov-amd · pruthvistony · commit 03c7da05f618 · 2025-05-20T11:24:59.000-05:00
diff --git a/test/inductor/test_decompose_mem_bound_mm.py b/test/inductor/test_decompose_mem_bound_mm.py
@@ -1,6 +1,7 @@
 # Owner(s): ["module: inductor"]
 
 import logging
+import unittest
 
 import torch
 import torch._inductor
@@ -11,8 +12,10 @@
 from torch.testing import FileCheck
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
+    patch_test_members,
+    is_navi3_arch,
     parametrize,
-    skipIfXpu,
+    TEST_XPU,
 )
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CUDA
 from torch.testing._internal.triton_utils import requires_gpu
@@ -48,9 +51,10 @@ def forward(self, input1, input2):
 
 
 @requires_gpu
-@skipIfXpu(
-    msg="Intel GPU has not enabled decompose_mem_bound_mm PASS in "
-    "torch/_inductor/fx_passes/decompose_mem_bound_mm.py"
+@unittest.skipIf(
+    TEST_XPU,
+    "Intel GPU has not enabled decompose_mem_bound_mm PASS in "
+    "torch/_inductor/fx_passes/decompose_mem_bound_mm.py",
 )
 @torch._inductor.config.patch(
     post_grad_fusion_options={
@@ -59,31 +63,46 @@ def forward(self, input1, input2):
 )
 @instantiate_parametrized_tests
 class TestDecomposeMemMM(TestCase):
-    def compare_dict_tensors(self, ref_dict, res_dict, rtol=1e-3, atol=1e-3):
+    def __init__(self, method_name="runTest", methodName="runTest"):
+        super().__init__(method_name, methodName)
+        self.atol = 1e-3
+        self.rtol = 1e-3
+
+    def setup_tolerance(self, rtol=None, atol=None):
+        """Return (rtol, atol), using self.rtol/self.atol for any None."""
+        rtol = self.rtol if rtol is None else rtol
+        atol = self.atol if atol is None else atol
+        return rtol, atol
+
+    def compare_dict_tensors(self, ref_dict, res_dict, rtol=None, atol=None):
+        rtol, atol = self.setup_tolerance(rtol, atol)
         if len(set(ref_dict.keys())) != len(set(res_dict.keys())):
             return False
         for key1 in ref_dict.keys():
             key2 = "_orig_mod." + key1
             assert key2 in res_dict, f"{key1} does not exist in traced module"
             if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=rtol, atol=atol):
                 return False
         return True
 
-    def compare_pred(self, module, traced, input, rtol=1e-3, atol=1e-3):
+    def compare_pred(self, module, traced, input, rtol=None, atol=None):
+        rtol, atol = self.setup_tolerance(rtol, atol)
         ref = module(*input)
         res = traced(*input)
         self.assertEqual(ref, res, rtol=rtol, atol=atol)
 
-    def compare_parameters(self, module, traced, rtol=1e-3, atol=1e-3):
+    def compare_parameters(self, module, traced, rtol=None, atol=None):
+        rtol, atol = self.setup_tolerance(rtol, atol)
         ref_params = dict(module.named_parameters())
         res_params = dict(traced.named_parameters())
         self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol, atol))
 
-    def compare_gradients(self, module, traced, rtol=1e-3, atol=1e-3):
+    def compare_gradients(self, module, traced, rtol=None, atol=None):
+        rtol, atol = self.setup_tolerance(rtol, atol)
         ref_grad = {key: param.grad for key, param in module.named_parameters()}
         res_grad = {key: param.grad for key, param in traced.named_parameters()}
         self.assertTrue(
             self.compare_dict_tensors(ref_grad, res_grad, rtol=rtol, atol=atol)
         )
 
     @parametrize(
@@ -190,6 +209,12 @@ def test_decompose_linear(self, m, n, k, has_bias, should_decompose):
         )
         counters.clear()
 
+    # We have to increase the tolerance for navi3 because all fp16 and bf16
+    # GEMM operations have an accuracy issue caused by a hardware limitation.
+    @patch_test_members({
+        "atol": 2e-3 if is_navi3_arch() else 1e-3,
+        "rtol": 2e-3 if is_navi3_arch() else 1e-3
+    })
     @parametrize(
         "m,k,n, should_decompose",
         [(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)],
@@ -298,6 +323,12 @@ def test_decompose_mm_cpu(self, m, n, k, should_decompose):
         )
         counters.clear()
 
+    # We have to increase the tolerance for navi3 because all fp16 and bf16
+    # GEMM operations have an accuracy issue caused by a hardware limitation.
+    @patch_test_members({
+        "atol": 3e-3 if is_navi3_arch() else 1e-3,
+        "rtol": 4e-3 if is_navi3_arch() else 1e-3
+    })
     @parametrize(
         "m,k,n, should_decompose",
         [(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)],
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
@@ -102,8 +102,18 @@
     has_pytest = False
 
 
-MI300_ARCH = ("gfx942",)
-
+MI300_ARCH = ("gfx940", "gfx941", "gfx942")
+NAVI3_ARCH = ("gfx1100", "gfx1101")
+NAVI4_ARCH = ("gfx1200", "gfx1201")
+NAVI_ARCH = ("gfx1030", *NAVI3_ARCH, *NAVI4_ARCH)
+
+def is_navi3_arch():
+    """Return True when CUDA device 0 reports an arch in NAVI3_ARCH."""
+    if not torch.cuda.is_available():
+        return False
+    device_props = torch.cuda.get_device_properties(0)
+    # Only the text before the first ":" of gcnArchName is compared.
+    return device_props.gcnArchName.split(":")[0] in NAVI3_ARCH
 
 def freeze_rng_state(*args, **kwargs):
     return torch.testing._utils.freeze_rng_state(*args, **kwargs)
@@ -5646,3 +5656,26 @@ def load_inline(*args, **kwargs):
         return func(*args, load_inline=load_inline, **kwargs)
 
     return wrapper
+
+# Temporarily overrides attributes on the test instance while a test runs.
+def patch_test_members(updates: Dict[str, Any]):
+    """Return a decorator that sets ``updates`` on ``self`` around the test."""
+
+    def decorator(test_func):
+        @wraps(test_func)
+        def wrapper(self, *args, **kwargs):
+            # Snapshot every member first; getattr raises AttributeError for
+            # a missing member before any attribute has been modified.
+            saved = {}
+            for name in updates:
+                saved[name] = getattr(self, name)
+            for name, patched in updates.items():
+                setattr(self, name, patched)
+            try:
+                return test_func(self, *args, **kwargs)
+            finally:
+                # Put the saved values back even if the test raised.
+                for name, previous in saved.items():
+                    setattr(self, name, previous)
+        return wrapper
+    return decorator