
Commit 0016598

[release/2.6] NAVI32 specific fixes - Used general decorator instead of ROCm specific (#2515)
Fixes ROCm/frameworks-internal#12096
1 parent 2e48b21 commit 0016598

File tree

2 files changed: +8 −29 lines changed

test/inductor/test_max_autotune.py

Lines changed: 8 additions & 4 deletions
@@ -30,14 +30,18 @@
 from torch._inductor.virtualized import V
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.testing import FileCheck
+from torch.testing._internal.common_device_type import largeTensorTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
-    skipIfRocmNotEnoughMemory,
     skipIfRocm,
     TEST_WITH_ROCM,
 )
-from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
+from torch.testing._internal.inductor_utils import (
+    GPU_TYPE,
+    HAS_CPU,
+    HAS_CUDA,
+)
 
 
 torch.set_float32_matmul_precision("high")
@@ -719,7 +723,7 @@ def test_conv_backend(self):
         self.assertIn("NoValidChoicesError", str(context.exception))
 
     # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
-    @skipIfRocmNotEnoughMemory(30)
+    @largeTensorTest("30 GB", device=GPU_TYPE)
     def test_non_contiguous_input_mm(self):
         """
         Make sure the triton template can work with non-contiguous inputs without crash.
@@ -770,7 +774,7 @@ def f(x, y):
         torch.testing.assert_close(act, ref, atol=2e-2, rtol=1e-2)
 
     # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
-    @skipIfRocmNotEnoughMemory(30)
+    @largeTensorTest("30 GB", device=GPU_TYPE)
     def test_non_contiguous_input_mm_plus_mm(self):
         x1 = rand_strided((50257, 32768), (1, 50304), device="cuda")
         y1 = rand_strided((32768, 768), (768, 1), device="cuda")
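
For context, here is a minimal sketch of the general decorator in use. It is illustrative only and not part of this commit; the test class and method names are hypothetical. largeTensorTest skips a test when the target device cannot supply the requested amount of memory, so one guard now covers CUDA and ROCm alike:

# Minimal sketch of the general decorator in use; ExampleTest and
# test_big_matmul are hypothetical names, not part of this commit.
import torch
from torch.testing._internal.common_device_type import largeTensorTest
from torch.testing._internal.common_utils import TestCase, run_tests
from torch.testing._internal.inductor_utils import GPU_TYPE


class ExampleTest(TestCase):
    # Skipped automatically unless the GPU can provide ~30 GB,
    # regardless of vendor (CUDA or ROCm).
    @largeTensorTest("30 GB", device=GPU_TYPE)
    def test_big_matmul(self):
        x = torch.rand((50257, 32768), device=GPU_TYPE)
        y = torch.rand((32768, 768), device=GPU_TYPE)
        torch.mm(x, y)


if __name__ == "__main__":
    run_tests()

Dropping the ROCm-only decorator in favor of the shared one means NAVI32 and other low-VRAM parts get the same skip behavior as memory-constrained CUDA devices, with no bespoke code path to maintain.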

torch/testing/_internal/common_utils.py

Lines changed: 0 additions & 25 deletions
@@ -1872,31 +1872,6 @@ def wrap_fn(self, *args, **kwargs):
         return wrap_fn
     return dec_fn
 
-# Checks if current ROCm device has enough VRAM against the required amount in GB
-def skipIfRocmNotEnoughMemory(required_amount):
-    def dec_fn(fn):
-        @wraps(fn)
-        def wrap_fn(self, *args, **kwargs):
-            if TEST_WITH_ROCM:
-                device = torch.cuda.current_device()
-                props = torch.cuda.get_device_properties(device)
-
-                total = props.total_memory / (1024 ** 3)  # in GB
-                # This will probably return 0 because it only counts tensors
-                # and doesn't take into account any small supporting allocations
-                allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
-                free_global = total - allocated
-
-                result = free_global > required_amount
-
-                if not result:
-                    reason = f"skipIfRocm: Not enough free VRAM on current ROCm device. " \
-                             f"Available: {free_global:.2f} GB | Required: {required_amount:.2f} GB."
-                    raise unittest.SkipTest(reason)
-            return fn(self, *args, **kwargs)
-        return wrap_fn
-    return dec_fn
-
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
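
As the removed code's own comment notes, torch.cuda.memory_allocated() only counts live tensor allocations tracked by the caching allocator, so total - allocated tends to overstate the memory actually free on the device. A small sketch (again illustrative, not part of this commit) contrasting that estimate with the driver-reported figure from torch.cuda.mem_get_info():

# Sketch contrasting the removed decorator's free-VRAM estimate with the
# driver-reported figure; illustrative only, not part of this commit.
import torch

if torch.cuda.is_available():
    device = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(device)

    # The removed decorator's estimate: counts live tensor allocations only.
    total = props.total_memory / (1024 ** 3)
    allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
    print(f"naive estimate: {total - allocated:.2f} GB free")

    # Driver-reported figure: accounts for every allocation on the device.
    free_bytes, _ = torch.cuda.mem_get_info(device)
    print(f"driver-reported: {free_bytes / (1024 ** 3):.2f} GB free")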
