
Commit e05d677

iupaikov-amd authored and jeffdaily committed
Unskipped multiple inductor tests for ROCm (pytorch#143581)
All of them should be fine to run now after the triton fix.

Pull Request resolved: pytorch#143581
Approved by: https://github.com/jataylo, https://github.com/jeffdaily
Co-authored-by: Jeff Daily <[email protected]>
1 parent 28b4992 commit e05d677
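
For context, the change is mechanical: every hunk below either deletes a @skipIfRocm decorator or drops the now-unused skipIfRocm import, so the affected inductor tests run on ROCm builds again. As a rough sketch of what such a guard does, the snippet below shows a simplified stand-in for a ROCm skip decorator; it is not PyTorch's actual skipIfRocm from torch.testing._internal.common_utils, and the helper name skip_if_rocm is invented for this illustration.

import unittest

import torch

# Simplified stand-in for a ROCm skip guard. PyTorch's real helper,
# torch.testing._internal.common_utils.skipIfRocm, also accepts a custom message.
TEST_WITH_ROCM = torch.version.hip is not None


def skip_if_rocm(fn):
    # Skip the decorated test whenever this is a ROCm (HIP) build of torch.
    return unittest.skipIf(TEST_WITH_ROCM, "test skipped on ROCm")(fn)


class ExampleTest(unittest.TestCase):
    @skip_if_rocm  # deleting a line like this is what "unskipping" means in this commit
    def test_something(self):
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()

Once unskipped, the tests can be exercised on a ROCm machine with the usual runners, for example pytest test/inductor/test_max_autotune.py -k test_precompilation_threads (shown only as an illustrative invocation).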

7 files changed: +2 / -32 lines changed

test/inductor/test_flex_decoding.py

Lines changed: 0 additions & 1 deletion
@@ -1350,7 +1350,6 @@ def bias_mod(score, batch, head, token_q, token_kv):
         self.run_test(bias_mod)
         self.run_test_with_paged_attention(bias_mod)
 
-    @skipIfRocm
     @supported_platform
     def test_fully_masked_out_rows_0_check_gqa(self):
         # Ensure fully masked out rows won't cause NaNs.

test/inductor/test_inductor_freezing.py

Lines changed: 0 additions & 1 deletion
@@ -712,7 +712,6 @@ def foo(mod, inp):
         self.assertEqual(eager, compiled)
         self.assertTrue(weight_ref() is None)
 
-    @skipIfRocm
     def test_conv_with_as_strided(self):
         class Model(nn.Module):
             def __init__(self, groups):

test/inductor/test_max_autotune.py

Lines changed: 0 additions & 6 deletions
@@ -305,7 +305,6 @@ def mm(a, b):
         with config.patch({"max_autotune": True}):
             torch.compile(mm, dynamic=dynamic)(a, b)
 
-    @skipIfRocm
     def test_precompilation_threads(self):
         import threading
         from typing import Any, Dict
@@ -481,7 +480,6 @@ def addmm(x, a, b):
         with config.patch({"max_autotune": True}):
             torch.compile(addmm, dynamic=dynamic)(x, a, b)
 
-    @skipIfRocm
     def test_autotune_conv1x1(self):
         # Assuming input has 3 channels and we want to produce 16 channels as output
         conv1x1 = (
@@ -512,7 +510,6 @@ def foo(mod, x):
         FileCheck().check_not("extern_kernels.convolution").run(code[0])
         self.assertEqual(conv1x1(input_tensor), out, atol=1e-2, rtol=0)
 
-    @skipIfRocm
     def test_filled_cache_precompile(self):
         def fn(a, b, c):
             a = (a @ b) @ c
@@ -531,7 +528,6 @@ def fn(a, b, c):
         fn_c = torch.compile(mode="max-autotune-no-cudagraphs")(fn)
         self.assertEqual(counters["inductor"]["select_algorithm_precompile"], 0)
 
-    @skipIfRocm
     @fresh_inductor_cache()
     @config.patch(search_autotune_cache=True)
     def test_search_autotune_cache(self):
@@ -547,7 +543,6 @@ def fn(a, b, c):
         self.assertEqual(fn(*inputs), fn_c(*inputs), atol=1e-2, rtol=1e-2)
         self.assertEqual(counters["inductor"]["select_algorithm_precompile"], 0)
 
-    @skipIfRocm
     @fresh_inductor_cache()
     @config.patch(max_autotune=True, max_fusion_size=2)
     def test_jit_fusion_matches_aot_fusion(self):
@@ -990,7 +985,6 @@ def tearDown(self):
         super().tearDown()
         PatchCaches.tearDown()
 
-    @skipIfRocm
     @parametrize("dynamic", (False, True))
     def test_max_autotune_remote_caching(self, dynamic: bool):
         from unittest.mock import patch

test/inductor/test_memory_planning.py

Lines changed: 1 addition & 7 deletions
@@ -3,12 +3,7 @@
 import sys
 import unittest
 
-from torch.testing._internal.common_utils import (
-    IS_CI,
-    IS_WINDOWS,
-    skipIfRocm,
-    skipIfXpu,
-)
+from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS, skipIfXpu
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, requires_gpu
 
 
@@ -83,7 +78,6 @@ def test_cpp_wrapper(self):
         )
         self.assertTrue(same(f(*args), result))
 
-    @skipIfRocm(msg="test_aot_inductor doesn't work on ROCm")
     @skipIfXpu(msg="aoti doesn't work on XPU")
     def test_aoti(self):
         try:

test/inductor/test_pattern_matcher.py

Lines changed: 0 additions & 2 deletions
@@ -143,7 +143,6 @@ def _test_fused_int_mm_mul_impl(self, fn, args, fused_int_mm_mul_expected=True):
                 ref[indices], test[indices]
             )  # also checks that dtype is correct
 
-    @skipIfRocm
     @skipIfXpu
     @skipCUDAIf(not SM80OrLater, "need sm_80")
     @inductor_config.patch(force_fuse_int_mm_with_mul=True)
@@ -237,7 +236,6 @@ def f_replaced(x: torch.Tensor) -> torch.Tensor:
         self.assertEqual(f(inp), f_replaced(inp))
         self.assertEqual(count, 2)
 
-    @skipIfRocm
     @skipIfXpu
     @skipCUDAIf(not SM80OrLater, "need sm_80")
     @inductor_config.patch(force_fuse_int_mm_with_mul=True)

test/inductor/test_select_algorithm.py

Lines changed: 0 additions & 3 deletions
@@ -112,8 +112,6 @@ def foo(a, b):
         )
         self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
 
-    # FIXME: Investigate why _int_mm_out_cuda is not compiled on ROCm
-    @skipIfRocm
     @patches
     def test__int_mm(self):
         @torch.compile
@@ -296,7 +294,6 @@ def fn(x1, x2, seed):
         )
         self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
 
-    @skipIfRocm
     @patches
     @torch._inductor.config.patch(conv_1x1_as_mm=False)
     def test_convolution2(self):

test/inductor/test_triton_kernels.py

Lines changed: 1 addition & 12 deletions
@@ -19,12 +19,7 @@
 from torch._library import capture_triton
 from torch.testing import FileCheck
 from torch.testing._internal import common_utils
-from torch.testing._internal.common_utils import (
-    parametrize,
-    skipIfRocm,
-    skipIfXpu,
-    TEST_WITH_ROCM,
-)
+from torch.testing._internal.common_utils import parametrize, skipIfXpu, TEST_WITH_ROCM
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CUDA, HAS_GPU, HAS_XPU
 from torch.testing._internal.logging_utils import logs_to_string
 
@@ -550,7 +545,6 @@ def call_triton(output):
             call_triton(output)
 
     @requires_gpu
-    @skipIfRocm
     def test_triton_kernel_dependancies(self):
         def call_triton(
             x: torch.Tensor,
@@ -669,7 +663,6 @@ def call_triton(
 
     @requires_gpu
     @skipIfXpu
-    @skipIfRocm
     def test_triton_kernel_constants(self):
         @triton.jit
         def mulC_kernel(
@@ -754,7 +747,6 @@ def grid_fn(meta):
         self.assertEqual(compiled_func(t1, t2, output2), torch_add)
 
     @requires_gpu
-    @skipIfRocm  # https://github.com/pytorch/pytorch/actions/runs/10051552819/job/27782048305?pr=131431
     @common_utils.parametrize("backend", ["eager", "aot_eager", "inductor"])
     @patch.object(
         torch._inductor.config, "unsafe_ignore_unsupported_triton_autotune_args", True
@@ -1304,7 +1296,6 @@ def f(x, y):
         self.assertEqual(compiled_out, eager_out)
 
     @requires_gpu
-    @skipIfRocm
     def test_triton_kernel_with_imported_symbol(self):
         @triton.jit
         def add_kernel_with_imported_symbol(
@@ -1336,7 +1327,6 @@ def f(x):
         self.assertEqual(compiled_out, eager_out)
 
     @requires_gpu
-    @skipIfRocm
     def test_triton_kernel_with_imported_symbol_with_custom_name(self):
         @triton.jit
         def add_kernel_with_imported_symbol(
@@ -2434,7 +2424,6 @@ def argmax_kernel(a_ptr, c_ptr, stride_am, stride_an):
         )
 
     @requires_gpu
-    @skipIfRocm
     def test_triton_kernel_inference_mode(self):
         def f(x, y, out):
             n_elements = x.numel()
