
Commit 01f94d4

daisydenetaf authored and committed
[xpu][test] [1/N] Enable missing Intel GPU inductor tests (pytorch#167047)
Pull Request resolved: pytorch#167047
Approved by: https://github.com/etaf, https://github.com/jansel
Co-authored-by: xinan.lin <[email protected]>
1 parent 35dae27 commit 01f94d4

30 files changed: +332, -283 lines

test/dynamo/test_higher_order_ops.py

Lines changed: 5 additions & 2 deletions

@@ -39,7 +39,10 @@
 )
 from torch.testing._internal.hop_db import hop_db
 from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
-from torch.testing._internal.triton_utils import requires_cuda_and_triton
+from torch.testing._internal.triton_utils import (
+    requires_cuda_and_triton,
+    requires_gpu_and_triton,
+)


 def count_ops(gm, args, freq, op):

@@ -6980,7 +6983,7 @@ def fn(x, y):
             fn, backend, x, y, skip_check=True
         )  # dropout decomp is known to diverge with eager

-    @requires_cuda_and_triton
+    @requires_gpu_and_triton
     @torch._functorch.config.patch(functionalize_rng_ops=True)
     def test_fallback(self):
         def gn(x, y):
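
For context, a minimal sketch (not part of the commit) of the gating pattern the hunks above switch to: requires_gpu_and_triton presumably skips a test unless a Triton-capable GPU backend (CUDA or XPU) is available, where requires_cuda_and_triton is CUDA-only. The test class and body below are hypothetical, for illustration only.

# Sketch only: hypothetical test module illustrating the decorator swap above.
import torch
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.inductor_utils import GPU_TYPE  # e.g. "cuda" or "xpu"
from torch.testing._internal.triton_utils import requires_gpu_and_triton


class ExampleGpuTests(TestCase):  # hypothetical class, not part of the commit
    @requires_gpu_and_triton  # was @requires_cuda_and_triton before this change
    def test_compile_on_available_gpu(self):
        x = torch.randn(8, device=GPU_TYPE)
        self.assertEqual(torch.compile(torch.sin)(x), torch.sin(x))


if __name__ == "__main__":
    run_tests()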

test/inductor/test_aot_inductor.py

Lines changed: 8 additions & 4 deletions

@@ -1554,7 +1554,8 @@ def forward(self, x, y):

     # scaled_dot_product_flash_attention
     @unittest.skipIf(
-        not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
+        not SM80OrLater and not HAS_XPU_AND_TRITON,
+        "bfloat16 only supported in sm80+ or XPU",
     )
     def test_sdpa(self):
         class Model(torch.nn.Module):

@@ -1571,7 +1572,10 @@ def forward(self, q, k, v):
         )
         self.check_model(Model(), example_inputs)

-    @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
+    @unittest.skipIf(
+        not SM80OrLater and not HAS_XPU_AND_TRITON,
+        "bfloat16 only supported in sm80+ or XPU",
+    )
     @unittest.skipIf(
         # for archs where this isn't lowered to flash attention, the math
         # backend will be used and it doesn't work for bfloat16

@@ -5926,8 +5930,8 @@ def forward_block(self, x):
     @requires_gpu
     def test_d2h_copy(self):
         # device to copy host should always have the same stride
-        if "cuda" not in self.device:
-            raise unittest.SkipTest("This test is only for CUDA")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("This test is only for CUDA or XPU")

         class ToCpuModel(nn.Module):
             def forward(self, x):
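
The hunks above generalize CUDA-only capability checks: the bfloat16 skip now also admits XPU, and the device check becomes a membership test. Below is a minimal, hypothetical sketch of both patterns; the import paths for SM80OrLater and HAS_XPU_AND_TRITON are assumptions (they are not shown in this diff), and the class and test bodies are illustrative only.

# Sketch only: hypothetical tests showing the two device-gating patterns above.
import unittest

import torch
from torch.testing._internal.common_cuda import SM80OrLater  # assumed import path
from torch.testing._internal.inductor_utils import HAS_XPU_AND_TRITON  # assumed import path
from torch.testing._internal.common_utils import TestCase


class ExampleDeviceGates(TestCase):  # hypothetical class, not part of the commit
    device = "xpu" if HAS_XPU_AND_TRITON else "cuda"  # simplified stand-in

    @unittest.skipIf(
        not SM80OrLater and not HAS_XPU_AND_TRITON,
        "bfloat16 only supported in sm80+ or XPU",
    )
    def test_bf16_add(self):
        x = torch.ones(4, dtype=torch.bfloat16, device=self.device)
        self.assertEqual((x + x).dtype, torch.bfloat16)

    def test_device_allowlist(self):
        # Membership test instead of a CUDA-only substring check,
        # mirroring the test_d2h_copy change above.
        if self.device not in ["cuda", "xpu"]:
            raise unittest.SkipTest("This test is only for CUDA or XPU")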

test/inductor/test_aot_inductor_package.py

Lines changed: 3 additions & 3 deletions

@@ -28,7 +28,7 @@
     load_weights_to_pt2_contents,
 )
 from torch.testing._internal.common_cuda import _get_torch_cuda_version
-from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu
+from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu, TEST_CUDA
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU


@@ -267,9 +267,9 @@ def forward(self, x, y):

     @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
     @unittest.skipIf(
-        _get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
+        TEST_CUDA and _get_torch_cuda_version() < (12, 6),
+        "Test is only supported on CUDA 12.6+",
     )
-    @skipIfXpu  # build system may be different
     def test_compile_after_package(self):
         self.check_package_cpp_only()

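A short, hypothetical sketch of the guard added above: on builds without CUDA, TEST_CUDA is False and short-circuits the version comparison, so XPU runs are presumably no longer skipped by a CUDA-version check (and the blanket @skipIfXpu is dropped). The class and test body are placeholders; the real test calls check_package_cpp_only().

# Sketch only: hypothetical test showing the TEST_CUDA-guarded version gate above.
import unittest

from torch.testing._internal.common_cuda import _get_torch_cuda_version
from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TestCase


class ExamplePackageTests(TestCase):  # hypothetical class, not part of the commit
    @unittest.skipIf(
        TEST_CUDA and _get_torch_cuda_version() < (12, 6),
        "Test is only supported on CUDA 12.6+",
    )
    def test_compile_after_package_like(self):
        # Placeholder body; the real test exercises check_package_cpp_only().
        self.assertTrue(True)


if __name__ == "__main__":
    run_tests()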