
Commit 1b655a8

etaf, daisyden, and jansel authored and committed
[xpu][test] Enable more UTs for Intel GPU. (pytorch#166047)
This PR enables additional Inductor unit tests for Intel GPU. Due to the increased number of test cases, the number of runners has been extended from 8 to 12 to prevent CI timeouts.

Pull Request resolved: pytorch#166047
Approved by: https://github.com/jansel
Co-authored-by: Deng, Daisy <[email protected]>
Co-authored-by: Jason Ansel <[email protected]>
1 parent cb69667 commit 1b655a8

19 files changed: +417 -392 lines changed

.github/workflows/xpu.yml

Lines changed: 12 additions & 8 deletions
@@ -59,14 +59,18 @@ jobs:
     runner: linux.c7i.12xlarge
     test-matrix: |
       { include: [
-        { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
+        { config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
       ]}
   secrets: inherit
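For context, each matrix entry's shard/num_shards pair tells a runner which slice of the test suite to execute, so going from 8 to 12 shards shrinks the per-runner workload. A minimal illustrative sketch of this kind of partitioning (a hypothetical helper, not PyTorch's actual run_test.py logic):

# Hypothetical sharding helper: round-robin split of a test list across num_shards runners.
def select_shard(tests, shard, num_shards):
    assert 1 <= shard <= num_shards
    return [t for i, t in enumerate(tests) if i % num_shards == shard - 1]

tests = [f"test_{i}" for i in range(96)]
print(len(select_shard(tests, shard=1, num_shards=8)))   # 12 tests per runner
print(len(select_shard(tests, shard=1, num_shards=12)))  # 8 tests per runner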

test/inductor/test_aot_inductor.py

Lines changed: 24 additions & 19 deletions
@@ -80,7 +80,12 @@
     TEST_WITH_ROCM,
 )
 from torch.testing._internal.custom_tensor import CustomTensorPlainOut
-from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
+from torch.testing._internal.inductor_utils import (
+    GPU_TYPE,
+    HAS_GPU,
+    HAS_XPU_AND_TRITON,
+    IS_BIG_GPU,
+)
 from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
 from torch.testing._internal.triton_utils import requires_gpu
 from torch.utils import _pytree as pytree
@@ -1544,7 +1549,9 @@ def forward(self, x, y):
         )
 
     # scaled_dot_product_flash_attention
-    @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
+    @unittest.skipIf(
+        not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
+    )
     def test_sdpa(self):
         class Model(torch.nn.Module):
             def __init__(self) -> None:
@@ -5575,8 +5582,8 @@ def forward(self, x, weight, bias, scale_a, scale_b):
         ).run(code)
 
     def test_aoti_debug_printing_model_inputs_codegen(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("requires CUDA")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("requires CUDA/XPU")
 
         class Model(torch.nn.Module):
             def __init__(self):
@@ -6368,8 +6375,8 @@ def runner_call(*args, **kwargs):
         runner.free_inactive_constant_buffer()
 
     def test_update_user_managed_buffer(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("requires CUDA")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("requires CUDA/XPU")
 
         class Model(torch.nn.Module):
             def __init__(self, n, k, device):
@@ -6413,10 +6420,10 @@ def runner_call(*args, **kwargs):
             "L__self___weight": torch.randn(N, K, device=self.device),
             "L__self___bias": torch.randn(N, device=self.device),
         }
-        mem_before, _ = torch.cuda.mem_get_info(self.device)
+        mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
         # Do not use user managed_buffer, should have less free memory.
         runner.update_constant_buffer(new_weights, True, False, False)
-        mem_after, _ = torch.cuda.mem_get_info(self.device)
+        mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
         self.assertGreater(mem_before, mem_after)
 
         runner.swap_constant_buffer()
@@ -6448,10 +6455,10 @@ def runner_call(*args, **kwargs):
             "L__self___weight": torch.randn(N, K, device=self.device),
             "L__self___bias": torch.randn(N, device=self.device),
         }
-        mem_before, _ = torch.cuda.mem_get_info(self.device)
+        mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
         # Try user managed_buffer, should have same free memory.
         runner.update_constant_buffer(new_weights, True, False, True)
-        mem_after, _ = torch.cuda.mem_get_info(self.device)
+        mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
         self.assertEqual(mem_before, mem_after, atol=1e-3, rtol=1e-3)
 
         runner.swap_constant_buffer()
@@ -6523,8 +6530,8 @@ def forward(self, predicate, x):
         "To enable after the C shim FC window ends",
     )
     def test_misaligned_input_1(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("CUDA test only")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("CUDA/XPU test only")
 
         class Model(torch.nn.Module):
             def forward(self, x):
@@ -6550,8 +6557,8 @@ def forward(self, x):
         torch.testing.assert_close(actual, expected)
 
     def test_misaligned_input_2(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("CUDA test only")
+        if self.device != GPU_TYPE:
+            raise unittest.SkipTest("GPU test only")
 
         class Model(torch.nn.Module):
             def forward(self, x):
@@ -7107,8 +7114,8 @@ def forward(self, x, y, z, x1, z1):
         self.check_model(Model(), example_inputs, dynamic_shapes=dynamic_shapes)
 
     def test_sym_expr_indexing(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("requires CUDA")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("requires CUDA/XPU")
 
         class Repro(torch.nn.Module):
             def __init__(self) -> None:
@@ -7126,7 +7133,7 @@ def forward(
             arange_1 = torch.ops.aten.arange.start(
                 180,
                 181,
-                device=torch.device(type="cuda", index=0),
+                device=torch.device(type=GPU_TYPE, index=0),
                 pin_memory=False,
             )
             add_14 = torch.ops.aten.add.Tensor(arange_1, 198)
@@ -7645,8 +7652,6 @@ def fail_gpu(suffixes: tuple[str, ...], is_skip=False):
     "test_quantized_linear_bias_none": fail_gpu(("cuda", "xpu")),
     # No scaled_dot_product_efficient_attention implementation for XPU yet.
     "test_scaled_dot_product_efficient_attention": fail_gpu(("xpu",)),
-    # No fft implementation for XPU yet.
-    "test_fft_c2c": fail_gpu(("xpu",), is_skip=True),
 }
 
 MPS_TEST_FAILURES = {
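The recurring getattr(torch, GPU_TYPE) rewrites above are what make these tests backend-agnostic: GPU_TYPE (from torch.testing._internal.inductor_utils) resolves to "cuda" or "xpu", and both torch.cuda and torch.xpu expose the per-device calls used here. A minimal sketch of the pattern, assuming an Intel GPU build where torch.xpu is available:

import torch

# Assumed resolution of GPU_TYPE; the real helper lives in torch.testing._internal.inductor_utils.
GPU_TYPE = "xpu" if torch.xpu.is_available() else "cuda"

device_mod = getattr(torch, GPU_TYPE)          # torch.xpu or torch.cuda
free_bytes, total_bytes = device_mod.mem_get_info(0)
x = torch.randn(4, 4, device=GPU_TYPE)         # same test body runs on either backend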

test/inductor/test_aot_inductor_custom_ops.py

Lines changed: 14 additions & 14 deletions
@@ -22,8 +22,8 @@
     IS_WINDOWS,
     skipIfXpu,
 )
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU_AND_TRITON
 from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
-from torch.testing._internal.triton_utils import HAS_CUDA_AND_TRITON
 from torch.utils._python_dispatch import TorchDispatchMode
 
 
@@ -492,9 +492,9 @@ def fail_cpu(is_skip=False):
     )
 
 
-def fail_cuda(is_skip=False):
+def fail_gpu(suffixes: tuple[str, ...], is_skip=False):
     return TestFailure(
-        ("cuda"),
+        suffixes,
         is_skip=is_skip,
     )
 
@@ -506,11 +506,11 @@ def fail_cuda(is_skip=False):
 }
 
 # test_failures, xfail by default, set is_skip=True to skip
-CUDA_TEST_FAILURES = {
+GPU_TEST_FAILURES = {
     # quantized unsupported for GPU
-    "test_quantized_linear": fail_cuda(),
-    "test_quanatized_int8_linear": fail_cuda(),
-    "test_quantized_linear_bias_none": fail_cuda(),
+    "test_quantized_linear": fail_gpu(("cuda", "xpu")),
+    "test_quanatized_int8_linear": fail_gpu(("cuda", "xpu")),
+    "test_quantized_linear_bias_none": fail_gpu(("cuda", "xpu")),
 }
 
 
@@ -533,9 +533,9 @@ class AOTInductorTestABICompatibleCpu(AOTICustomOpTestCase):
 
 
 @unittest.skipIf(sys.platform == "darwin", "No CUDA on MacOS")
-class AOTInductorTestABICompatibleCuda(AOTICustomOpTestCase):
-    device = "cuda"
-    device_type = "cuda"
+class AOTInductorTestABICompatibleGpu(AOTICustomOpTestCase):
+    device = GPU_TYPE
+    device_type = GPU_TYPE
     check_model = check_model
     check_model_with_multiple_inputs = check_model_with_multiple_inputs
     code_check_count = code_check_count
@@ -545,14 +545,14 @@ class AOTInductorTestABICompatibleCuda(AOTICustomOpTestCase):
 
 copy_tests(
     AOTInductorTestsTemplate,
-    AOTInductorTestABICompatibleCuda,
-    "cuda",
-    CUDA_TEST_FAILURES,
+    AOTInductorTestABICompatibleGpu,
+    GPU_TYPE,
+    GPU_TEST_FAILURES,
 )
 
 if __name__ == "__main__":
     from torch._inductor.test_case import run_tests
 
     # cpp_extension N/A in fbcode
-    if HAS_CUDA_AND_TRITON or sys.platform == "darwin":
+    if HAS_GPU_AND_TRITON or sys.platform == "darwin":
         run_tests(needs="filelock")
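Here, copy_tests stamps every test from the shared AOTInductorTestsTemplate onto the device-specific class, and the fail_gpu(...) entries mark tests as expected failures (or skips) for the listed device suffixes. A rough, self-contained sketch of that pattern with plain unittest (simplified stand-in names, not the actual helpers in torch.testing._internal):

import unittest

class TestFailure:
    # Simplified stand-in for the TestFailure used by the inductor test suite.
    def __init__(self, suffixes, is_skip=False):
        self.suffixes = suffixes
        self.is_skip = is_skip

def copy_tests(template, target, suffix, failures=None):
    # Copy each test_* method onto the target class, applying per-device xfail/skip.
    failures = failures or {}
    for name in dir(template):
        if not name.startswith("test_"):
            continue
        fn = getattr(template, name)
        failure = failures.get(name)
        if failure and suffix in failure.suffixes:
            fn = unittest.skip("known failure")(fn) if failure.is_skip else unittest.expectedFailure(fn)
        setattr(target, f"{name}_{suffix}", fn)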

test/inductor/test_aot_inductor_package.py

Lines changed: 3 additions & 8 deletions
@@ -28,12 +28,7 @@
     load_weights_to_pt2_contents,
 )
 from torch.testing._internal.common_cuda import _get_torch_cuda_version
-from torch.testing._internal.common_utils import (
-    IS_FBCODE,
-    skipIfRocm,
-    skipIfXpu,
-    TEST_CUDA,
-)
+from torch.testing._internal.common_utils import IS_FBCODE, skipIfRocm, skipIfXpu
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
 
 
@@ -688,13 +683,13 @@ def forward(self, x):
         self.assertEqual(loaded1(*example_inputs1), ep1.module()(*example_inputs1))
         self.assertEqual(loaded2(*example_inputs2), ep2.module()(*example_inputs2))
 
-    @unittest.skipIf(not TEST_CUDA, "requires cuda")
+    @unittest.skipIf(not HAS_GPU, "requires gpu")
     def test_duplicate_calls(self):
         options = {
             "aot_inductor.package": True,
         }
 
-        device = "cuda"
+        device = GPU_TYPE
 
         class Model1(torch.nn.Module):
             def __init__(self) -> None:

test/inductor/test_benchmark_fusion.py

Lines changed: 31 additions & 27 deletions
@@ -12,8 +12,9 @@
 from torch.testing._internal.common_utils import slowTest
 from torch.testing._internal.inductor_utils import (
     get_func_call,
+    GPU_TYPE,
     HAS_CPU,
-    HAS_CUDA_AND_TRITON,
+    HAS_GPU_AND_TRITON,
     IS_BIG_GPU,
 )
 
@@ -27,7 +28,7 @@
 
     from inductor.test_torchinductor import (  # @manual=fbcode//caffe2/test/inductor:test_inductor-library
         check_model,
-        check_model_cuda,
+        check_model_gpu,
         copy_tests,
         skip_if_cpp_wrapper,
     )
@@ -140,8 +141,8 @@ def f(a, b):
     )
     @config.patch(max_autotune_gemm_backends="TRITON")
     def test_avoid_register_spilling(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("CUDA only")
+        if self.device != GPU_TYPE:
+            raise unittest.SkipTest("GPU only")
 
         from torch.nn.functional import gelu
 
@@ -156,8 +157,8 @@ def foo(m, inp):
 
             return curr
 
-        m = torch.nn.Linear(2048, 2048, bias=True).half().cuda()
-        inp = torch.rand([2048, 2048]).half().cuda()
+        m = torch.nn.Linear(2048, 2048, bias=True).half().to(GPU_TYPE)
+        inp = torch.rand([2048, 2048]).half().to(GPU_TYPE)
 
         with torch.no_grad():
             foo_c = torch.compile(mode="max-autotune-no-cudagraphs")(foo)
@@ -185,7 +186,7 @@ def foo(m, inp):
 
         for c in out_code[0], out_code2[0]:
             FileCheck().check("async_compile.wait").check("DeviceGuard").check_count(
-                "empty_strided_cuda", 1, exactly=True
+                f"empty_strided_{GPU_TYPE}", 1, exactly=True
             ).check_regex("buf[0-9]* = buf[0-9]*; del buf[0-9]*").check("return").run(c)
 
     def test_tield_kernel_fusion(self):
@@ -197,47 +198,50 @@ def f(x):
         self.common(f, (x,))
 
 
-if HAS_CUDA_AND_TRITON:
+if HAS_GPU_AND_TRITON:
 
-    class BenchmarkFusionCudaTest(TestCase):
-        common = check_model_cuda
-        device = "cuda"
+    class BenchmarkFusionGpuTest(TestCase):
+        common = check_model_gpu
+        device = GPU_TYPE
 
-    copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionCudaTest, "cuda")
+    copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionGpuTest, GPU_TYPE)
 
     class BenchmarkingTest(TestCase):
         @unittest.skipIf(
-            torch.cuda.device_count() < 2, "The test need at least 2 devices"
+            getattr(torch, GPU_TYPE).device_count() < 2,
+            "The test need at least 2 devices",
         )
         @skip_if_cpp_wrapper("This tests triton scheduling directly")
         def test_benchmark_on_non_zero_device(self):
             hit_count = 0
-            with torch.cuda.device("cuda:0"):
+            with getattr(torch, GPU_TYPE).device(f"{GPU_TYPE}:0"):
 
                 @torch.compile
                 def relu(x):
                     return realize(x.relu()) + x
 
-                x = torch.randn(int(16e6), device="cuda:1")
+                x = torch.randn(int(16e6), device=f"{GPU_TYPE}:1")
 
-                orig_benchmark_fused_nodes = TritonScheduling.benchmark_fused_nodes
+                orig_benchmark_codegened_module = (
+                    TritonScheduling.benchmark_codegened_module
+                )
 
-                def mock_benchmark_fused_nodes(*args, **kwargs):
+                def benchmark_codegened_module(*args, **kwargs):
                     nonlocal hit_count
                     hit_count += 1
-                    ms, path = orig_benchmark_fused_nodes(*args, **kwargs)
+                    ms, path = orig_benchmark_codegened_module(*args, **kwargs)
                     self.assertTrue(ms > 0)
                     return ms, path
 
                 with unittest.mock.patch.object(
                     TritonScheduling,
-                    "benchmark_fused_nodes",
-                    mock_benchmark_fused_nodes,
+                    "benchmark_codegened_module",
+                    benchmark_codegened_module,
                 ):
                     relu(x)
                 self.assertTrue(hit_count > 0)
 
-    class BenchmarkMultiTemplateFusionCudaTest(InductorTestCase):
+    class BenchmarkMultiTemplateFusionGpuTest(InductorTestCase):
         @classmethod
         def setUpClass(cls):
             super().setUpClass()
@@ -272,8 +276,8 @@ def foo(m, inp):
         foo_c = torch.compile(mode="max-autotune-no-cudagraphs")(foo)
         first_dim = first_dim if first_dim is not None else size
 
-        m = torch.nn.Linear(size, size, bias=True).half().cuda()
-        inp = torch.rand([first_dim, size]).half().cuda()
+        m = torch.nn.Linear(size, size, bias=True).half().to(GPU_TYPE)
+        inp = torch.rand([first_dim, size]).half().to(GPU_TYPE)
 
         with torch.no_grad():
             res, code = run_and_get_code(foo_c, m, inp)
@@ -324,9 +328,9 @@ def fn(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor):
         )
 
         args = [
-            torch.randn(4, 4, device="cuda"),
-            torch.randn(4, 4, device="cuda"),
-            torch.randn(4, 4, device="cuda"),
+            torch.randn(4, 4, device=GPU_TYPE),
+            torch.randn(4, 4, device=GPU_TYPE),
+            torch.randn(4, 4, device=GPU_TYPE),
         ]
 
         expected = fn(*args)
@@ -347,5 +351,5 @@ class BenchmarkFusionCpuTest(TestCase):
 if __name__ == "__main__":
     from torch._inductor.test_case import run_tests
 
-    if HAS_CPU or HAS_CUDA_AND_TRITON:
+    if HAS_CPU or HAS_GPU_AND_TRITON:
         run_tests()
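The test_benchmark_on_non_zero_device change above swaps the old benchmark_fused_nodes hook for benchmark_codegened_module, but the mocking idiom is unchanged: wrap the original method with a counter via unittest.mock.patch.object and assert it was hit. A generic sketch of that idiom (toy Scheduler class, not the inductor TritonScheduling API):

import unittest.mock

class Scheduler:
    # Toy stand-in for the scheduling class whose benchmarking hook gets patched.
    def benchmark(self, module):
        return 1.23, "/tmp/module.py"

hit_count = 0
orig_benchmark = Scheduler.benchmark

def counting_benchmark(*args, **kwargs):
    # Count calls, then delegate to the original method.
    global hit_count
    hit_count += 1
    ms, path = orig_benchmark(*args, **kwargs)
    assert ms > 0
    return ms, path

with unittest.mock.patch.object(Scheduler, "benchmark", counting_benchmark):
    Scheduler().benchmark(None)   # goes through the counting wrapper

assert hit_count > 0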
