
Commit d0e7d2e

jianyizh authored and pytorchmergebot committed
[xpu][feature][inductor] Enable pad_mm Pass on Intel GPU (pytorch#166618)
Pull Request resolved: pytorch#166618
Approved by: https://github.com/EikanWang, https://github.com/desertfire, https://github.com/jansel
1 parent 5605fce commit d0e7d2e
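At a high level, the pad_mm pass no longer assumes CUDA: its device gate now also accepts pairs of XPU tensors, and its benchmarking setup can fall back to the xpu device. A rough standalone sketch of that gate, mirroring the torch/_inductor/fx_passes/pad_mm.py hunks further down (the _pick_benchmark_device helper is illustrative, not part of the diff):

import torch
from torch import Tensor


def check_device(a: Tensor, b: Tensor) -> bool:
    # Padding is only considered when both operands live on the same
    # supported GPU backend: CUDA as before, and now Intel XPU as well.
    return (a.is_cuda and b.is_cuda) or (a.is_xpu and b.is_xpu)


def _pick_benchmark_device() -> str:
    # Illustrative helper mirroring the device selection in _pad_mm_init below.
    if torch.cuda.is_available():
        return "cuda"
    elif torch.xpu.is_available():
        return "xpu"
    return "cpu"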

File tree

test/inductor/test_pad_mm.py
test/inductor/test_torchinductor.py
torch/_inductor/fx_passes/pad_mm.py

3 files changed: +71 -68 lines changed


test/inductor/test_pad_mm.py

Lines changed: 62 additions & 61 deletions
@@ -16,7 +16,7 @@
 from torch._inductor.utils import fresh_cache, is_big_gpu, run_and_get_code
 from torch.testing import FileCheck
 from torch.testing._internal.common_utils import skipIfRocm
-from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU_AND_TRITON


 class PadMMTest(TestCase):
@@ -38,15 +38,15 @@ class Model(torch.nn.Module):
             def __init__(self) -> None:
                 super().__init__()
                 self.w = rand_strided(
-                    (K2, N), (1, K2), device="cuda", dtype=torch.float32
+                    (K2, N), (1, K2), device=GPU_TYPE, dtype=torch.float32
                 )

             def forward(self, a):
                 a1 = torch.narrow(a, 1, 0, K2)
                 return torch.mm(a1, self.w)

-        fn = Model().cuda()
-        a = rand_strided((M, K1), (K1, 1), device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = rand_strided((M, K1), (K1, 1), device=GPU_TYPE, dtype=torch.float32)
         aligned_k = get_padded_length(K2, get_alignment_size(a)) + K2
         torch._dynamo.mark_dynamic(a, 0)
         with unittest.mock.patch(
@@ -72,17 +72,17 @@ class Model(torch.nn.Module):
             def __init__(self) -> None:
                 super().__init__()
                 self.w = rand_strided(
-                    (K2, N), (1, K2), device="cuda", dtype=torch.float32
+                    (K2, N), (1, K2), device=GPU_TYPE, dtype=torch.float32
                 )

             def forward(self, a, b):
                 c = torch.cat([a, b], dim=0)
                 a1 = torch.narrow(c, 1, 0, K2)
                 return torch.mm(a1, self.w)

-        fn = Model().cuda()
-        a = rand_strided((M1, K1), (K1, 1), device="cuda", dtype=torch.float32)
-        b = rand_strided((M2, K1), (K1, 1), device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = rand_strided((M1, K1), (K1, 1), device=GPU_TYPE, dtype=torch.float32)
+        b = rand_strided((M2, K1), (K1, 1), device=GPU_TYPE, dtype=torch.float32)
         torch._dynamo.mark_dynamic(a, 0)
         torch._dynamo.mark_dynamic(b, 0)
         aligned_k = get_padded_length(K2, get_alignment_size(a)) + K2
@@ -110,9 +110,9 @@ def __init__(self) -> None:
             def forward(self, a, b):
                 return torch.mm(a, b)

-        fn = Model().cuda()
-        a = rand_strided((M, K), (K, 1), device="cuda", dtype=torch.float32)
-        b = rand_strided((K, N), (1, K), device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = rand_strided((M, K), (K, 1), device=GPU_TYPE, dtype=torch.float32)
+        b = rand_strided((K, N), (1, K), device=GPU_TYPE, dtype=torch.float32)
         aligned_k = get_padded_length(K, get_alignment_size(a)) + K
         torch._dynamo.mark_dynamic(b, 1)
         with unittest.mock.patch(
@@ -139,9 +139,9 @@ def __init__(self) -> None:
             def forward(self, a, b):
                 return torch.mm(a, b)

-        fn = Model().cuda()
-        a = rand_strided((M, K), (K, 1), device="cuda", dtype=torch.float32)
-        b = rand_strided((K, N), (1, K), device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = rand_strided((M, K), (K, 1), device=GPU_TYPE, dtype=torch.float32)
+        b = rand_strided((K, N), (1, K), device=GPU_TYPE, dtype=torch.float32)
         # TODO: Getting the alignment right requires pattern matcher to
         # run on newly added nodes
         aligned_m = get_padded_length(M, get_alignment_size(a)) + M
@@ -168,9 +168,9 @@ def __init__(self) -> None:
             def forward(self, a, b):
                 return torch.mm(a, b)

-        fn = Model().cuda()
-        a = rand_strided((M, K), (K, 1), device="cuda", dtype=torch.float32)
-        b = rand_strided((K, N), (1, K), device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = rand_strided((M, K), (K, 1), device=GPU_TYPE, dtype=torch.float32)
+        b = rand_strided((K, N), (1, K), device=GPU_TYPE, dtype=torch.float32)
         torch._dynamo.mark_dynamic(a, 0)
         torch._dynamo.mark_dynamic(a, 1)
         torch._dynamo.mark_dynamic(b, 0)
@@ -188,9 +188,9 @@ def test_zero_dim(self):
         def addmm(x, a, b):
             return torch.addmm(x, a, b)

-        x = torch.randn(100).cuda()
-        a = torch.randn(0, 10).cuda()
-        b = torch.randn(10, 100).cuda()
+        x = torch.randn(100).to(GPU_TYPE)
+        a = torch.randn(0, 10).to(GPU_TYPE)
+        b = torch.randn(10, 100).to(GPU_TYPE)
         self.assertEqual(torch.compile(addmm)(x, a, b), addmm(x, a, b))

     @inductor_config.patch(
@@ -209,9 +209,9 @@ def __init__(self) -> None:
             def forward(self, a, b):
                 return torch.bmm(a, b)

-        fn = Model().cuda()
-        a = torch.randn(B, M, K, device="cuda", dtype=torch.float32)
-        b = torch.randn(B, K, N, device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = torch.randn(B, M, K, device=GPU_TYPE, dtype=torch.float32)
+        b = torch.randn(B, K, N, device=GPU_TYPE, dtype=torch.float32)
         aligned_k = get_padded_length(K, get_alignment_size(a)) + K
         torch._dynamo.mark_dynamic(a, 0)
         torch._dynamo.mark_dynamic(b, 0)
@@ -240,9 +240,9 @@ def __init__(self) -> None:
             def forward(self, a, b):
                 return torch.bmm(a, b)

-        fn = Model().cuda()
-        a = torch.randn(B, M, K, device="cuda", dtype=torch.float32)
-        b = torch.randn(B, K, N, device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = torch.randn(B, M, K, device=GPU_TYPE, dtype=torch.float32)
+        b = torch.randn(B, K, N, device=GPU_TYPE, dtype=torch.float32)
         aligned_n = get_padded_length(N, get_alignment_size(b)) + N
         torch._dynamo.mark_dynamic(a, 2)
         torch._dynamo.mark_dynamic(b, 1)
@@ -271,9 +271,9 @@ def __init__(self) -> None:
             def forward(self, a, b):
                 return torch.bmm(a, b)

-        fn = Model().cuda()
-        a = torch.randn(B, M, K, device="cuda", dtype=torch.float32)
-        b = torch.randn(B, K, N, device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = torch.randn(B, M, K, device=GPU_TYPE, dtype=torch.float32)
+        b = torch.randn(B, K, N, device=GPU_TYPE, dtype=torch.float32)
         aligned_n = get_padded_length(N, get_alignment_size(b)) + N
         torch._dynamo.mark_dynamic(a, 0)
         torch._dynamo.mark_dynamic(a, 1)
@@ -302,10 +302,10 @@ def __init__(self) -> None:
             def forward(self, a, b, c):
                 return torch.addmm(a, b, c)

-        fn = Model().cuda()
-        a = torch.randn(M, N, device="cuda", dtype=torch.float32)
-        b = torch.randn(M, K, device="cuda", dtype=torch.float32)
-        c = torch.randn(K, N, device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = torch.randn(M, N, device=GPU_TYPE, dtype=torch.float32)
+        b = torch.randn(M, K, device=GPU_TYPE, dtype=torch.float32)
+        c = torch.randn(K, N, device=GPU_TYPE, dtype=torch.float32)
         aligned_k = get_padded_length(K, get_alignment_size(b)) + K
         torch._dynamo.mark_dynamic(a, 0)
         torch._dynamo.mark_dynamic(b, 0)
@@ -333,10 +333,10 @@ def __init__(self) -> None:
             def forward(self, a, b, c):
                 return torch.addmm(a, b, c)

-        fn = Model().cuda()
-        a = torch.randn(M, N, device="cuda", dtype=torch.float32)
-        b = torch.randn(M, K, device="cuda", dtype=torch.float32)
-        c = torch.randn(K, N, device="cuda", dtype=torch.float32)
+        fn = Model().to(GPU_TYPE)
+        a = torch.randn(M, N, device=GPU_TYPE, dtype=torch.float32)
+        b = torch.randn(M, K, device=GPU_TYPE, dtype=torch.float32)
+        c = torch.randn(K, N, device=GPU_TYPE, dtype=torch.float32)
         torch._dynamo.mark_dynamic(a, 0)
         torch._dynamo.mark_dynamic(a, 1)
         torch._dynamo.mark_dynamic(b, 0)
@@ -357,7 +357,7 @@ def test_pad_single_cat(self):
         def foo(x, y):
             return x @ y

-        inps = [torch.rand([5, 5], device="cuda") for _ in range(2)]
+        inps = [torch.rand([5, 5], device=GPU_TYPE) for _ in range(2)]
         out = foo(*inps)
         self.assertEqual(out, inps[0] @ inps[1])

@@ -371,19 +371,19 @@ def foo(input, x, y):
         for a in [1, 4]:
             for b in [1, 6]:
                 inps = (
-                    torch.rand([a, b], device="cuda"),
-                    torch.rand([4, 5], device="cuda"),
-                    torch.rand([5, 6], device="cuda"),
+                    torch.rand([a, b], device=GPU_TYPE),
+                    torch.rand([4, 5], device=GPU_TYPE),
+                    torch.rand([5, 6], device=GPU_TYPE),
                 )
                 out = foo(*inps)
                 out_eager = torch.ops.aten.addmm(*inps)
                 self.assertEqual(out, out_eager)

         for a in [1, 6]:
             inps = (
-                torch.rand([a], device="cuda"),
-                torch.rand([4, 5], device="cuda"),
-                torch.rand([5, 6], device="cuda"),
+                torch.rand([a], device=GPU_TYPE),
+                torch.rand([4, 5], device=GPU_TYPE),
+                torch.rand([5, 6], device=GPU_TYPE),
             )
             out = foo(*inps)
             out_eager = torch.ops.aten.addmm(*inps)
@@ -395,8 +395,8 @@ def test_pad_batch(self):
         n = 9
         k = 11
         batch_size = 3
-        mat1 = torch.ones((batch_size, m, k), device="cuda", dtype=torch.float16)
-        mat2 = torch.ones((batch_size, k, n), device="cuda", dtype=torch.float16)
+        mat1 = torch.ones((batch_size, m, k), device=GPU_TYPE, dtype=torch.float16)
+        mat2 = torch.ones((batch_size, k, n), device=GPU_TYPE, dtype=torch.float16)
         expected_alignment = get_alignment_size(mat1)

         assert expected_alignment == 8, "Alignment for float16 should be 8"
@@ -413,7 +413,7 @@ def bmm(mat1, mat2):
         # in call code, expect to see a single pad per input, and then we should see padded allocation for output
         FileCheck().check("del async_compile").check_count(
             ".run(", 2, exactly=True
-        ).check("empty_strided_cuda((3, 8, 16)").run(code)
+        ).check(f"empty_strided_{GPU_TYPE}((3, 8, 16)").run(code)

         assert torch.allclose(res2, bmm_expected_result), (
             "BMM results are not identical"
@@ -425,7 +425,7 @@ def test_exclude_padding(self):
         def mm(a, b):
             return a @ b

-        mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda"))
+        mm(torch.rand([25, 25], device=GPU_TYPE), torch.rand([25, 25], device=GPU_TYPE))
         local_cache = get_pad_cache().get_local_cache()
         self.assertTrue(len(local_cache) == 2)
         FileCheck().check_count("exclude_pad:False", 2, exactly=True).run(
@@ -436,7 +436,7 @@ def mm(a, b):
         def mm(a, b):
             return (a + 1) @ b

-        mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda"))
+        mm(torch.rand([25, 25], device=GPU_TYPE), torch.rand([25, 25], device=GPU_TYPE))
         local_cache = get_pad_cache().get_local_cache()
         # reuse original base timing
         self.assertTrue(len(local_cache) == 3)
@@ -455,8 +455,8 @@ def test_exclude_cat_padding(self):
         def mm(inps, b):
             return torch.cat(inps) @ b

-        inp = torch.rand([2046, 2046], device="cuda")
-        inp2 = torch.rand([2046, 2046], device="cuda")
+        inp = torch.rand([2046, 2046], device=GPU_TYPE)
+        inp2 = torch.rand([2046, 2046], device=GPU_TYPE)

         inps = inp.chunk(3)
         mm(inps, inp2)
@@ -471,7 +471,8 @@ def mm(inps, b):
         )

     @unittest.skipIf(
-        not torch.cuda.is_available() or torch.cuda.get_device_capability() >= (9, 0),
+        (not torch.cuda.is_available() or torch.cuda.get_device_capability() >= (9, 0))
+        and (not torch.xpu.is_available()),
         "No perf regression on H100+ with BF16",
     )
     @skipIfRocm
@@ -483,8 +484,8 @@ def test_pad_mm_bf16(self):
         m = 2
         n = 13
         k = 15691904
-        mat1 = torch.ones((m, k), device="cuda", dtype=torch.bfloat16)
-        mat2 = torch.ones((k, n), device="cuda", dtype=torch.bfloat16)
+        mat1 = torch.ones((m, k), device=GPU_TYPE, dtype=torch.bfloat16)
+        mat2 = torch.ones((k, n), device=GPU_TYPE, dtype=torch.bfloat16)
         expected_alignment = get_alignment_size(mat1)

         assert expected_alignment == 8, "Alignment for bfloat16 should be 8"
@@ -504,7 +505,7 @@ def mm(mat1, mat2):
         # in call code, expect to see a single pad per input, and then we should see padded allocation for output
         FileCheck().check("del async_compile").check_count(
             ".run(", 2, exactly=True
-        ).check("empty_strided_cuda((8, 16)").run(code)
+        ).check(f"empty_strided_{GPU_TYPE}((8, 16)").run(code)

         assert torch.allclose(res2, mm_expected_result), "MM results are not identical"

@@ -521,8 +522,8 @@ def fn(x, y):
             return x @ y

         args = [
-            torch.randn(2**4, 2**8 - 1, device="cuda", dtype=torch.float16),
-            torch.randn(2**8 - 1, 2**4, device="cuda", dtype=torch.float16),
+            torch.randn(2**4, 2**8 - 1, device=GPU_TYPE, dtype=torch.float16),
+            torch.randn(2**8 - 1, 2**4, device=GPU_TYPE, dtype=torch.float16),
         ]

         counters.clear()
@@ -615,7 +616,7 @@ def test_masked_mha(B, H, S, D, device, dtype):
             ):
                 mha = torch.compile(mha, fullgraph=True, backend="inductor")
                 with torch.autocast(
-                    device_type="cuda", dtype=dtype, cache_enabled=False
+                    device_type=GPU_TYPE, dtype=dtype, cache_enabled=False
                 ):
                     out_vid = mha(x1, x2, attn_mask)
                     target_vid = torch.randn_like(out_vid)
@@ -624,7 +625,7 @@ def test_masked_mha(B, H, S, D, device, dtype):
                     loss = loss_vid
                     loss.backward()

-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()

             # Check if any bmm operations had dtype changes
             for node_name_pre, node_name_post in zip(
@@ -642,13 +643,13 @@ def test_masked_mha(B, H, S, D, device, dtype):
             self.assertFalse(torch.any(x2.grad.isnan()).item())

         B, H, S, D = 2, 32, 549, 128
-        device = "cuda"
+        device = GPU_TYPE
         dtype = torch.bfloat16
         torch.compiler.reset()
         torch.manual_seed(42)
         test_masked_mha(B, H, S, D, device, dtype)


 if __name__ == "__main__":
-    if HAS_CUDA_AND_TRITON:
+    if HAS_GPU_AND_TRITON:
         run_tests()
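The test churn above is one mechanical pattern: hard-coded "cuda" strings and .cuda() calls become GPU_TYPE and .to(GPU_TYPE) from torch.testing._internal.inductor_utils, so the same test bodies run on CUDA or Intel XPU. A minimal sketch of the pattern, assuming a Triton-capable GPU build; the toy check below is illustrative, not one of the tests:

import torch
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU_AND_TRITON


def mm(a, b):
    return a @ b


if HAS_GPU_AND_TRITON:
    # GPU_TYPE resolves to "cuda" or "xpu", whichever backend is available.
    a = torch.randn(2, 127, device=GPU_TYPE, dtype=torch.float32)
    b = torch.randn(127, 13, device=GPU_TYPE, dtype=torch.float32)
    out = torch.compile(mm)(a, b)
    # Mirror the tests' compiled-vs-eager comparison.
    torch.testing.assert_close(out, mm(a, b))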

test/inductor/test_torchinductor.py

Lines changed: 0 additions & 2 deletions
@@ -15893,8 +15893,6 @@ def wrapper(inp, weight):
         _, code = run_and_get_code(wrapper, inp, weight)
         self.assertTrue("in_out_ptr" in code[1])

-    # TODO: Enable this case after pad_mm is enabled on XPU.
-    @expectedFailureXPU
     @torch._functorch.config.patch("donated_buffer", True)
     @torch._inductor.config.patch("force_shape_pad", True)
     def test_donated_buffer_inplace_gpt(self):

torch/_inductor/fx_passes/pad_mm.py

Lines changed: 9 additions & 5 deletions
@@ -76,7 +76,7 @@ def get_alignment_size_dtype(dtype: torch.dtype) -> int:


 def check_device(a: Tensor, b: Tensor) -> bool:
-    return a.is_cuda and b.is_cuda
+    return (a.is_cuda and b.is_cuda) or (a.is_xpu and b.is_xpu)


 def check_dtype(a: Tensor, b: Tensor) -> bool:
@@ -225,7 +225,7 @@ def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool:
         dtype is torch.bfloat16
         and K > M
         and K > N
-        and torch.cuda.get_device_capability() < (9, 0)
+        and (torch.xpu.is_available() or torch.cuda.get_device_capability() < (9, 0))
     ):  # doesn't repro on h100s:
         return True

@@ -280,7 +280,9 @@ def tensor_key(t: Tensor) -> tuple[torch.Size, tuple[int, ...], torch.dtype]:
         return (t.shape, t.stride(), t.dtype)

     tf32_key = (
-        None if mat1.dtype != torch.float32 else torch.backends.cuda.matmul.allow_tf32
+        None
+        if mat1.dtype != torch.float32
+        else torch.backends.cuda.matmul.allow_tf32 or torch.backends.mkldnn.allow_tf32
     )

     def fmt_pad(name: str) -> str | None:
@@ -381,7 +383,7 @@ def should_pad_mm_bf16(dtype: torch.dtype, M: int, N: int, K: int) -> bool:
         and K > N
         and N % 2 == 1
         and K >= large_k_threshold_to_pad
-        and torch.cuda.get_device_capability() < (9, 0)
+        and (torch.xpu.is_available() or torch.cuda.get_device_capability() < (9, 0))
     ):  # doesn't repro on h100s:
         return True
     return False
@@ -549,7 +551,7 @@ def write_pad():

     if op is torch.ops.aten.addmm:
         input_pad = None
-        if input is not None and input.is_cuda:
+        if input is not None and (input.is_cuda or input.is_xpu):
             input_pad = torch.randn_like(input)
         fns.append(
             lambda: pad_addmm(
@@ -870,6 +872,8 @@ def _pad_mm_init() -> None:
     if torch.cuda.is_available():
         # workaround https://github.com/pytorch/pytorch/issues/97894
        device = "cuda"
+    elif torch.xpu.is_available():
+        device = "xpu"
     else:
         device = "cpu"

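With check_device and the bf16 heuristics relaxed, the pass can be exercised on Intel GPU much as the tests above do on CUDA. A hedged end-to-end sketch, assuming a GPU-enabled PyTorch build; force_shape_pad and shape_padding are the knobs the tests patch, and whether padding actually fires still depends on the pass's heuristics:

import torch
import torch._inductor.config as inductor_config


def mm(a, b):
    return a @ b


# Assumes a GPU build; prefer XPU when present, otherwise fall back to CUDA.
device = "xpu" if torch.xpu.is_available() else "cuda"

with inductor_config.patch(force_shape_pad=True, shape_padding=True):
    # An odd inner dimension (127) is misaligned for fp16, so pad_mm is
    # expected to pad the matmul inputs when its heuristics allow it.
    a = torch.randn(8, 127, device=device, dtype=torch.float16)
    b = torch.randn(127, 8, device=device, dtype=torch.float16)
    out = torch.compile(mm)(a, b)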
