
Commit 1b655a8

etaf, daisyden, and jansel authored and committed
[xpu][test] Enable more UTs for Intel GPU. (pytorch#166047)
This PR enables additional Inductor unit tests for Intel GPU. Due to the increased number of test cases, the number of runners has been extended from 8 to 12 to prevent CI timeouts.

Pull Request resolved: pytorch#166047
Approved by: https://github.com/jansel
Co-authored-by: Deng, Daisy <[email protected]>
Co-authored-by: Jason Ansel <[email protected]>
1 parent cb69667 commit 1b655a8

19 files changed: +417 -392 lines changed

.github/workflows/xpu.yml

Lines changed: 12 additions & 8 deletions
@@ -59,14 +59,18 @@ jobs:
     runner: linux.c7i.12xlarge
     test-matrix: |
       { include: [
-        { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
-        { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
+        { config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
+        { config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
       ]}
   secrets: inherit
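For context, each matrix entry's shard/num_shards pair tells a runner which slice of the test suite to execute, so going from 8 to 12 shards shrinks the per-runner workload. A minimal illustrative sketch of this kind of partitioning (a hypothetical helper, not PyTorch's actual run_test.py logic):

# Hypothetical sharding helper: round-robin split of a test list across num_shards runners.
def select_shard(tests, shard, num_shards):
    assert 1 <= shard <= num_shards
    return [t for i, t in enumerate(tests) if i % num_shards == shard - 1]

tests = [f"test_{i}" for i in range(96)]
print(len(select_shard(tests, shard=1, num_shards=8)))   # 12 tests per runner
print(len(select_shard(tests, shard=1, num_shards=12)))  # 8 tests per runner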

test/inductor/test_aot_inductor.py

Lines changed: 24 additions & 19 deletions
@@ -80,7 +80,12 @@
     TEST_WITH_ROCM,
 )
 from torch.testing._internal.custom_tensor import CustomTensorPlainOut
-from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
+from torch.testing._internal.inductor_utils import (
+    GPU_TYPE,
+    HAS_GPU,
+    HAS_XPU_AND_TRITON,
+    IS_BIG_GPU,
+)
 from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
 from torch.testing._internal.triton_utils import requires_gpu
 from torch.utils import _pytree as pytree
@@ -1544,7 +1549,9 @@ def forward(self, x, y):
         )
 
     # scaled_dot_product_flash_attention
-    @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
+    @unittest.skipIf(
+        not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
+    )
     def test_sdpa(self):
         class Model(torch.nn.Module):
             def __init__(self) -> None:
@@ -5575,8 +5582,8 @@ def forward(self, x, weight, bias, scale_a, scale_b):
         ).run(code)
 
     def test_aoti_debug_printing_model_inputs_codegen(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("requires CUDA")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("requires CUDA/XPU")
 
         class Model(torch.nn.Module):
             def __init__(self):
@@ -6368,8 +6375,8 @@ def runner_call(*args, **kwargs):
         runner.free_inactive_constant_buffer()
 
     def test_update_user_managed_buffer(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("requires CUDA")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("requires CUDA/XPU")
 
         class Model(torch.nn.Module):
             def __init__(self, n, k, device):
@@ -6413,10 +6420,10 @@ def runner_call(*args, **kwargs):
             "L__self___weight": torch.randn(N, K, device=self.device),
             "L__self___bias": torch.randn(N, device=self.device),
         }
-        mem_before, _ = torch.cuda.mem_get_info(self.device)
+        mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
         # Do not use user managed_buffer, should have less free memory.
         runner.update_constant_buffer(new_weights, True, False, False)
-        mem_after, _ = torch.cuda.mem_get_info(self.device)
+        mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
         self.assertGreater(mem_before, mem_after)
 
         runner.swap_constant_buffer()
@@ -6448,10 +6455,10 @@ def runner_call(*args, **kwargs):
             "L__self___weight": torch.randn(N, K, device=self.device),
             "L__self___bias": torch.randn(N, device=self.device),
         }
-        mem_before, _ = torch.cuda.mem_get_info(self.device)
+        mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
         # Try user managed_buffer, should have same free memory.
         runner.update_constant_buffer(new_weights, True, False, True)
-        mem_after, _ = torch.cuda.mem_get_info(self.device)
+        mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
         self.assertEqual(mem_before, mem_after, atol=1e-3, rtol=1e-3)
 
         runner.swap_constant_buffer()
@@ -6523,8 +6530,8 @@ def forward(self, predicate, x):
         "To enable after the C shim FC window ends",
     )
     def test_misaligned_input_1(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("CUDA test only")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("CUDA/XPU test only")
 
         class Model(torch.nn.Module):
             def forward(self, x):
@@ -6550,8 +6557,8 @@ def forward(self, x):
         torch.testing.assert_close(actual, expected)
 
     def test_misaligned_input_2(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("CUDA test only")
+        if self.device != GPU_TYPE:
+            raise unittest.SkipTest("GPU test only")
 
         class Model(torch.nn.Module):
             def forward(self, x):
@@ -7107,8 +7114,8 @@ def forward(self, x, y, z, x1, z1):
         self.check_model(Model(), example_inputs, dynamic_shapes=dynamic_shapes)
 
     def test_sym_expr_indexing(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("requires CUDA")
+        if self.device not in ["cuda", "xpu"]:
+            raise unittest.SkipTest("requires CUDA/XPU")
 
         class Repro(torch.nn.Module):
             def __init__(self) -> None:
@@ -7126,7 +7133,7 @@ def forward(
             arange_1 = torch.ops.aten.arange.start(
                 180,
                 181,
-                device=torch.device(type="cuda", index=0),
+                device=torch.device(type=GPU_TYPE, index=0),
                 pin_memory=False,
             )
             add_14 = torch.ops.aten.add.Tensor(arange_1, 198)
@@ -7645,8 +7652,6 @@ def fail_gpu(suffixes: tuple[str, ...], is_skip=False):
     "test_quantized_linear_bias_none": fail_gpu(("cuda", "xpu")),
     # No scaled_dot_product_efficient_attention implementation for XPU yet.
     "test_scaled_dot_product_efficient_attention": fail_gpu(("xpu",)),
-    # No fft implementation for XPU yet.
-    "test_fft_c2c": fail_gpu(("xpu",), is_skip=True),
 }
 
 MPS_TEST_FAILURES = {
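The recurring getattr(torch, GPU_TYPE) rewrites above are what make these tests backend-agnostic: GPU_TYPE (from torch.testing._internal.inductor_utils) resolves to "cuda" or "xpu", and both torch.cuda and torch.xpu expose the per-device calls used here. A minimal sketch of the pattern, assuming an Intel GPU build where torch.xpu is available:

import torch

# Assumed resolution of GPU_TYPE; the real helper lives in torch.testing._internal.inductor_utils.
GPU_TYPE = "xpu" if torch.xpu.is_available() else "cuda"

device_mod = getattr(torch, GPU_TYPE)          # torch.xpu or torch.cuda
free_bytes, total_bytes = device_mod.mem_get_info(0)
x = torch.randn(4, 4, device=GPU_TYPE)         # same test body runs on either backend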

test/inductor/test_aot_inductor_custom_ops.py

Lines changed: 14 additions & 14 deletions
@@ -22,8 +22,8 @@
     IS_WINDOWS,
     skipIfXpu,
 )
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU_AND_TRITON
 from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
-from torch.testing._internal.triton_utils import HAS_CUDA_AND_TRITON
 from torch.utils._python_dispatch import TorchDispatchMode
 
 
@@ -492,9 +492,9 @@ def fail_cpu(is_skip=False):
     )
 
 
-def fail_cuda(is_skip=False):
+def fail_gpu(suffixes: tuple[str, ...], is_skip=False):
     return TestFailure(
-        ("cuda"),
+        suffixes,
         is_skip=is_skip,
     )
 
@@ -506,11 +506,11 @@ def fail_cuda(is_skip=False):
 }
 
 # test_failures, xfail by default, set is_skip=True to skip
-CUDA_TEST_FAILURES = {
+GPU_TEST_FAILURES = {
     # quantized unsupported for GPU
-    "test_quantized_linear": fail_cuda(),
-    "test_quanatized_int8_linear": fail_cuda(),
-    "test_quantized_linear_bias_none": fail_cuda(),
+    "test_quantized_linear": fail_gpu(("cuda", "xpu")),
+    "test_quanatized_int8_linear": fail_gpu(("cuda", "xpu")),
+    "test_quantized_linear_bias_none": fail_gpu(("cuda", "xpu")),
 }
 
 
@@ -533,9 +533,9 @@ class AOTInductorTestABICompatibleCpu(AOTICustomOpTestCase):
 
 
 @unittest.skipIf(sys.platform == "darwin", "No CUDA on MacOS")
-class AOTInductorTestABICompatibleCuda(AOTICustomOpTestCase):
-    device = "cuda"
-    device_type = "cuda"
+class AOTInductorTestABICompatibleGpu(AOTICustomOpTestCase):
+    device = GPU_TYPE
+    device_type = GPU_TYPE
     check_model = check_model
     check_model_with_multiple_inputs = check_model_with_multiple_inputs
     code_check_count = code_check_count
@@ -545,14 +545,14 @@ class AOTInductorTestABICompatibleCuda(AOTICustomOpTestCase):
 
 copy_tests(
     AOTInductorTestsTemplate,
-    AOTInductorTestABICompatibleCuda,
-    "cuda",
-    CUDA_TEST_FAILURES,
+    AOTInductorTestABICompatibleGpu,
+    GPU_TYPE,
+    GPU_TEST_FAILURES,
 )
 
 if __name__ == "__main__":
     from torch._inductor.test_case import run_tests
 
     # cpp_extension N/A in fbcode
-    if HAS_CUDA_AND_TRITON or sys.platform == "darwin":
+    if HAS_GPU_AND_TRITON or sys.platform == "darwin":
         run_tests(needs="filelock")
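Here, copy_tests stamps every test from the shared AOTInductorTestsTemplate onto the device-specific class, and the fail_gpu(...) entries mark tests as expected failures (or skips) for the listed device suffixes. A rough, self-contained sketch of that pattern with plain unittest (simplified stand-in names, not the actual helpers in torch.testing._internal):

import unittest

class TestFailure:
    # Simplified stand-in for the TestFailure used by the inductor test suite.
    def __init__(self, suffixes, is_skip=False):
        self.suffixes = suffixes
        self.is_skip = is_skip

def copy_tests(template, target, suffix, failures=None):
    # Copy each test_* method onto the target class, applying per-device xfail/skip.
    failures = failures or {}
    for name in dir(template):
        if not name.startswith("test_"):
            continue
        fn = getattr(template, name)
        failure = failures.get(name)
        if failure and suffix in failure.suffixes:
            fn = unittest.skip("known failure")(fn) if failure.is_skip else unittest.expectedFailure(fn)
        setattr(target, f"{name}_{suffix}", fn)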

test/inductor/test_aot_inductor_package.py

Lines changed: 3 additions & 8 deletions
@@ -28,12 +28,7 @@
     load_weights_to_pt2_contents,
 )
 from torch.testing._internal.common_cuda import _get_torch_cuda_version
-from torch.testing._internal.common_utils import (
-    IS_FBCODE,
-    skipIfRocm,
-    skipIfXpu,
-    TEST_CUDA,
-)
+from torch.testing._internal.common_utils import IS_FBCODE, skipIfRocm, skipIfXpu
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
 
 
@@ -688,13 +683,13 @@ def forward(self, x):
         self.assertEqual(loaded1(*example_inputs1), ep1.module()(*example_inputs1))
         self.assertEqual(loaded2(*example_inputs2), ep2.module()(*example_inputs2))
 
-    @unittest.skipIf(not TEST_CUDA, "requires cuda")
+    @unittest.skipIf(not HAS_GPU, "requires gpu")
     def test_duplicate_calls(self):
         options = {
             "aot_inductor.package": True,
         }
 
-        device = "cuda"
+        device = GPU_TYPE
 
         class Model1(torch.nn.Module):
             def __init__(self) -> None:

test/inductor/test_benchmark_fusion.py

Lines changed: 31 additions & 27 deletions
@@ -12,8 +12,9 @@
 from torch.testing._internal.common_utils import slowTest
 from torch.testing._internal.inductor_utils import (
     get_func_call,
+    GPU_TYPE,
     HAS_CPU,
-    HAS_CUDA_AND_TRITON,
+    HAS_GPU_AND_TRITON,
     IS_BIG_GPU,
 )
 
@@ -27,7 +28,7 @@
 
     from inductor.test_torchinductor import (  # @manual=fbcode//caffe2/test/inductor:test_inductor-library
         check_model,
-        check_model_cuda,
+        check_model_gpu,
         copy_tests,
         skip_if_cpp_wrapper,
     )
@@ -140,8 +141,8 @@ def f(a, b):
     )
     @config.patch(max_autotune_gemm_backends="TRITON")
     def test_avoid_register_spilling(self):
-        if self.device != "cuda":
-            raise unittest.SkipTest("CUDA only")
+        if self.device != GPU_TYPE:
+            raise unittest.SkipTest("GPU only")
 
         from torch.nn.functional import gelu
 
@@ -156,8 +157,8 @@ def foo(m, inp):
 
             return curr
 
-        m = torch.nn.Linear(2048, 2048, bias=True).half().cuda()
-        inp = torch.rand([2048, 2048]).half().cuda()
+        m = torch.nn.Linear(2048, 2048, bias=True).half().to(GPU_TYPE)
+        inp = torch.rand([2048, 2048]).half().to(GPU_TYPE)
 
         with torch.no_grad():
             foo_c = torch.compile(mode="max-autotune-no-cudagraphs")(foo)
@@ -185,7 +186,7 @@ def foo(m, inp):
 
         for c in out_code[0], out_code2[0]:
             FileCheck().check("async_compile.wait").check("DeviceGuard").check_count(
-                "empty_strided_cuda", 1, exactly=True
+                f"empty_strided_{GPU_TYPE}", 1, exactly=True
             ).check_regex("buf[0-9]* = buf[0-9]*; del buf[0-9]*").check("return").run(c)
 
     def test_tield_kernel_fusion(self):
@@ -197,47 +198,50 @@ def f(x):
         self.common(f, (x,))
 
 
-if HAS_CUDA_AND_TRITON:
+if HAS_GPU_AND_TRITON:
 
-    class BenchmarkFusionCudaTest(TestCase):
-        common = check_model_cuda
-        device = "cuda"
+    class BenchmarkFusionGpuTest(TestCase):
+        common = check_model_gpu
+        device = GPU_TYPE
 
-    copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionCudaTest, "cuda")
+    copy_tests(BenchmarkFusionTestTemplate, BenchmarkFusionGpuTest, GPU_TYPE)
 
     class BenchmarkingTest(TestCase):
         @unittest.skipIf(
-            torch.cuda.device_count() < 2, "The test need at least 2 devices"
+            getattr(torch, GPU_TYPE).device_count() < 2,
+            "The test need at least 2 devices",
         )
         @skip_if_cpp_wrapper("This tests triton scheduling directly")
         def test_benchmark_on_non_zero_device(self):
             hit_count = 0
-            with torch.cuda.device("cuda:0"):
+            with getattr(torch, GPU_TYPE).device(f"{GPU_TYPE}:0"):
 
                 @torch.compile
                 def relu(x):
                     return realize(x.relu()) + x
 
-                x = torch.randn(int(16e6), device="cuda:1")
+                x = torch.randn(int(16e6), device=f"{GPU_TYPE}:1")
 
-                orig_benchmark_fused_nodes = TritonScheduling.benchmark_fused_nodes
+                orig_benchmark_codegened_module = (
+                    TritonScheduling.benchmark_codegened_module
+                )
 
-                def mock_benchmark_fused_nodes(*args, **kwargs):
+                def benchmark_codegened_module(*args, **kwargs):
                     nonlocal hit_count
                     hit_count += 1
-                    ms, path = orig_benchmark_fused_nodes(*args, **kwargs)
+                    ms, path = orig_benchmark_codegened_module(*args, **kwargs)
                     self.assertTrue(ms > 0)
                     return ms, path
 
                 with unittest.mock.patch.object(
                     TritonScheduling,
-                    "benchmark_fused_nodes",
-                    mock_benchmark_fused_nodes,
+                    "benchmark_codegened_module",
+                    benchmark_codegened_module,
                 ):
                     relu(x)
                 self.assertTrue(hit_count > 0)
 
-    class BenchmarkMultiTemplateFusionCudaTest(InductorTestCase):
+    class BenchmarkMultiTemplateFusionGpuTest(InductorTestCase):
         @classmethod
         def setUpClass(cls):
             super().setUpClass()
@@ -272,8 +276,8 @@ def foo(m, inp):
         foo_c = torch.compile(mode="max-autotune-no-cudagraphs")(foo)
         first_dim = first_dim if first_dim is not None else size
 
-        m = torch.nn.Linear(size, size, bias=True).half().cuda()
-        inp = torch.rand([first_dim, size]).half().cuda()
+        m = torch.nn.Linear(size, size, bias=True).half().to(GPU_TYPE)
+        inp = torch.rand([first_dim, size]).half().to(GPU_TYPE)
 
         with torch.no_grad():
             res, code = run_and_get_code(foo_c, m, inp)
@@ -324,9 +328,9 @@ def fn(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor):
         )
 
         args = [
-            torch.randn(4, 4, device="cuda"),
-            torch.randn(4, 4, device="cuda"),
-            torch.randn(4, 4, device="cuda"),
+            torch.randn(4, 4, device=GPU_TYPE),
+            torch.randn(4, 4, device=GPU_TYPE),
+            torch.randn(4, 4, device=GPU_TYPE),
         ]
 
         expected = fn(*args)
@@ -347,5 +351,5 @@ class BenchmarkFusionCpuTest(TestCase):
 if __name__ == "__main__":
     from torch._inductor.test_case import run_tests
 
-    if HAS_CPU or HAS_CUDA_AND_TRITON:
+    if HAS_CPU or HAS_GPU_AND_TRITON:
         run_tests()
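The test_benchmark_on_non_zero_device change above swaps the old benchmark_fused_nodes hook for benchmark_codegened_module, but the mocking idiom is unchanged: wrap the original method with a counter via unittest.mock.patch.object and assert it was hit. A generic sketch of that idiom (toy Scheduler class, not the inductor TritonScheduling API):

import unittest.mock

class Scheduler:
    # Toy stand-in for the scheduling class whose benchmarking hook gets patched.
    def benchmark(self, module):
        return 1.23, "/tmp/module.py"

hit_count = 0
orig_benchmark = Scheduler.benchmark

def counting_benchmark(*args, **kwargs):
    # Count calls, then delegate to the original method.
    global hit_count
    hit_count += 1
    ms, path = orig_benchmark(*args, **kwargs)
    assert ms > 0
    return ms, path

with unittest.mock.patch.object(Scheduler, "benchmark", counting_benchmark):
    Scheduler().benchmark(None)   # goes through the counting wrapper

assert hit_count > 0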
