|
27 | 27 | TritonTemplateCaller, |
28 | 28 | ) |
29 | 29 | from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8 |
| 30 | +from torch.testing._internal.common_device_type import largeTensorTest |
30 | 31 | from torch.testing._internal.common_utils import ( |
31 | 32 | instantiate_parametrized_tests, |
32 | 33 | IS_WINDOWS, |
|
44 | 45 | from torch.fx.experimental.proxy_tensor import make_fx |
45 | 46 | from torch.testing import FileCheck |
46 | 47 | from torch.testing._internal.common_utils import skipIfRocm, skipIfXpu |
47 | | -from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_CUDA, HAS_GPU |
| 48 | +from torch.testing._internal.inductor_utils import ( |
| 49 | + GPU_TYPE, |
| 50 | + HAS_CPU, |
| 51 | + HAS_CUDA, |
| 52 | + HAS_GPU, |
| 53 | +) |
48 | 54 |
|
49 | 55 |
|
50 | 56 | torch.set_float32_matmul_precision("high") |
@@ -981,6 +987,8 @@ def test_conv_backend(self): |
981 | 987 |
|
982 | 988 | self.assertIn("NoValidChoicesError", str(context.exception)) |
983 | 989 |
|
| 990 | + # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks |
| 991 | + @largeTensorTest("30 GB", device=GPU_TYPE) |
984 | 992 | def test_non_contiguous_input_mm(self): |
985 | 993 | """ |
986 | 994 | Make sure the triton template can work with non-contiguous inputs without crash. |
@@ -1033,6 +1041,8 @@ def f(x, y): |
1033 | 1041 | # TODO: fix accuracy failure of the triton template on XPU. |
1034 | 1042 | # and enable this test case. |
1035 | 1043 | @skipIfXpu |
| 1044 | + # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks |
| 1045 | + @largeTensorTest("30 GB", device=GPU_TYPE) |
1036 | 1046 | def test_non_contiguous_input_mm_plus_mm(self): |
1037 | 1047 | x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE) |
1038 | 1048 | y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE) |
|
0 commit comments