
Commit 77d85fb

Remove CUDA from remaining tests. (#9613)
This PR removes CUDA-specific logic and tests from the remaining tests (after #9612). This is in line with the CUDA deprecation that started in release 2.8. The typical change pattern is sketched below, before the per-file diffs.
1 parent 342de86 commit 77d85fb

17 files changed: +42 −501 lines
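
The recurring pattern across the diffs below is narrowing device allow-lists and skip conditions so that CUDA is no longer enumerated, leaving CPU, TPU, and Neuron. A minimal sketch of the resulting test-side gate, modeled on the test/dynamo/test_traceable_collectives.py change shown below (illustrative only, not new behavior added by this PR):

import torch
import torch_xla
import torch_xla.core.xla_model as xm

def _mp_fn(index):
  device = torch_xla.device()
  # With CUDA removed from the allow-list, only TPU and Neuron proceed.
  if xm.xla_device_hw(device) not in ('TPU', 'NEURON'):
    print(f'skip this test for hw {xm.xla_device_hw(device)}')
    return
  ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)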

test/cpp/test_aten_xla_tensor_2.cpp

Lines changed: 12 additions & 14 deletions
@@ -1555,20 +1555,18 @@ TEST_F(AtenXlaTensorTest, TestGroupNormBackward) {
                                  /*cudnn_enabled=*/false);
       };
       torch::Tensor undef;
-      ForEachDevice({XlaDeviceType::CUDA, XlaDeviceType::TPU},
-                    [&](const torch::Device& device) {
-                      TestBackward({input, undef_weight ? undef : weight,
-                                    undef_weight ? undef : bias},
-                                   device, testfn,
-                                   /*rtol=*/1e-3, /*atol=*/1e-3,
-                                   /*derivative_level=*/2);
-                      ExpectCounterNotChanged("aten::.*",
-                                              cpp_test::GetIgnoredCounters());
-                      ExpectCounterChanged("xla::native_batch_norm",
-                                           cpp_test::GetIgnoredCounters());
-                      ExpectCounterChanged("xla::native_batch_norm_backward",
-                                           cpp_test::GetIgnoredCounters());
-                    });
+      ForEachDevice({XlaDeviceType::TPU}, [&](const torch::Device& device) {
+        TestBackward(
+            {input, undef_weight ? undef : weight, undef_weight ? undef : bias},
+            device, testfn,
+            /*rtol=*/1e-3, /*atol=*/1e-3,
+            /*derivative_level=*/2);
+        ExpectCounterNotChanged("aten::.*", cpp_test::GetIgnoredCounters());
+        ExpectCounterChanged("xla::native_batch_norm",
+                             cpp_test::GetIgnoredCounters());
+        ExpectCounterChanged("xla::native_batch_norm_backward",
+                             cpp_test::GetIgnoredCounters());
+      });
     }
   }
 }

test/cpp/test_aten_xla_tensor_6.cpp

Lines changed: 1 addition & 1 deletion
@@ -873,7 +873,7 @@ TEST_F(AtenXlaTensorTest, TestEmbeddingBackward) {
 TEST_F(AtenXlaTensorTest, TestAmpUpdateScale) {
   XlaDeviceType hw_type =
       static_cast<XlaDeviceType>(bridge::GetDefaultDevice()->type());
-  if (hw_type != XlaDeviceType::CUDA && hw_type != XlaDeviceType::CPU) {
+  if (hw_type != XlaDeviceType::CPU) {
     return;
   }
   torch::Tensor growth_tracker =

test/cpp/test_replication.cpp

Lines changed: 1 addition & 3 deletions
@@ -98,9 +98,7 @@ void TestSingleReplication(
 
 class ReplicationTest : public AtenXlaTensorTestBase {};
 
-// Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU
-// device per process instead of relying on threads so we will not run the test
-// on GPU.
+// Parallelism for DataParallel uses multi-threads.
 TEST_F(ReplicationTest, TestNSingleReplication) {
   WithAllDevices(
       {XlaDeviceType::TPU},

test/ds/test_dynamic_shapes.py

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ def test_masked_select_shape(self):
   def test_nonzero_cast(self):
     t1 = torch.ones(5, 2, device='xla')
     # Result of the nonzero should be the index type. Currently
-    # index type is s64 on cpu and gpu, but s32 on TPU. We should be
+    # index type is s64 on cpu, but s32 on TPU. We should be
     # able to cast it to any other type without error.
     t2 = torch.nonzero(t1.int()).float()
     torch_xla.sync()

test/dynamo/test_dynamo.py

Lines changed: 14 additions & 87 deletions
@@ -148,17 +148,6 @@ def fn_simple(self, x, y):
     b = torch.sin(y)
     return a + b
 
-  def _choose_proper_device(self, initialize_on_cuda):
-    if not initialize_on_cuda:
-      return torch_xla.device()
-
-    assert initialize_on_cuda
-    if xr.device_type() != "CUDA" or not torch.cuda.is_available():
-      self.skipTest(
-          "Skip this test because it requires xr.device_type()=='CUDA' and torch.cuda.is_available()."
-      )
-    return "cuda:0"
-
   @skipOnNeuron
   def test_simple_model(self):
     device = torch_xla.device()
@@ -193,71 +182,23 @@ def test_simple_model(self):
     # Dynamo has to sync the input since they are intermedate IR(xla_xy and xla_y3)
     self.assertEqual(met.counter_value('DynamoSyncInputExecuteTime'), 1)
 
-  # Tests that the dynamo bridge automatically moves tensors to XLA device,
-  # then back to the original device.
-  @unittest.skipIf(xr.device_type() != "CUDA" or not torch.cuda.is_available(),
-                   f"GPU tests should only run on GPU devices.")
-  @parameterized.parameters(
-      "0",
-      "1",
-  )
-  def test_simple_model_automoves_tensors(self, zero_copy_enabled):
-    x = torch.tensor(100.0, requires_grad=True, device="cuda:0")
-    y = torch.tensor(200.0, requires_grad=True, device="cuda:0")
-    original_device = x.device
-    eager_result = self.fn_simple(x, y)
-
-    # Since all tests run in the same process, have to reset the metrics report.
-    met.clear_all()
-    torch._dynamo.reset()
-
-    fn_simple_dynamo = torch.compile(self.fn_simple, backend="openxla")
-    res_xla_dynamo = fn_simple_dynamo(x, y)
-    self.assertIn('xla::add', met.counter_names())
-    self.assertTrue(res_xla_dynamo.device == original_device)
-    self.assertTrue(torch.allclose(eager_result, res_xla_dynamo))
-
-    # verify that tracing is skipped in following runs
-    met.clear_counters()
-    res_xla_dynamo_reused = fn_simple_dynamo(x, y)
-    self.assertNotIn('xla::add', met.counter_names())
-    self.assertTrue(res_xla_dynamo_reused.device == original_device)
-    self.assertTrue(torch.allclose(eager_result, res_xla_dynamo_reused))
-
-    # verify that dynamo can handle different inputs
-    res_xla_dynamo_different = fn_simple_dynamo(x + y, y * 3)
-    res_cpu_3 = self.fn_simple(x + y, y * 3)
-    self.assertTrue(res_xla_dynamo_different.device == original_device)
-    self.assertTrue(torch.allclose(res_cpu_3, res_xla_dynamo_different))
-
-    # There should not be any fallbacks.
-    self.assertEqual(torch_xla._XLAC._get_executed_fallback_ops(), [])
-
-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_fn_without_input(self, initialize_on_cuda):
+  def test_fn_without_input(self):
 
     def fn_without_input(device):
       constant = 0.835
       expanded = torch.full((4, 4), constant, device=device)
       arange = torch.arange(16, device=device).reshape(4, 4)
       return expanded + arange
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
 
     compiled_fn = torch.compile(fn_without_input, backend='openxla')
     res_cpu = fn_without_input('cpu')
     res_xla_dynamo = compiled_fn(device)
     self.assertTrue(torch.allclose(res_cpu, res_xla_dynamo.cpu()))
 
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_simple_model_with_in_place_ops(self, initialize_on_cuda, backend):
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_simple_model_with_in_place_ops(self, backend):
 
     class TestModel(nn.Module):
 
@@ -279,7 +220,7 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
         output = input_tensor + self.self_tensor
         return output
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
 
     torch._dynamo.reset()
     met.clear_all()
@@ -306,18 +247,14 @@ def forward(self, index, copy_tensor, input_tensor, op_name):
           op_name=in_place_op)
       self.assertTrue(torch.allclose(res_cpu, res_device_dynamo.cpu()))
 
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_einsum(self, initialize_on_cuda, backend):
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_einsum(self, backend):
     # einsum currently does not have meta function to compute the shape hence
     # will fallback to XLA with FakeTensor as input to infer the output shape.
     def einsum_mm(a, b):
       return torch.einsum('ijkl,ijlm->ijkm', a, b)
 
-    device = self._choose_proper_device(initialize_on_cuda)
+    device = torch_xla.device()
     a = torch.randn(4, 4, 4, 4).to(device)
     b = torch.randn(4, 4, 4, 4).to(device)
     torch_xla.sync()
@@ -328,16 +265,10 @@ def einsum_mm(a, b):
     self.assertTrue(
         torch.allclose(res_device_non_dynamo.cpu(), res_device_dynamo.cpu()))
 
-  @parameterized.parameters(
-      True,
-      False,
-  )
-  def test_simple_model_with_different_input_shape(self, initialize_on_cuda):
+  def test_simple_model_with_different_input_shape(self):
     met.clear_all()
-    device = self._choose_proper_device(initialize_on_cuda)
-    # We need to make `dim` depend on `initialize_on_cuda` because the XLA compilation cache
-    # does not clean itself between the parameterized tests.
-    dim = 5 + int(initialize_on_cuda)
+    device = torch_xla.device()
+    dim = 5
     device_x = torch.randn(dim, dim).to(device)
     device_y = torch.randn(dim, dim).to(device)
     new_dim = 2 * dim
@@ -369,13 +300,9 @@ def get_loader(self, device, sample_count, batch_size=4):
 
   @skipOnTpu
   @skipOnNeuron
-  @parameterized.parameters(
-      (True, 'openxla'),
-      (False, dynamo_backend2.dynamo_backend),
-      (False, 'openxla'),
-  )
-  def test_resnet18(self, initialize_on_cuda, backend):
-    device = self._choose_proper_device(initialize_on_cuda)
+  @parameterized.parameters('openxla', dynamo_backend2.dynamo_backend)
+  def test_resnet18(self, backend):
+    device = torch_xla.device()
     sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
     loader = self.get_loader(device, sample_count, batch_size=4)
     resnet18 = torchvision.models.resnet18()

test/dynamo/test_traceable_collectives.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ def collective_broadcast_and_cos(input, src):
 def _mp_fn(index):
   device = torch_xla.device()
   world_size = xr.world_size()
-  if xm.xla_device_hw(device) not in ('TPU', 'CUDA', 'NEURON'):
+  if xm.xla_device_hw(device) not in ('TPU', 'NEURON'):
     print(f'skip this test for hw {xm.xla_device_hw(device)}')
     return
   ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)

test/pjrt/test_runtime.py

Lines changed: 2 additions & 7 deletions
@@ -17,7 +17,7 @@ class TestExperimentalPjrt(parameterized.TestCase):
   def setUp(self):
     xr.set_device_type('CPU')
 
-  @parameterized.parameters(('CPU', 'CPU'), ('CUDA', 'CUDA'), ('TPU', 'TPU'))
+  @parameterized.parameters(('CPU', 'CPU'), ('TPU', 'TPU'))
   def test_device_type(self, pjrt_device, expected):
     with mock.patch.dict(os.environ, {'PJRT_DEVICE': pjrt_device}, clear=True):
       self.assertEqual(xr.device_type(), expected)
@@ -69,11 +69,6 @@ def test_xla_device_error(self):
       }, True), ('pjrt_tpu_precedence', {
           'PJRT_DEVICE': 'TPU',
           'XRT_TPU_CONFIG': 'localservice;0;localhost:51011',
-      }, True), ('gpu_num_devives', {
-          'GPU_NUM_DEVICES': '4'
-      }, True), ('pjrt_gpu', {
-          'PJRT_DEVICE': 'CUDA',
-          'GPU_NUM_DEVICES': '4'
       }, True))
   def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
     # Prevent flag checking during reinitialization of PJRT backend.
@@ -86,7 +81,7 @@ def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
       reload(torch_xla)
     logs_context = contextlib.nullcontext()
     if expect_using_pjrt:
-      self.assertIn(xr.device_type(), ['CPU', 'CUDA', 'TPU', 'NEURON'])
+      self.assertIn(xr.device_type(), ['CPU', 'TPU', 'NEURON'])
     else:
       self.assertIsNone(xr.device_type())

test/pytorch_test_base.py

Lines changed: 1 addition & 16 deletions
@@ -295,7 +295,7 @@
         'test_leaky_relu_inplace_with_neg_slope_xla',  # expecting a specific error message
         'test_upsamplingBicubic2d_correctness_xla',  # FIXME! Got dtypes torch.float32 and torch.float64
         'test_CTCLoss_no_batch_dim_xla',  # Value out of range
-        'test_upsamplingBilinear2d_xla',  # precision on GPU/TPU, slow compilation on CPU
+        'test_upsamplingBilinear2d_xla',  # precision on TPU, slow compilation on CPU
         # torch.autograd.gradcheck.GradcheckError: Jacobian mismatch for output 0 with respect to input 0
         'test_GRU_grad_and_gradgrad_xla_float64',  # grad check failure
         'test_LSTM_grad_and_gradgrad_xla_float64',  # grad check failure
@@ -475,18 +475,6 @@
     },
 }
 
-DISABLED_TORCH_TESTS_GPU_ONLY = {
-    # test_torch.py
-    'TestTorchDeviceTypeXLA': {
-        'test_maximum_minimum_float_nan_and_inf',  # maximum(nan,inf) = inf on GPU
-    },
-
-    # test_indexing.py
-    'TestIndexingXLA': {
-        'test_index_put_accumulate_large_tensor_xla',  # illegal memory access was encountered
-    },
-}
-
 
 class MatchSet(object):
 
@@ -526,15 +514,12 @@ def union_of_disabled_tests(sets):
 
 
 DISABLED_TORCH_TESTS_CPU = DISABLED_TORCH_TESTS_ANY
-DISABLED_TORCH_TESTS_GPU = union_of_disabled_tests(
-    [DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_GPU_ONLY])
 DISABLED_TORCH_TESTS_TPU = union_of_disabled_tests(
     [DISABLED_TORCH_TESTS_ANY, DISABLED_TORCH_TESTS_TPU_ONLY])
 
 DISABLED_TORCH_TESTS = {
     'TPU': prepare_match_set(DISABLED_TORCH_TESTS_TPU),
     'CPU': prepare_match_set(DISABLED_TORCH_TESTS_CPU),
-    'CUDA': prepare_match_set(DISABLED_TORCH_TESTS_GPU),
 }

test/run_tests.sh

Lines changed: 0 additions & 1 deletion
@@ -254,7 +254,6 @@ function run_xla_op_tests3 {
   run_test "$_TEST_DIR/test_devices.py"
   run_test "$_TEST_DIR/test_manual_xla_registration.py"
   run_test_multi_devices "$_TEST_DIR/spmd/test_xla_dtensor_placements.py"
-  # NOTE: this line below is testing export and don't care about GPU
   PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$_TEST_DIR/test_core_aten_ops.py"
   run_test "$_TEST_DIR/test_pallas.py"
   run_xla_ir_hlo_debug run_test "$_TEST_DIR/test_user_computation_debug_cache.py"

test/test_autocast.py

Lines changed: 0 additions & 76 deletions
@@ -152,82 +152,6 @@ def __init__(self, dev):
     self.methods_bf16 = [("__matmul__", mat0_bf16 + mat1_fp32)]
 
 
-class AutocastCudaTestExtraLists(object):
-
-  def __init__(self, dev):
-    super().__init__()
-    n = 8
-    dimsets = ((n, n, n), (n, n, n, n), (n, n, n, n, n))
-    conv_args_fp32 = [(torch.randn(dimset, dtype=torch.float32, device=dev),
-                       torch.randn(dimset, dtype=torch.float32, device=dev))
-                      for dimset in dimsets]
-
-    mat0_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat1_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat2_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-    mat3_fp32 = (torch.randn((n, n), dtype=torch.float32, device=dev),)
-
-    pointwise0_fp32 = (torch.randn(n, dtype=torch.float32, device=dev),)
-
-    element0_fp32 = (torch.randn(1, dtype=torch.float32, device=dev),)
-
-    # This is currently not part of AutocastTestLists and excludes `relu`, `addbmm`
-    self.torch_bf16 = [
-        ("conv1d", conv_args_fp32[0]),
-        ("conv2d", conv_args_fp32[1]),
-        ("conv3d", conv_args_fp32[2]),
-        ("bmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                 torch.randn((n, n, n), device=dev, dtype=torch.float32))),
-        ("mm", mat0_fp32 + mat1_fp32),
-        ("matmul",
-         torch.matmul(
-             torch.ones([2, 3], device=dev, dtype=torch.float32),
-             torch.ones([3, 2], device=dev, dtype=torch.float32))),
-        ("baddbmm", (torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                     torch.randn((n, n, n), device=dev, dtype=torch.float32),
-                     torch.randn((n, n, n), device=dev, dtype=torch.float32))),
-        ("addmm", mat1_fp32 + mat2_fp32 + mat3_fp32),
-        ("conv_tbc", (torch.randn((10, 7, 3), device=dev, dtype=torch.float32),
-                      torch.randn((5, 3, 5), device=dev, dtype=torch.float32),
-                      torch.randn(5, device=dev, dtype=torch.float32), 0)),
-        ("conv_transpose1d", conv_args_fp32[0]),
-        ("conv_transpose2d", conv_args_fp32[1]),
-        ("conv_transpose3d", conv_args_fp32[2]),
-        ("prelu", pointwise0_fp32 + element0_fp32),
-    ]
-
-
-class AutocastCudaTestUnsupportedLists(object):
-
-  def __init__(self):
-    super().__init__()
-    # Utility arguments, created as one-element tuples
-    self.torch_expect_builtin_promote = [
-        "cat",  # requires all input tensors to be the same type
-        "equal",  # requires all input tensors to be the same type
-        "stack",  # return f16 instead of f32
-    ]
-    self.methods_expect_builtin_promote = []
-
-    # The remaining lists organize ops that autocast treats explicitly.
-    self.torch_fp16 = [
-        "_convolution_nogroup",  # need lowering
-        "addmv",  # need lowering
-    ]
-    self.torch_fp32 = [
-        "norm",  # produce f16 instead of f32
-    ]
-    self.torch_need_autocast_promote = [
-        "scatter_add",  # cat currently requires all input tensors to be the same type
-    ]
-    self.nn_fp16 = []
-    self.nn_fp32 = []
-    self.linalg_fp16 = []
-    self.methods_fp16 = []
-    self.methods_fp32 = []
-    self.banned = []
-
-
 class TestAutocastBase(unittest.TestCase):
 
   @classmethod
