Skip to content

Commit 9195b21

Browse files
committed
[CI] Separate GPU and CPU tests with pytest markers
Add pytest.mark.gpu to tests that require CUDA, and update run_all.sh to filter tests based on whether running on GPU or CPU machines.

Changes:
- Register the 'gpu' marker in pytest.ini and conftest.py
- Add pytest.mark.gpu to ~30 tests that explicitly require CUDA
- Update run_all.sh to use GPU_MARKER_FILTER:
  - GPU jobs (CU_VERSION != cpu): run only pytest.mark.gpu tests
  - CPU jobs (CU_VERSION = cpu): run all tests except pytest.mark.gpu tests

This significantly reduces GPU machine usage by running only GPU-requiring tests on expensive GPU runners (~30 tests instead of ~2000+). Tests that can run on either device will run on CPU machines only. The optimization can be disabled by setting TORCHRL_GPU_FILTER=0.

ghstack-source-id: 9235913
Pull-Request: #3404
1 parent 5bfe43f commit 9195b21

File tree

13 files changed

+60
-4
lines changed

13 files changed

+60
-4
lines changed

.github/unittest/linux/scripts/run_all.sh

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,28 @@ fi
269269

270270
TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed
271271

272+
# GPU test filtering: Run GPU-only tests on GPU machines, CPU-only tests on CPU machines.
273+
# This avoids running ~2000+ tests on expensive GPU machines when only ~30 require GPU.
274+
# Tests are marked with @pytest.mark.gpu if they require CUDA.
275+
#
276+
# Set TORCHRL_GPU_FILTER=0 to disable this optimization and run all tests.
277+
#
278+
# We use an array to handle the marker expression properly (avoids quoting issues).
279+
GPU_MARKER_FILTER=()
280+
if [ "${TORCHRL_GPU_FILTER:-1}" = "1" ]; then
281+
if [ "${CU_VERSION:-}" == cpu ]; then
282+
# CPU job: run only tests that do NOT require GPU
283+
GPU_MARKER_FILTER=(-m 'not gpu')
284+
echo "GPU filtering enabled: Running CPU-only tests (excluding @pytest.mark.gpu)"
285+
else
286+
# GPU job: run only tests that require GPU
287+
GPU_MARKER_FILTER=(-m gpu)
288+
echo "GPU filtering enabled: Running GPU-only tests (@pytest.mark.gpu)"
289+
fi
290+
else
291+
echo "GPU filtering disabled: Running all tests"
292+
fi
293+
272294
export PYTORCH_TEST_WITH_SLOW='1'
273295
python -m torch.utils.collect_env
274296

@@ -287,6 +309,7 @@ run_distributed_tests() {
287309
return 1
288310
fi
289311
# Run both test_distributed.py and test_rb_distributed.py (both use torch.distributed)
312+
# Note: distributed tests always run on GPU, no need for GPU_MARKER_FILTER here
290313
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py test/test_rb_distributed.py \
291314
--instafail --durations 200 -vv --capture no \
292315
--timeout=120 --mp_fork_if_no_cuda
@@ -317,12 +340,12 @@ run_non_distributed_tests() {
317340
1)
318341
echo "Running shard 1: test_transforms.py only"
319342
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_transforms.py \
320-
${common_args}
343+
"${GPU_MARKER_FILTER[@]}" ${common_args}
321344
;;
322345
2)
323346
echo "Running shard 2: test_envs.py and test_collectors.py"
324347
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_envs.py test/test_collectors.py \
325-
${common_args}
348+
"${GPU_MARKER_FILTER[@]}" ${common_args}
326349
;;
327350
3)
328351
echo "Running shard 3: All other tests"
@@ -332,13 +355,13 @@ run_non_distributed_tests() {
332355
--ignore test/test_envs.py \
333356
--ignore test/test_collectors.py \
334357
${xdist_args} \
335-
${common_args}
358+
"${GPU_MARKER_FILTER[@]}" ${common_args}
336359
;;
337360
all|"")
338361
echo "Running all tests (no sharding)"
339362
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
340363
${common_ignores} \
341-
${common_args}
364+
"${GPU_MARKER_FILTER[@]}" ${common_args}
342365
;;
343366
*)
344367
echo "Unknown TORCHRL_TEST_SHARD='${shard}'. Expected: all|1|2|3."

pytest.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ addopts =
66
--tb=native
77
markers =
88
unity_editor
9+
slow: mark test as slow to run
10+
gpu: mark test as requiring a GPU (CUDA device)
911
testpaths =
1012
test
1113
xfail_strict = True

test/compile/test_compile_collectors.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def test_compiled_policy(self, collector_cls, compile_policy, device):
7777
collector.shutdown()
7878
del collector
7979

80+
@pytest.mark.gpu
8081
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
8182
@pytest.mark.parametrize(
8283
"collector_cls",

test/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,9 @@ def pytest_runtest_setup(item):
145145

146146
def pytest_configure(config):
147147
config.addinivalue_line("markers", "slow: mark test as slow to run")
148+
config.addinivalue_line(
149+
"markers", "gpu: mark test as requiring a GPU (CUDA device)"
150+
)
148151

149152

150153
def pytest_collection_modifyitems(config, items):

test/llm/test_llm_updaters.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def get_open_port():
7272
)
7373

7474

75+
@pytest.mark.gpu
7576
@pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
7677
@pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies")
7778
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -415,6 +416,7 @@ def test_local_llm_specific_features(self, target_vllm_engine):
415416
"See LLM_TEST_ISSUES.md for details.",
416417
strict=False,
417418
)
419+
@pytest.mark.gpu
418420
@pytest.mark.skipif(not _has_ray, reason="missing ray dependencies")
419421
@pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies")
420422
@pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
@@ -611,6 +613,7 @@ def test_weight_sync_vllm_collective_ray(self, request):
611613
ray.shutdown()
612614

613615

616+
@pytest.mark.gpu
614617
@pytest.mark.xfail(
615618
reason="AsyncVLLM tests fail due to Ray placement group timeout. "
616619
"See LLM_TEST_ISSUES.md for details.",

test/llm/test_vllm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def sampling_params():
3939
class TestAsyncVLLMIntegration:
4040
"""Integration tests for AsyncVLLM with real models."""
4141

42+
@pytest.mark.gpu
4243
@pytest.mark.skipif(not _has_vllm, reason="vllm not available")
4344
@pytest.mark.skipif(not _has_ray, reason="ray not available")
4445
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -111,6 +112,7 @@ def test_vllm_api_compatibility(self, sampling_params):
111112
finally:
112113
service.shutdown()
113114

115+
@pytest.mark.gpu
114116
@pytest.mark.skipif(not _has_vllm, reason="vllm not available")
115117
@pytest.mark.skipif(not _has_ray, reason="ray not available")
116118
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")

test/llm/test_wrapper.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2104,6 +2104,7 @@ def test_log_probs_consistency(
21042104
"See LLM_TEST_ISSUES.md for details.",
21052105
strict=False,
21062106
)
2107+
@pytest.mark.gpu
21072108
@pytest.mark.skipif(not _has_vllm, reason="vllm not available")
21082109
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
21092110
def test_sync_async_vllm_strict_equivalence(

test/test_collectors.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2333,6 +2333,7 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
23332333
def _set_seed(self, seed: int | None) -> None:
23342334
...
23352335

2336+
@pytest.mark.gpu
23362337
@pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device")
23372338
@pytest.mark.parametrize("env_device", ["cuda:0", "cpu"])
23382339
@pytest.mark.parametrize("storing_device", [None, "cuda:0", "cpu"])
@@ -2371,6 +2372,7 @@ def test_no_synchronize(self, env_device, storing_device, no_cuda_sync):
23712372
assert u == i, i
23722373
mock_synchronize.assert_not_called()
23732374

2375+
@pytest.mark.gpu
23742376
@pytest.mark.parametrize("device", ["cuda", "cpu"])
23752377
@pytest.mark.parametrize("storing_device", ["cuda", "cpu"])
23762378
@pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found")
@@ -3162,6 +3164,7 @@ def test_multi_collector_consistency(
31623164
assert_allclose_td(c2.unsqueeze(0), d2)
31633165

31643166

3167+
@pytest.mark.gpu
31653168
@pytest.mark.skipif(
31663169
not torch.cuda.is_available() and (not has_mps()),
31673170
reason="No casting if no cuda",
@@ -3363,6 +3366,7 @@ def test_param_sync_mixed_device(
33633366
col.shutdown()
33643367
del col
33653368

3369+
@pytest.mark.gpu
33663370
@pytest.mark.skipif(
33673371
not torch.cuda.is_available() or torch.cuda.device_count() < 3,
33683372
reason="requires at least 3 CUDA devices",

test/test_envs.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,7 @@ def test_auto_spec(self, env_type):
597597
env.auto_specs_(policy, tensordict=td.copy(), observation_key=obs_vals)
598598
env.check_env_specs(tensordict=td.copy())
599599

600+
@pytest.mark.gpu
600601
@pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device found.")
601602
@pytest.mark.parametrize("break_when_any_done", [True, False])
602603
def test_auto_cast_to_device(self, break_when_any_done):
@@ -1526,6 +1527,7 @@ def test_parallel_env_with_policy(
15261527
# env_serial.close()
15271528
env0.close(raise_if_closed=False)
15281529

1530+
@pytest.mark.gpu
15291531
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
15301532
@pytest.mark.parametrize("heterogeneous", [False, True])
15311533
def test_transform_env_transform_no_device(
@@ -1638,6 +1640,7 @@ def test_parallel_env_custom_method(self, parallel, maybe_fork_ParallelEnv):
16381640
finally:
16391641
env.close(raise_if_closed=False)
16401642

1643+
@pytest.mark.gpu
16411644
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on")
16421645
@pytest.mark.skipif(not _has_gym, reason="no gym")
16431646
@pytest.mark.parametrize("frame_skip", [4])
@@ -1742,6 +1745,7 @@ def test_parallel_env_cast(
17421745
env_serial.close(raise_if_closed=False)
17431746
env0.close(raise_if_closed=False)
17441747

1748+
@pytest.mark.gpu
17451749
@pytest.mark.skipif(not _has_gym, reason="no gym")
17461750
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected")
17471751
@pytest.mark.parametrize("frame_skip", [4])
@@ -2726,6 +2730,7 @@ def test_marl_group_type(group_type):
27262730
check_marl_grouping(group_type.get_group_map(agent_names), agent_names)
27272731

27282732

2733+
@pytest.mark.gpu
27292734
@pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device")
27302735
class TestConcurrentEnvs:
27312736
"""Concurrent parallel envs on multiple procs can interfere."""

test/test_libs.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2157,6 +2157,7 @@ def test_set_seed_and_reset_works(self):
21572157

21582158
assert isinstance(td, TensorDict)
21592159

2160+
@pytest.mark.gpu
21602161
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
21612162
def test_dmcontrol_kwargs_preserved_with_seed(self):
21622163
"""Test that kwargs like camera_id are preserved when seed is provided.
@@ -2182,6 +2183,7 @@ def test_dmcontrol_kwargs_preserved_with_seed(self):
21822183
finally:
21832184
env.close()
21842185

2186+
@pytest.mark.gpu
21852187
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
21862188
@pytest.mark.parametrize("env_name,task", [["cheetah", "run"]])
21872189
@pytest.mark.parametrize("frame_skip", [1, 3])
@@ -2776,6 +2778,7 @@ def test_multithread_env_shutdown(self):
27762778
assert not env.is_closed
27772779
env.close()
27782780

2781+
@pytest.mark.gpu
27792782
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on")
27802783
@pytest.mark.skipif(not _has_gym, reason="no gym")
27812784
@pytest.mark.parametrize("frame_skip", [4])
@@ -2816,6 +2819,7 @@ def test_multithreaded_env_cast(
28162819
assert td_device.device == torch.device(device), env_multithread
28172820
env_multithread.close()
28182821

2822+
@pytest.mark.gpu
28192823
@pytest.mark.skipif(not _has_gym, reason="no gym")
28202824
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected")
28212825
@pytest.mark.parametrize("frame_skip", [4])
@@ -3097,6 +3101,7 @@ def test_brax_automatic_cache_clearing_parameter(self, envname, device, freq):
30973101
out_td, next_td = env.step_and_maybe_reset(next_td)
30983102
assert env._step_count == i + 1
30993103

3104+
@pytest.mark.gpu
31003105
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
31013106
def test_brax_kwargs_preserved_with_seed(self, envname, device):
31023107
"""Test that kwargs like camera_id are preserved when seed is provided.

Commit comments (0)