Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/unittest/linux/scripts/run_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,28 @@ fi

TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed

# GPU test filtering: Run GPU-only tests on GPU machines, CPU-only tests on CPU machines.
# This avoids running ~2000+ tests on expensive GPU machines when only ~30 require GPU.
# Tests are marked with @pytest.mark.gpu if they require CUDA.
#
# Set TORCHRL_GPU_FILTER=0 to disable this optimization and run all tests.
#
# We use an array to handle the marker expression properly (avoids quoting issues):
# expanding "${GPU_MARKER_FILTER[@]}" yields zero words when the filter is disabled
# and exactly two words (-m, <expr>) when enabled.
GPU_MARKER_FILTER=()
if [ "${TORCHRL_GPU_FILTER:-1}" = "1" ]; then
  # Use POSIX '=' (not the bashism '==') inside [ ] for consistency with the
  # TORCHRL_GPU_FILTER test above.
  if [ "${CU_VERSION:-}" = "cpu" ]; then
    # CPU job: run only tests that do NOT require GPU
    GPU_MARKER_FILTER=(-m 'not gpu')
    echo "GPU filtering enabled: Running CPU-only tests (excluding @pytest.mark.gpu)"
  else
    # GPU job: run only tests that require GPU
    GPU_MARKER_FILTER=(-m gpu)
    echo "GPU filtering enabled: Running GPU-only tests (@pytest.mark.gpu)"
  fi
else
  echo "GPU filtering disabled: Running all tests"
fi

export PYTORCH_TEST_WITH_SLOW='1'
python -m torch.utils.collect_env

Expand All @@ -292,6 +314,7 @@ run_distributed_tests() {
local json_report_args="--json-report --json-report-file=${json_report_dir}/test-results-distributed.json --json-report-indent=2"

# Run both test_distributed.py and test_rb_distributed.py (both use torch.distributed)
# Note: distributed tests always run on GPU, no need for GPU_MARKER_FILTER here
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py test/test_rb_distributed.py \
${json_report_args} \
--instafail --durations 200 -vv --capture no \
Expand Down Expand Up @@ -327,12 +350,14 @@ run_non_distributed_tests() {
1)
echo "Running shard 1: test_transforms.py only"
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_transforms.py \
"${GPU_MARKER_FILTER[@]}" \
${json_report_args} \
${common_args}
;;
2)
echo "Running shard 2: test_envs.py and test_collectors.py"
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_envs.py test/test_collectors.py \
"${GPU_MARKER_FILTER[@]}" \
${json_report_args} \
${common_args}
;;
Expand All @@ -344,13 +369,15 @@ run_non_distributed_tests() {
--ignore test/test_envs.py \
--ignore test/test_collectors.py \
${xdist_args} \
"${GPU_MARKER_FILTER[@]}" \
${json_report_args} \
${common_args}
;;
all|"")
echo "Running all tests (no sharding)"
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
${common_ignores} \
"${GPU_MARKER_FILTER[@]}" \
${json_report_args} \
${common_args}
;;
Expand Down
73 changes: 73 additions & 0 deletions .github/workflows/validate-test-partitioning.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Validates that GPU/CPU test partitioning covers all tests
#
# This workflow ensures that:
# 1. Tests marked with @pytest.mark.gpu + tests not marked = all tests
# 2. No tests are accidentally excluded from CI
#
# Runs on PRs to catch partitioning issues before merge.
name: Validate Test Partitioning

on:
  pull_request:
  push:
    branches: [main, nightly]
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  validate:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install minimal dependencies
        run: |
          pip install pytest tensordict
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          # Skip editable install - just add to PYTHONPATH for test collection
      - name: Validate test partitioning
        env:
          PYTHONPATH: ${{ github.workspace }}
        run: |
          set -e
          echo "=================================================="
          echo " TEST PARTITIONING VALIDATION"
          echo "=================================================="
          # Collect test counts.
          # Note: We ignore test_loggers.py due to torchvision operator issues on CPU-only.
          # The "|| echo 0" fallbacks matter: under `set -e`, an empty grep match
          # (e.g. pytest collection failed, with stderr hidden by 2>/dev/null)
          # would otherwise abort the script with no diagnostic, or leave empty
          # strings that break the arithmetic below.
          ALL=$(pytest --collect-only -q test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+" || echo 0)
          GPU=$(pytest --collect-only -q -m gpu test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+" || echo 0)
          CPU=$(pytest --collect-only -q -m "not gpu" test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+" || echo 0)
          # Sanity check: zero collected tests means collection itself broke,
          # which would otherwise masquerade as a "valid" 0 == 0 + 0 partition.
          if [ "$ALL" -eq 0 ]; then
            echo "❌ FAIL: pytest collected 0 tests — collection is broken, not the partitioning."
            exit 1
          fi
          echo ""
          echo "Total tests: $ALL"
          echo "GPU tests (@pytest.mark.gpu): $GPU"
          echo "CPU tests (not gpu): $CPU"
          echo "GPU + CPU: $((GPU + CPU))"
          echo ""
          # Validate: GPU + CPU should equal ALL
          if [ "$((GPU + CPU))" -eq "$ALL" ]; then
            echo "✅ PASS: Test partitioning is valid!"
            echo " All tests are accounted for."
          else
            echo "❌ FAIL: Test partitioning mismatch!"
            echo " GPU ($GPU) + CPU ($CPU) = $((GPU + CPU)), but total is $ALL"
            echo ""
            echo " This means some tests are either:"
            echo " - Missing the @pytest.mark.gpu marker (if they require CUDA)"
            echo " - Being excluded unintentionally"
            exit 1
          fi
2 changes: 2 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ addopts =
--tb=native
markers =
unity_editor
slow: mark test as slow to run
gpu: mark test as requiring a GPU (CUDA device)
testpaths =
test
xfail_strict = True
1 change: 1 addition & 0 deletions test/compile/test_compile_collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def test_compiled_policy(self, collector_cls, compile_policy, device):
collector.shutdown()
del collector

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
@pytest.mark.parametrize(
"collector_cls",
Expand Down
3 changes: 3 additions & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ def pytest_runtest_setup(item):

def pytest_configure(config):
    """Register the project's custom pytest markers.

    Registering markers here (in addition to pytest.ini) suppresses
    ``PytestUnknownMarkWarning`` and makes ``-m slow`` / ``-m gpu``
    filtering work when tests run outside the repo root.
    """
    custom_markers = (
        "slow: mark test as slow to run",
        "gpu: mark test as requiring a GPU (CUDA device)",
    )
    for marker in custom_markers:
        config.addinivalue_line("markers", marker)


def pytest_collection_modifyitems(config, items):
Expand Down
3 changes: 3 additions & 0 deletions test/llm/test_llm_updaters.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def get_open_port():
)


@pytest.mark.gpu
@pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
@pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
Expand Down Expand Up @@ -415,6 +416,7 @@ def test_local_llm_specific_features(self, target_vllm_engine):
"See LLM_TEST_ISSUES.md for details.",
strict=False,
)
@pytest.mark.gpu
@pytest.mark.skipif(not _has_ray, reason="missing ray dependencies")
@pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies")
@pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
Expand Down Expand Up @@ -611,6 +613,7 @@ def test_weight_sync_vllm_collective_ray(self, request):
ray.shutdown()


@pytest.mark.gpu
@pytest.mark.xfail(
reason="AsyncVLLM tests fail due to Ray placement group timeout. "
"See LLM_TEST_ISSUES.md for details.",
Expand Down
2 changes: 2 additions & 0 deletions test/llm/test_vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def sampling_params():
class TestAsyncVLLMIntegration:
"""Integration tests for AsyncVLLM with real models."""

@pytest.mark.gpu
@pytest.mark.skipif(not _has_vllm, reason="vllm not available")
@pytest.mark.skipif(not _has_ray, reason="ray not available")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
Expand Down Expand Up @@ -111,6 +112,7 @@ def test_vllm_api_compatibility(self, sampling_params):
finally:
service.shutdown()

@pytest.mark.gpu
@pytest.mark.skipif(not _has_vllm, reason="vllm not available")
@pytest.mark.skipif(not _has_ray, reason="ray not available")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
Expand Down
1 change: 1 addition & 0 deletions test/llm/test_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2104,6 +2104,7 @@ def test_log_probs_consistency(
"See LLM_TEST_ISSUES.md for details.",
strict=False,
)
@pytest.mark.gpu
@pytest.mark.skipif(not _has_vllm, reason="vllm not available")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_sync_async_vllm_strict_equivalence(
Expand Down
4 changes: 4 additions & 0 deletions test/test_collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2333,6 +2333,7 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
def _set_seed(self, seed: int | None) -> None:
...

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device")
@pytest.mark.parametrize("env_device", ["cuda:0", "cpu"])
@pytest.mark.parametrize("storing_device", [None, "cuda:0", "cpu"])
Expand Down Expand Up @@ -2371,6 +2372,7 @@ def test_no_synchronize(self, env_device, storing_device, no_cuda_sync):
assert u == i, i
mock_synchronize.assert_not_called()

@pytest.mark.gpu
@pytest.mark.parametrize("device", ["cuda", "cpu"])
@pytest.mark.parametrize("storing_device", ["cuda", "cpu"])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found")
Expand Down Expand Up @@ -3162,6 +3164,7 @@ def test_multi_collector_consistency(
assert_allclose_td(c2.unsqueeze(0), d2)


@pytest.mark.gpu
@pytest.mark.skipif(
not torch.cuda.is_available() and (not has_mps()),
reason="No casting if no cuda",
Expand Down Expand Up @@ -3363,6 +3366,7 @@ def test_param_sync_mixed_device(
col.shutdown()
del col

@pytest.mark.gpu
@pytest.mark.skipif(
not torch.cuda.is_available() or torch.cuda.device_count() < 3,
reason="requires at least 3 CUDA devices",
Expand Down
5 changes: 5 additions & 0 deletions test/test_envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,7 @@ def test_auto_spec(self, env_type):
env.auto_specs_(policy, tensordict=td.copy(), observation_key=obs_vals)
env.check_env_specs(tensordict=td.copy())

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device found.")
@pytest.mark.parametrize("break_when_any_done", [True, False])
def test_auto_cast_to_device(self, break_when_any_done):
Expand Down Expand Up @@ -1526,6 +1527,7 @@ def test_parallel_env_with_policy(
# env_serial.close()
env0.close(raise_if_closed=False)

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
@pytest.mark.parametrize("heterogeneous", [False, True])
def test_transform_env_transform_no_device(
Expand Down Expand Up @@ -1638,6 +1640,7 @@ def test_parallel_env_custom_method(self, parallel, maybe_fork_ParallelEnv):
finally:
env.close(raise_if_closed=False)

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on")
@pytest.mark.skipif(not _has_gym, reason="no gym")
@pytest.mark.parametrize("frame_skip", [4])
Expand Down Expand Up @@ -1742,6 +1745,7 @@ def test_parallel_env_cast(
env_serial.close(raise_if_closed=False)
env0.close(raise_if_closed=False)

@pytest.mark.gpu
@pytest.mark.skipif(not _has_gym, reason="no gym")
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected")
@pytest.mark.parametrize("frame_skip", [4])
Expand Down Expand Up @@ -2726,6 +2730,7 @@ def test_marl_group_type(group_type):
check_marl_grouping(group_type.get_group_map(agent_names), agent_names)


@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device")
class TestConcurrentEnvs:
"""Concurrent parallel envs on multiple procs can interfere."""
Expand Down
5 changes: 5 additions & 0 deletions test/test_libs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2157,6 +2157,7 @@ def test_set_seed_and_reset_works(self):

assert isinstance(td, TensorDict)

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
def test_dmcontrol_kwargs_preserved_with_seed(self):
"""Test that kwargs like camera_id are preserved when seed is provided.
Expand All @@ -2182,6 +2183,7 @@ def test_dmcontrol_kwargs_preserved_with_seed(self):
finally:
env.close()

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
@pytest.mark.parametrize("env_name,task", [["cheetah", "run"]])
@pytest.mark.parametrize("frame_skip", [1, 3])
Expand Down Expand Up @@ -2849,6 +2851,7 @@ def test_multithread_env_shutdown(self):
assert not env.is_closed
env.close()

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on")
@pytest.mark.skipif(not _has_gym, reason="no gym")
@pytest.mark.parametrize("frame_skip", [4])
Expand Down Expand Up @@ -2889,6 +2892,7 @@ def test_multithreaded_env_cast(
assert td_device.device == torch.device(device), env_multithread
env_multithread.close()

@pytest.mark.gpu
@pytest.mark.skipif(not _has_gym, reason="no gym")
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected")
@pytest.mark.parametrize("frame_skip", [4])
Expand Down Expand Up @@ -3170,6 +3174,7 @@ def test_brax_automatic_cache_clearing_parameter(self, envname, device, freq):
out_td, next_td = env.step_and_maybe_reset(next_td)
assert env._step_count == i + 1

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
def test_brax_kwargs_preserved_with_seed(self, envname, device):
"""Test that kwargs like camera_id are preserved when seed is provided.
Expand Down
1 change: 1 addition & 0 deletions test/test_rb.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,7 @@ def test_state_dict(self, storage_type, data_type):
storage2.get(range(10))
)

@pytest.mark.gpu
@pytest.mark.skipif(
not torch.cuda.device_count(),
reason="not cuda device found to test rb storage.",
Expand Down
2 changes: 2 additions & 0 deletions test/test_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3001,6 +3001,7 @@ def test_stack_zero_shape(self, stack_dim):
assert r["a"].shape == torch.Size([*shape, 1, 3, 2]) # access tensor
assert (r["a"] == 0).all()

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda")
@pytest.mark.parametrize("stack_dim", [0, 1, 2, -3, -2, -1])
def test_to(self, stack_dim):
Expand Down Expand Up @@ -3958,6 +3959,7 @@ def test_encode(self):
assert r.get("nontensor").shape == (1,)


@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.is_available(), reason="not cuda device")
def test_device_ordinal():
device = torch.device("cpu")
Expand Down
4 changes: 4 additions & 0 deletions test/test_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1292,6 +1292,7 @@ def test_constant_padding(self, padding_value):
assert (cat_td.get("cat_first_key") == padding_value).sum() == N - 4


@pytest.mark.gpu
@pytest.mark.skipif(not _has_tv, reason="torchvision not installed")
@pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing R3M on cuda only")
@pytest.mark.parametrize("device", [torch.device("cuda:0")])
Expand Down Expand Up @@ -8788,6 +8789,7 @@ def test_transform_env(self):
assert (env.reset()["_eps_gSDE"] != 0.0).all()


@pytest.mark.gpu
@pytest.mark.skipif(not _has_tv, reason="torchvision not installed")
@pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing VIP on cuda only")
@pytest.mark.parametrize("device", [torch.device("cuda:0")])
Expand Down Expand Up @@ -9259,6 +9261,7 @@ def test_vip_spec_against_real(self, model, tensor_pixels_key, device):
assert set(expected_keys) == set(transformed_env.rollout(3).keys(True))


@pytest.mark.gpu
@pytest.mark.skipif(not _has_vc, reason="vc_models not installed")
@pytest.mark.skipif(not torch.cuda.device_count(), reason="VC1 should run on cuda")
@pytest.mark.parametrize("device", [torch.device("cuda:0")])
Expand Down Expand Up @@ -10952,6 +10955,7 @@ def test_finitetensordictcheck(self, device):
with pytest.raises(ValueError, match="Encountered a non-finite tensor"):
ftd(td)

@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device found")
@pytest.mark.parametrize("device", get_default_devices())
def test_pin_mem(self, device):
Expand Down
Loading