[BugFix] Brax memory leak fix (#3052)

vmoens · web-flow · commit 32f7d72db5b4 · 2025-07-10T12:53:46.000+01:00
diff --git a/.github/unittest/linux_libs/scripts_brax/environment.yml b/.github/unittest/linux_libs/scripts_brax/environment.yml
@@ -21,3 +21,4 @@ dependencies:
     - hydra-core
     - jax[cuda12]
     - brax
+    - psutil
diff --git a/.github/unittest/linux_libs/scripts_brax/run_test.sh b/.github/unittest/linux_libs/scripts_brax/run_test.sh
@@ -8,6 +8,13 @@ conda activate ./env
 
 export PYTORCH_TEST_WITH_SLOW='1'
 export LAZY_LEGACY_OP=False
+
+# Configure JAX/XLA: allocate GPU memory on demand and expose only GPU 0
+export XLA_PYTHON_CLIENT_PREALLOCATE=false
+export XLA_PYTHON_CLIENT_ALLOCATOR=platform
+export TF_FORCE_GPU_ALLOW_GROWTH=true
+export CUDA_VISIBLE_DEVICES=0
+
 python -m torch.utils.collect_env
 # Avoid error: "fatal: unsafe repository"
 git config --global --add safe.directory '*'
@@ -28,7 +35,33 @@ export MAGNUM_LOG=verbose MAGNUM_GPU_VALIDATION=ON
 # this workflow only tests the libs
 python -c "import brax"
 python -c "import brax.envs"
-python -c "import jax"
+
+# Initialize JAX with proper GPU configuration
+python -c "
+import jax
+import jax.numpy as jnp
+import os
+
+# Redundant with the exports at the top of this script; harmless
+os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
+os.environ['XLA_PYTHON_CLIENT_ALLOCATOR'] = 'platform'
+
+# Report which backend JAX picked up (diagnostic only)
+try:
+    devices = jax.devices()
+    print(f'JAX devices: {devices}')
+    if any(d.platform == 'gpu' for d in devices):
+        print('JAX GPU is available')
+    else:
+        print('JAX CPU only')
+except Exception as e:
+    print(f'JAX initialization error: {e}')
+    # NOTE: this only affects this short-lived interpreter, not pytest below
+    os.environ['JAX_PLATFORM_NAME'] = 'cpu'
+    jax.config.update('jax_platform_name', 'cpu')
+    print('Falling back to JAX CPU')
+"
+
 python3 -c 'import torch;t = torch.ones([2,2], device="cuda:0");print(t);print("tensor device:" + str(t.device))'
 
 python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestBrax --error-for-skips
diff --git a/.github/workflows/test-linux-libs.yml b/.github/workflows/test-linux-libs.yml
@@ -21,39 +21,39 @@ permissions:
 
 jobs:
 
-  unittests-atari-dqn:
-    strategy:
-      matrix:
-        python_version: ["3.10"]
-        cuda_arch_version: ["12.8"]
-    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Data') }}
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    with:
-      repository: pytorch/rl
-      runner: "linux.g5.4xlarge.nvidia.gpu"
-      docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04"
-      timeout: 120
-      script: |
-        if [[ "${{ github.ref }}" =~ release/* ]]; then
-          export RELEASE=1
-          export TORCH_VERSION=stable
-        else
-          export RELEASE=0
-          export TORCH_VERSION=nightly
-        fi
-
-        set -euo pipefail
-        export PYTHON_VERSION="3.10"
-        export CU_VERSION="cu128"
-        export TAR_OPTIONS="--no-same-owner"
-        export UPLOAD_CHANNEL="nightly"
-        export TF_CPP_MIN_LOG_LEVEL=0
-        export TD_GET_DEFAULTS_TO_NONE=1
-
-        bash .github/unittest/linux_libs/scripts_ataridqn/setup_env.sh
-        bash .github/unittest/linux_libs/scripts_ataridqn/install.sh
-        bash .github/unittest/linux_libs/scripts_ataridqn/run_test.sh
-        bash .github/unittest/linux_libs/scripts_ataridqn/post_process.sh
+  # unittests-atari-dqn:
+  #   strategy:
+  #     matrix:
+  #       python_version: ["3.10"]
+  #       cuda_arch_version: ["12.8"]
+  #   if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Data') }}
+  #   uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+  #   with:
+  #     repository: pytorch/rl
+  #     runner: "linux.g5.4xlarge.nvidia.gpu"
+  #     docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04"
+  #     timeout: 120
+  #     script: |
+  #       if [[ "${{ github.ref }}" =~ release/* ]]; then
+  #         export RELEASE=1
+  #         export TORCH_VERSION=stable
+  #       else
+  #         export RELEASE=0
+  #         export TORCH_VERSION=nightly
+  #       fi
+
+  #       set -euo pipefail
+  #       export PYTHON_VERSION="3.10"
+  #       export CU_VERSION="cu128"
+  #       export TAR_OPTIONS="--no-same-owner"
+  #       export UPLOAD_CHANNEL="nightly"
+  #       export TF_CPP_MIN_LOG_LEVEL=0
+  #       export TD_GET_DEFAULTS_TO_NONE=1
+
+  #       bash .github/unittest/linux_libs/scripts_ataridqn/setup_env.sh
+  #       bash .github/unittest/linux_libs/scripts_ataridqn/install.sh
+  #       bash .github/unittest/linux_libs/scripts_ataridqn/run_test.sh
+  #       bash .github/unittest/linux_libs/scripts_ataridqn/post_process.sh
 
   unittests-brax:
     strategy:
diff --git a/test/test_libs.py b/test/test_libs.py
@@ -8,8 +8,10 @@
 import functools
 import gc
 import importlib.util
+import os
 import urllib.error
 
+
 _has_isaac = importlib.util.find_spec("isaacgym") is not None
 
 if _has_isaac:
@@ -19,7 +21,6 @@
     from torchrl.envs.libs.isaacgym import IsaacGymEnv
 import argparse
 import importlib
-import os
 
 import time
 import urllib
@@ -2414,6 +2415,28 @@ def test_env_device(self, env_name, frame_skip, transformed_out, device):
 @pytest.mark.parametrize("device", get_available_devices())
 @pytest.mark.parametrize("envname", ["fast"])
 class TestBrax:
+    @pytest.fixture(autouse=True)
+    def _setup_jax(self):
+        """Configure JAX for proper GPU initialization."""
+        import os
+
+        import jax
+
+        # Set JAX environment variables for better GPU handling
+        os.environ.setdefault("XLA_PYTHON_CLIENT_PREALLOCATE", "false")
+        os.environ.setdefault("XLA_PYTHON_CLIENT_ALLOCATOR", "platform")
+        os.environ.setdefault("TF_FORCE_GPU_ALLOW_GROWTH", "true")
+
+        # Try to initialize JAX with GPU, fallback to CPU if it fails
+        try:
+            jax.devices()
+        except Exception:
+            # Fallback to CPU
+            os.environ["JAX_PLATFORM_NAME"] = "cpu"
+            jax.config.update("jax_platform_name", "cpu")
+
+        yield
+
     @pytest.mark.parametrize("requires_grad", [False, True])
     def test_brax_constructor(self, envname, requires_grad, device):
         env0 = BraxEnv(envname, requires_grad=requires_grad, device=device)
@@ -2545,6 +2568,75 @@ def make_brax():
         tensordict = env.rollout(3)
         assert tensordict.shape == torch.Size([n, *batch_size, 3])
 
+    def test_brax_memory_leak(self, envname, device):
+        """Check that process RSS grows by < 100 MB over 200 steps with gradients."""
+        import psutil
+
+        process = psutil.Process(os.getpid())
+        env = BraxEnv(
+            envname,
+            batch_size=[10],
+            requires_grad=True,
+            device=device,
+        )
+        env.clear_cache()
+        gc.collect()
+        env.set_seed(0)
+        next_td = env.reset()
+        num_steps = 200
+        policy = TensorDictModule(
+            torch.nn.Linear(
+                env.observation_spec[env.observation_keys[0]].shape[-1],
+                env.action_spec.shape[-1],
+                device=device,
+            ),
+            in_keys=env.observation_keys[:1],
+            out_keys=["action"],
+        )
+        initial_memory = process.memory_info().rss / 1024 / 1024  # MB
+        for i in range(num_steps):
+            policy(next_td)
+            out_td, next_td = env.step_and_maybe_reset(next_td)
+            if i % 50 == 0:
+                loss = out_td["next", "observation"].sum()
+                loss.backward()
+                next_td = next_td.detach().clone()
+            # No forced gc.collect() here (presumably intentional; TODO confirm).
+        final_memory = process.memory_info().rss / 1024 / 1024  # MB
+        memory_increase = final_memory - initial_memory
+        assert (
+            memory_increase < 100
+        ), f"Memory leak with automatic clearing: {memory_increase:.2f} MB"
+
+    def test_brax_cache_clearing(self, envname, device):
+        env = BraxEnv(envname, batch_size=[1], requires_grad=True, device=device)
+        env.clear_cache()
+        for _ in range(5):
+            env.clear_cache()
+
+    @pytest.mark.parametrize("freq", [10, None, False])
+    def test_brax_automatic_cache_clearing_parameter(self, envname, device, freq):
+        env = BraxEnv(
+            envname,
+            batch_size=[1],
+            requires_grad=True,
+            device=device,
+            cache_clear_frequency=freq,
+        )
+        if freq is False:
+            assert env._cache_clear_frequency is False
+        elif freq is None:
+            assert env._cache_clear_frequency == 20  # Default value
+        else:
+            assert env._cache_clear_frequency == freq
+        env.set_seed(0)
+        next_td = env.reset()
+        for i in range(10):
+            action = env.action_spec.rand()
+            next_td["action"] = action
+            out_td, next_td = env.step_and_maybe_reset(next_td)
+            assert env._step_count == i + 1
+
 
 @pytest.mark.skipif(not _has_vmas, reason="vmas not installed")
 class TestVmas:
diff --git a/torchrl/data/datasets/atari_dqn.py b/torchrl/data/datasets/atari_dqn.py
@@ -411,6 +411,12 @@ def __init__(
         mp_start_method: str = "fork",
         **kwargs,
     ):
+        import warnings
+
+        warnings.warn(
+            "This dataset is no longer available. We are working on a fix, or possibly a deprecation.",
+            DeprecationWarning,
+        )
         if dataset_id not in self.available_datasets:
             raise ValueError(
                 "The dataseet_id is not part of the available datasets. The dataset should be named <game_name>/<run> "
diff --git a/torchrl/envs/libs/brax.py b/torchrl/envs/libs/brax.py