
Commit 54abe29

Merge branch 'main' of https://github.com/pytorch/rl
2 parents: 8be545b + bec4498

38 files changed: +569 −186 lines

.github/workflows/build-wheels-aarch64-linux.yml

Lines changed: 13 additions & 0 deletions
@@ -12,6 +12,12 @@ on:
       # Release candidate tags look like: v1.11.0-rc1
       - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:
+    inputs:
+      build-cpu:
+        description: 'Build CPU wheels'
+        required: false
+        type: boolean
+        default: true
   workflow_call:
     inputs:
       test-infra-ref:
@@ -29,6 +35,11 @@ on:
         required: false
         type: string
         default: ''
+      with-cpu:
+        description: 'Build with CPU (enable/disable)'
+        required: false
+        type: string
+        default: 'enable'
 
 permissions:
   id-token: write
@@ -48,7 +59,9 @@ jobs:
       os: linux-aarch64
       test-infra-repository: pytorch/test-infra
       test-infra-ref: ${{ inputs.test-infra-ref || 'main' }}
+      # aarch64 only supports CPU builds
       with-cuda: disable
+      with-cpu: ${{ github.event_name == 'workflow_dispatch' && (inputs.build-cpu && 'enable' || 'disable') || inputs.with-cpu || 'enable' }}
       channel: ${{ inputs.channel || '' }}
       use-only-dl-pytorch-org: ${{ inputs.channel == 'release' && 'true' || 'false' }}
   build:

.github/workflows/build-wheels-linux.yml

Lines changed: 37 additions & 0 deletions
@@ -12,6 +12,22 @@ on:
       # Release candidate tags look like: v1.11.0-rc1
       - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:
+    inputs:
+      build-cpu:
+        description: 'Build CPU wheels'
+        required: false
+        type: boolean
+        default: true
+      build-cuda:
+        description: 'Build CUDA wheels'
+        required: false
+        type: boolean
+        default: false
+      build-rocm:
+        description: 'Build ROCm wheels'
+        required: false
+        type: boolean
+        default: false
   workflow_call:
     inputs:
       test-infra-ref:
@@ -29,6 +45,21 @@ on:
         required: false
         type: string
         default: ''
+      with-cuda:
+        description: 'Build with CUDA (enable/disable)'
+        required: false
+        type: string
+        default: 'enable'
+      with-rocm:
+        description: 'Build with ROCm (enable/disable)'
+        required: false
+        type: string
+        default: 'enable'
+      with-cpu:
+        description: 'Build with CPU (enable/disable)'
+        required: false
+        type: string
+        default: 'enable'
 
 permissions:
   id-token: write
@@ -50,6 +81,12 @@ jobs:
       test-infra-ref: ${{ inputs.test-infra-ref || 'main' }}
       channel: ${{ inputs.channel || '' }}
       use-only-dl-pytorch-org: ${{ inputs.channel == 'release' && 'true' || 'false' }}
+      # For workflow_dispatch: convert boolean to enable/disable string
+      # For workflow_call: use the string input directly
+      # Default: enable all variants
+      with-cuda: ${{ github.event_name == 'workflow_dispatch' && (inputs.build-cuda && 'enable' || 'disable') || inputs.with-cuda || 'enable' }}
+      with-rocm: ${{ github.event_name == 'workflow_dispatch' && (inputs.build-rocm && 'enable' || 'disable') || inputs.with-rocm || 'enable' }}
+      with-cpu: ${{ github.event_name == 'workflow_dispatch' && (inputs.build-cpu && 'enable' || 'disable') || inputs.with-cpu || 'enable' }}
   build:
     needs: generate-matrix
     strategy:
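
The `with-*` expressions above pack the dispatch/call logic into single lines of GitHub Actions expression syntax. As a reading aid only (not part of the workflow), the Python sketch below mirrors how `with-cuda` resolves; `resolve_with_cuda`, `event_name`, `build_cuda` and `with_cuda` are illustrative names standing in for `github.event_name`, `inputs.build-cuda` and `inputs.with-cuda`.

```python
# Illustrative mirror of the with-cuda expression in build-wheels-linux.yml.
def resolve_with_cuda(event_name: str, build_cuda: bool, with_cuda: str) -> str:
    if event_name == "workflow_dispatch":
        # checkbox boolean -> the 'enable'/'disable' string the matrix expects
        return "enable" if build_cuda else "disable"
    # workflow_call: pass the string input through; empty falls back to 'enable'
    return with_cuda or "enable"


assert resolve_with_cuda("workflow_dispatch", True, "") == "enable"
assert resolve_with_cuda("workflow_dispatch", False, "") == "disable"
assert resolve_with_cuda("workflow_call", False, "disable") == "disable"
assert resolve_with_cuda("workflow_call", False, "") == "enable"
```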

.github/workflows/build-wheels-m1.yml

Lines changed: 13 additions & 0 deletions
@@ -12,6 +12,12 @@ on:
       # Release candidate tags look like: v1.11.0-rc1
       - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:
+    inputs:
+      build-cpu:
+        description: 'Build CPU wheels'
+        required: false
+        type: boolean
+        default: true
   workflow_call:
     inputs:
       test-infra-ref:
@@ -29,6 +35,11 @@ on:
         required: false
         type: string
         default: ''
+      with-cpu:
+        description: 'Build with CPU (enable/disable)'
+        required: false
+        type: string
+        default: 'enable'
 
 permissions:
   id-token: write
@@ -50,6 +61,8 @@ jobs:
       test-infra-ref: ${{ inputs.test-infra-ref || 'main' }}
       channel: ${{ inputs.channel || '' }}
       use-only-dl-pytorch-org: ${{ inputs.channel == 'release' && 'true' || 'false' }}
+      # macOS only supports CPU builds
+      with-cpu: ${{ github.event_name == 'workflow_dispatch' && (inputs.build-cpu && 'enable' || 'disable') || inputs.with-cpu || 'enable' }}
   build:
     needs: generate-matrix
     strategy:

.github/workflows/build-wheels-windows.yml

Lines changed: 26 additions & 0 deletions
@@ -12,6 +12,17 @@ on:
       # Release candidate tags look like: v1.11.0-rc1
       - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:
+    inputs:
+      build-cpu:
+        description: 'Build CPU wheels'
+        required: false
+        type: boolean
+        default: true
+      build-cuda:
+        description: 'Build CUDA wheels'
+        required: false
+        type: boolean
+        default: false
   workflow_call:
     inputs:
       test-infra-ref:
@@ -29,6 +40,16 @@ on:
         required: false
         type: string
         default: ''
+      with-cuda:
+        description: 'Build with CUDA (enable/disable)'
+        required: false
+        type: string
+        default: 'enable'
+      with-cpu:
+        description: 'Build with CPU (enable/disable)'
+        required: false
+        type: string
+        default: 'enable'
 
 permissions:
   id-token: write
@@ -50,6 +71,11 @@ jobs:
       test-infra-ref: ${{ inputs.test-infra-ref || 'main' }}
       channel: ${{ inputs.channel || '' }}
       use-only-dl-pytorch-org: ${{ inputs.channel == 'release' && 'true' || 'false' }}
+      # For workflow_dispatch: convert boolean to enable/disable string
+      # For workflow_call: use the string input directly
+      # Default: enable all variants (Windows has no ROCm)
+      with-cuda: ${{ github.event_name == 'workflow_dispatch' && (inputs.build-cuda && 'enable' || 'disable') || inputs.with-cuda || 'enable' }}
+      with-cpu: ${{ github.event_name == 'workflow_dispatch' && (inputs.build-cpu && 'enable' || 'disable') || inputs.with-cpu || 'enable' }}
   build:
     needs: generate-matrix
     strategy:

.github/workflows/release.yml

Lines changed: 27 additions & 17 deletions
@@ -16,10 +16,10 @@
 # - main, nightly, PRs: install from git (latest dev)
 # - Can be overridden with tensordict_source input
 #
-# Wheel Variants:
-# - cpu (default): Recommended for torchrl - avoids duplicate filename conflicts
-# - gpu: Only CUDA builds
-# - all: All variants (with deduplication to prevent corruption)
+# Wheel Variants (selectable via checkboxes):
+# - build_cpu (default: true): Recommended for torchrl - pure Python library
+# - build_cuda (default: false): CUDA builds (Linux, Windows)
+# - build_rocm (default: false): ROCm builds (Linux only)
 #
 # NOTE: This workflow is NOT automatically triggered on tag push to avoid
 # race conditions with wheel builds. Use workflow_dispatch to trigger releases.
@@ -57,15 +57,21 @@ on:
           - 'stable'
           - 'git'
         default: 'auto'
-      wheel_variants:
-        description: 'Which wheel variants to collect (cpu recommended for torchrl)'
+      build_cpu:
+        description: 'Build CPU wheels (recommended for torchrl - pure Python library)'
         required: false
-        type: choice
-        options:
-          - 'cpu'
-          - 'gpu'
-          - 'all'
-        default: 'cpu'
+        type: boolean
+        default: true
+      build_cuda:
+        description: 'Build CUDA wheels'
+        required: false
+        type: boolean
+        default: false
+      build_rocm:
+        description: 'Build ROCm wheels (Linux only)'
+        required: false
+        type: boolean
+        default: false
 
 # Ensure only one release workflow runs at a time
 # cancel-in-progress: true means new runs cancel previous ones
@@ -259,6 +265,9 @@
       test-infra-ref: ${{ inputs.pytorch_release || 'main' }}
       tensordict-source: ${{ inputs.tensordict_source || 'auto' }}
       channel: release
+      with-cpu: ${{ inputs.build_cpu && 'enable' || 'disable' }}
+      with-cuda: ${{ inputs.build_cuda && 'enable' || 'disable' }}
+      with-rocm: ${{ inputs.build_rocm && 'enable' || 'disable' }}
     secrets: inherit
 
   build-windows:
@@ -271,6 +280,8 @@
       test-infra-ref: ${{ inputs.pytorch_release || 'main' }}
       tensordict-source: ${{ inputs.tensordict_source || 'auto' }}
       channel: release
+      with-cpu: ${{ inputs.build_cpu && 'enable' || 'disable' }}
+      with-cuda: ${{ inputs.build_cuda && 'enable' || 'disable' }}
     secrets: inherit
 
   build-macos:
@@ -283,6 +294,7 @@
      test-infra-ref: ${{ inputs.pytorch_release || 'main' }}
       tensordict-source: ${{ inputs.tensordict_source || 'auto' }}
       channel: release
+      with-cpu: ${{ inputs.build_cpu && 'enable' || 'disable' }}
     secrets: inherit
 
   build-aarch64:
@@ -295,6 +307,7 @@
       test-infra-ref: ${{ inputs.pytorch_release || 'main' }}
       tensordict-source: ${{ inputs.tensordict_source || 'auto' }}
       channel: release
+      with-cpu: ${{ inputs.build_cpu && 'enable' || 'disable' }}
     secrets: inherit
 
 # =============================================================================
@@ -330,12 +343,9 @@
         uses: actions/download-artifact@v4
         with:
           path: wheels-raw
-          # Pattern based on wheel_variants input:
-          # - cpu: Only CPU builds (recommended - avoids duplicate wheel conflicts)
-          # - gpu: Only CUDA builds
-          # - all: All variants (requires deduplication)
+          # Download all pytorch_rl artifacts - filtering by selected variants happens below
           # pytorch/test-infra uploads artifacts named like: pytorch_rl__3.11_cpu_x86_64
-          pattern: ${{ inputs.wheel_variants == 'gpu' && 'pytorch_rl__*_cu*' || inputs.wheel_variants == 'all' && 'pytorch_rl*' || 'pytorch_rl__*_cpu_*' }}
+          pattern: pytorch_rl*
           merge-multiple: true
 
       - name: Deduplicate and verify wheels
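
Because every `pytorch_rl*` artifact is now downloaded regardless of the selected variants, the follow-up step has to drop same-named duplicates before anything is published. The sketch below is only a hypothetical illustration of that dedup-and-verify idea, not the workflow's actual script; `dedup_wheels` and its directory arguments are made-up names.

```python
# Hypothetical sketch of "deduplicate and verify": keep one copy of each wheel
# filename and fail loudly if two same-named wheels have different contents.
import hashlib
import shutil
from pathlib import Path


def dedup_wheels(raw_dir: str, out_dir: str) -> None:
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    seen: dict[str, str] = {}  # filename -> sha256 of the first copy seen
    for wheel in sorted(Path(raw_dir).rglob("*.whl")):
        digest = hashlib.sha256(wheel.read_bytes()).hexdigest()
        if wheel.name in seen:
            if seen[wheel.name] != digest:
                raise RuntimeError(f"conflicting copies of {wheel.name}")
            continue  # identical duplicate from another artifact, skip it
        seen[wheel.name] = digest
        shutil.copy2(wheel, out / wheel.name)
```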

knowledge_base/DM_CONTROL_INSTALLATION.md

Lines changed: 61 additions & 1 deletion
@@ -62,9 +62,69 @@ pip install dm-env dm-tree glfw lxml mujoco numpy pyopengl pyparsing scipy
 
 ### 2. EGL/rendering issues
 
-See [MUJOCO_INSTALLATION.md](./MUJOCO_INSTALLATION.md) for rendering-related 
+See [MUJOCO_INSTALLATION.md](./MUJOCO_INSTALLATION.md) for rendering-related
 issues, as dm_control uses MuJoCo for rendering.
 
+#### EGL multi-GPU device selection in containers (Docker / SLURM)
+
+When running `ParallelEnv` with pixel-based dm_control environments on a
+multi-GPU machine, all rendering contends on a **single GPU** — even if the
+host has 8 GPUs. This inflates per-worker render time by ~3x (e.g. 17ms serial
+→ 54ms with 8 workers sharing one GPU's EGL queue).
+
+**Root cause:** Inside Docker or SLURM containers, the NVIDIA container runtime
+only exposes the GPU(s) assigned to the job to EGL. `eglQueryDevicesEXT()`
+returns 1 device regardless of how many physical GPUs the host has.
+Setting `MUJOCO_EGL_DEVICE_ID` or `EGL_DEVICE_ID` to anything other than 0
+raises:
+
+```
+RuntimeError: MUJOCO_EGL_DEVICE_ID must be an integer between 0 and 0 (inclusive), got 1.
+```
+
+Unsetting `CUDA_VISIBLE_DEVICES` in the worker does **not** help — the
+container isolation happens at the NVIDIA driver/runtime level, below the
+environment variable.
+
+**Note on variable naming:** dm_control uses `MUJOCO_EGL_DEVICE_ID` internally
+(which maps to the same thing as MuJoCo's variable). Historically there was
+also `EGL_DEVICE_ID` used by older dm_control versions. See
+[dm_control#345](https://github.com/google-deepmind/dm_control/issues/345)
+for the unification discussion.
+
+**Upstream issues:**
+- [mujoco#572 — Cannot access all GPUs through EGL devices when using docker](https://github.com/google-deepmind/mujoco/issues/572)
+- [dm_control#345 — Unify EGL_DEVICE_ID with MUJOCO_EGL_DEVICE_ID](https://github.com/google-deepmind/dm_control/issues/345)
+
+**Workarounds:**
+
+1. **Configure container for full GPU access.** If you control the container
+   runtime, set `NVIDIA_VISIBLE_DEVICES=all` and
+   `NVIDIA_DRIVER_CAPABILITIES=all` so EGL can see all GPUs. Then assign
+   `MUJOCO_EGL_DEVICE_ID=<worker_idx % num_gpus>` per worker process
+   **before** dm_control is imported (the EGL display is created at import
+   time).
+
+2. **Run outside containers.** On bare metal, `eglQueryDevicesEXT()` correctly
+   returns all GPUs (plus the X server display, if any).
+
+3. **Reduce rendering overhead.** If multi-GPU rendering is not possible:
+   - Lower the rendering resolution (e.g. 64x64 instead of 84x84)
+   - Render at a lower frequency than the simulation step (frame-skip)
+   - Use state-only observations where possible — the IPC overhead is small
+     compared to rendering
+
+#### No batched rendering support in MuJoCo
+
+MuJoCo does not support batched GPU rendering — each environment renders its
+scene independently through its own OpenGL context. There is no API to submit
+multiple scenes to the GPU in one call.
+
+MuJoCo XLA (MJX) accelerates *simulation* on GPU via JAX but still requires
+copying data back to CPU for rendering through the standard `mujoco.Renderer`
+pipeline. See [mujoco#1604](https://github.com/google-deepmind/mujoco/issues/1604)
+for discussion on batched rendering support.
+
 ### 3. macOS ARM64 (Apple Silicon) specific issues
 
 On Apple Silicon Macs, ensure you're using native ARM Python, not Rosetta:
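
To make workaround 1 in the added documentation concrete, here is a minimal sketch (not part of the commit) of per-worker EGL device assignment with torchrl's `ParallelEnv`, assuming the container already exposes all GPUs (`NVIDIA_VISIBLE_DEVICES=all`); `NUM_GPUS` and `make_env` are illustrative names.

```python
import os
from functools import partial

NUM_GPUS = 8  # hypothetical GPU count on the host


def make_env(worker_idx: int):
    # Runs inside each ParallelEnv worker process. The env vars must be set
    # *before* dm_control is imported, since the EGL display is created at
    # import time.
    os.environ["MUJOCO_GL"] = "egl"
    os.environ["MUJOCO_EGL_DEVICE_ID"] = str(worker_idx % NUM_GPUS)

    from torchrl.envs import DMControlEnv  # import after the env vars are set

    return DMControlEnv("cheetah", "run", from_pixels=True)


if __name__ == "__main__":
    from torchrl.envs import ParallelEnv

    # One create_env_fn per worker so each worker gets its own EGL device id.
    env = ParallelEnv(8, [partial(make_env, i) for i in range(8)])
```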

sota-implementations/a2c/a2c_atari.py

Lines changed: 1 addition & 2 deletions
@@ -279,8 +279,7 @@ def update(batch, max_grad_norm=cfg.optim.max_grad_norm):
         if logger:
             metrics_to_log.update(timeit.todict(prefix="time"))
             metrics_to_log["time/speed"] = pbar.format_dict["rate"]
-            for key, value in metrics_to_log.items():
-                logger.log_scalar(key, value, collected_frames)
+            logger.log_metrics(metrics_to_log, collected_frames)
 
     collector.shutdown()
     if not test_env.is_closed:

sota-implementations/a2c/a2c_mujoco.py

Lines changed: 1 addition & 2 deletions
@@ -261,8 +261,7 @@ def update(batch):
         if logger:
             metrics_to_log.update(timeit.todict(prefix="time"))
             metrics_to_log["time/speed"] = pbar.format_dict["rate"]
-            for key, value in metrics_to_log.items():
-                logger.log_scalar(key, value, collected_frames)
+            logger.log_metrics(metrics_to_log, collected_frames)
 
     collector.shutdown()
     if not test_env.is_closed:

sota-implementations/cql/utils.py

Lines changed: 1 addition & 2 deletions
@@ -462,8 +462,7 @@ def make_continuous_cql_optimizer(cfg, loss_module):
 
 def log_metrics(logger, metrics, step):
     if logger is not None:
-        for metric_name, metric_value in metrics.items():
-            logger.log_scalar(metric_name, metric_value, step)
+        logger.log_metrics(metrics, step)
 
 
 def dump_video(module):

sota-implementations/crossq/utils.py

Lines changed: 1 addition & 2 deletions
@@ -305,8 +305,7 @@ def make_crossQ_optimizer(cfg, loss_module):
 
 
 def log_metrics(logger, metrics, step):
-    for metric_name, metric_value in metrics.items():
-        logger.log_scalar(metric_name, metric_value, step)
+    logger.log_metrics(metrics, step)
 
 
 def get_activation(activation: str):
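
All four scripts above switch from a per-key `log_scalar` loop to a single batched `logger.log_metrics` call. If you need to keep compatibility with a logger class that predates `log_metrics`, a small fallback helper like the sketch below (an assumption, not part of the commit) preserves the old behavior.

```python
def log_metrics(logger, metrics: dict, step: int) -> None:
    """Log a dict of scalars, preferring the batched API used in this commit."""
    if logger is None:
        return
    if hasattr(logger, "log_metrics"):
        # Single call covering the whole metrics dict.
        logger.log_metrics(metrics, step)
    else:
        # Fallback: the per-key loop this commit replaces.
        for name, value in metrics.items():
            logger.log_scalar(name, value, step)
```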
