44 changes: 19 additions & 25 deletions docs/source/multi_gpu_training.mdx
@@ -74,44 +74,38 @@ When you launch training with accelerate:
3. **Gradient synchronization**: Gradients are synchronized across GPUs during backpropagation
4. **Single process logging**: Only the main process logs to wandb and saves checkpoints

## Learning Rate and Training Steps Scaling
## Learning Rate and Steps Auto-Scaling

**Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
By default, LeRobot keeps your hyperparameters exactly as you set them. If you want automatic scaling when using multiple GPUs, enable it with `--auto_scale=true`.

### Why No Automatic Scaling?
When `--auto_scale=true` is enabled and training runs with multiple processes (GPUs), LeRobot will:

Many distributed training frameworks automatically scale the learning rate by the number of GPUs (e.g., `lr = base_lr × num_gpus`).
However, LeRobot keeps the learning rate exactly as you specify it.
- Multiply the optimizer learning rate by the number of processes (linear LR scaling)
- Divide the total number of training steps by the number of processes

### When and How to Scale
This keeps the total number of samples and overall training dynamics roughly consistent across different GPU counts.

If you want to scale your hyperparameters when using multiple GPUs, you should do it manually:

**Learning Rate Scaling:**
Example with 2 GPUs:

```bash
# Example: 2 GPUs with linear LR scaling
# Base LR: 1e-4, with 2 GPUs -> 2e-4
accelerate launch --num_processes=2 $(which lerobot-train) \
--optimizer.lr=2e-4 \
--dataset.repo_id=lerobot/pusht \
--policy=act
--policy.type=act \
--batch_size=8 \
--steps=100000 \
--auto_scale=true
```

**Training Steps Scaling:**
The effective behavior will be approximately:

Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:
- Optimizer LR: `lr × 2`
- Steps: `100000 → 50000`
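
For reference, the arithmetic can be written out explicitly. The sketch below is illustrative only; `scale_hparams` is not a LeRobot API, it just spells out the linear LR scaling and ceil-division of steps described above:

```python
# Illustrative helper (not part of LeRobot): the scaling applied by --auto_scale.
def scale_hparams(base_lr: float, base_steps: int, world_size: int) -> tuple[float, int]:
    scaled_lr = base_lr * world_size  # linear LR scaling
    scaled_steps = max(1, (base_steps + world_size - 1) // world_size)  # ceil division keeps total samples roughly constant
    return scaled_lr, scaled_steps

print(scale_hparams(1e-4, 100_000, 2))  # (0.0002, 50000)
```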

```bash
# Example: 2 GPUs with effective batch size 2x larger
# Original: batch_size=8, steps=100000
# With 2 GPUs: batch_size=8 (16 in total), steps=50000
accelerate launch --num_processes=2 $(which lerobot-train) \
--batch_size=8 \
--steps=50000 \
--dataset.repo_id=lerobot/pusht \
--policy=act
```
Notes:

- Auto-scaling is skipped when resuming from a checkpoint to preserve training state.
- If you prefer manual control, omit `--auto_scale` and set `--optimizer.lr` and `--steps` yourself.
- Checkpoint and eval cadence: `auto_scale` currently does not adjust `save_freq` or `eval_freq`. If you want the same number of checkpoints/evaluations as a single-GPU run, consider scaling them as `ceil(freq / world_size)`. Whether to scale these by default is under discussion with maintainers.
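
If you want to keep the cadence comparable by hand, the adjustment is the same ceil division. The values below are only an example; `--auto_scale` does not apply this for you:

```python
# Example only: manually rescaling checkpoint/eval cadence for a 2-GPU run.
world_size = 2
save_freq, eval_freq = 20_000, 5_000  # hypothetical single-GPU cadence

scaled_save_freq = (save_freq + world_size - 1) // world_size  # ceil(20000 / 2) = 10000
scaled_eval_freq = (eval_freq + world_size - 1) // world_size  # ceil(5000 / 2) = 2500
```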

## Notes

5 changes: 5 additions & 0 deletions src/lerobot/configs/train.py
@@ -58,6 +58,11 @@ class TrainPipelineConfig(HubMixin):
    save_checkpoint: bool = True
    # Checkpoint is saved every `save_freq` training iterations and after the last training step.
    save_freq: int = 20_000
    # When enabled, and if running under Accelerate with multiple processes, we:
    # - multiply the learning rate by the number of processes (linear LR scaling)
    # - divide the number of training steps by the number of processes
    # This keeps the total number of samples and (approximately) the effective learning dynamics consistent.
    auto_scale: bool = False
    use_policy_training_preset: bool = True
    optimizer: OptimizerConfig | None = None
    scheduler: LRSchedulerConfig | None = None
54 changes: 54 additions & 0 deletions src/lerobot/scripts/lerobot_train.py
@@ -158,6 +158,60 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
    # When using accelerate, only the main process should log to avoid duplicate outputs
    is_main_process = accelerator.is_main_process

    # Optionally auto-scale LR and steps when using multiple processes BEFORE logging config
    def _scale_optimizer_lr(opt_cfg, factor: int):
        """Scale learning rates within optimizer config, including grouped configs."""
        try:
            if hasattr(opt_cfg, "lr") and isinstance(getattr(opt_cfg, "lr"), (int, float)):
                opt_cfg.lr *= factor
            # Handle multi-optimizer groups if present
            if hasattr(opt_cfg, "optimizer_groups") and isinstance(opt_cfg.optimizer_groups, dict):
                for _name, group in opt_cfg.optimizer_groups.items():
                    if isinstance(group, dict) and "lr" in group and isinstance(group["lr"], (int, float)):
                        group["lr"] *= factor
        except Exception:
            # Best-effort scaling; if structure is unexpected, skip silently
            pass

    if cfg.auto_scale and not cfg.resume:
        world_size = max(1, accelerator.num_processes)
        if world_size > 1:
            old_steps = cfg.steps
            # Linear LR scaling
            _scale_optimizer_lr(cfg.optimizer, world_size)
            # Also scale policy-level LR fields when using presets (e.g., optimizer_lr, optimizer_lr_backbone)
            if cfg.use_policy_training_preset and cfg.policy is not None:
                scaled_policy_lr_fields: list[tuple[str, float, float]] = []
                for attr in dir(cfg.policy):
                    if not attr.startswith("optimizer_lr"):
                        continue
                    try:
                        val = getattr(cfg.policy, attr)
                    except Exception:
                        continue
                    if isinstance(val, (int, float)):
                        new_val = val * world_size
                        setattr(cfg.policy, attr, new_val)
                        scaled_policy_lr_fields.append((attr, val, new_val))
            # Scale steps down so total samples processed remains comparable
            cfg.steps = max(1, (cfg.steps + world_size - 1) // world_size)
            if is_main_process:
                logging.info(
                    colored(
                        f"Auto-scale enabled with world_size={world_size}: lr x{world_size}, steps {old_steps} -> {cfg.steps}",
                        "cyan",
                        attrs=["bold"],
                    )
                )
                if cfg.use_policy_training_preset and cfg.policy is not None:
                    for (name, old_v, new_v) in scaled_policy_lr_fields:
                        logging.info(colored(f"Auto-scale policy {name}: {old_v} -> {new_v}", "cyan"))
        else:
            if is_main_process:
                logging.info("Auto-scale enabled but single process detected; skipping scaling.")
    elif cfg.auto_scale and cfg.resume and is_main_process:
        logging.info("Auto-scale requested but resume=True; skipping scaling to preserve checkpoint state.")

    # Only log on main process
    if is_main_process:
        logging.info(pformat(cfg.to_dict()))
89 changes: 89 additions & 0 deletions tests/training/test_auto_scale.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python

import os
import subprocess
import tempfile
from pathlib import Path

import pytest
import torch


def get_num_available_gpus():
    if not torch.cuda.is_available():
        return 0
    return torch.cuda.device_count()


def run_training_with_auto_scale(config_args, num_processes=2, temp_dir=None):
    """Write a multi-GPU accelerate config, launch lerobot_train with it, and return the completed process."""
    config_path = Path(temp_dir) / "accelerate_config.yaml"

    with open(config_path, "w") as f:
        f.write("compute_environment: LOCAL_MACHINE\n")
        f.write("distributed_type: MULTI_GPU\n")
        f.write("mixed_precision: 'no'\n")
        f.write(f"num_processes: {num_processes}\n")
        f.write("use_cpu: false\n")
        f.write("gpu_ids: all\n")
        f.write("downcast_bf16: 'no'\n")
        f.write("machine_rank: 0\n")
        f.write("main_training_function: main\n")
        f.write("num_machines: 1\n")
        f.write("rdzv_backend: static\n")
        f.write("same_network: true\n")

    cmd = [
        "accelerate",
        "launch",
        "--config_file",
        str(config_path),
        "-m",
        "lerobot.scripts.lerobot_train",
    ] + config_args

    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        env={**os.environ, "CUDA_VISIBLE_DEVICES": ",".join(map(str, range(num_processes)))},
    )

    return result


@pytest.mark.skipif(
    get_num_available_gpus() < 2,
    reason="Auto-scale test requires at least 2 GPUs",
)
def test_auto_scale_steps_and_lr():
    with tempfile.TemporaryDirectory() as temp_dir:
        output_dir = Path(temp_dir) / "outputs"

        base_steps = 20
        args = [
            "--dataset.repo_id=lerobot/pusht",
            "--dataset.episodes=[0]",
            "--policy.type=act",
            "--policy.device=cuda",
            "--policy.push_to_hub=false",
            f"--output_dir={output_dir}",
            "--batch_size=4",
            f"--steps={base_steps}",
            "--eval_freq=-1",
            "--log_freq=5",
            "--save_freq=10",
            "--seed=42",
            "--num_workers=0",
            "--auto_scale=true",
        ]

        result = run_training_with_auto_scale(args, num_processes=2, temp_dir=temp_dir)

        assert result.returncode == 0, (
            f"Training failed with auto-scale enabled.\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
        )

        # Check for the auto-scale log message indicating steps were scaled
        combined = result.stdout + "\n" + result.stderr
        assert "Auto-scale enabled with world_size=2" in combined
        assert "steps 20 -> 10" in combined