diff --git a/docs/source/multi_gpu_training.mdx b/docs/source/multi_gpu_training.mdx
index 122670f697..cdedd185e7 100644
--- a/docs/source/multi_gpu_training.mdx
+++ b/docs/source/multi_gpu_training.mdx
@@ -74,44 +74,38 @@ When you launch training with accelerate:
 3. **Gradient synchronization**: Gradients are synchronized across GPUs during backpropagation
 4. **Single process logging**: Only the main process logs to wandb and saves checkpoints

-## Learning Rate and Training Steps Scaling
+## Learning Rate and Steps Auto-Scaling

-**Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
+By default, LeRobot keeps your hyperparameters exactly as you set them. If you want automatic scaling when using multiple GPUs, enable it with `--auto_scale=true`.

-### Why No Automatic Scaling?
+When `--auto_scale=true` is enabled and training runs with multiple processes (GPUs), LeRobot will:

-Many distributed training frameworks automatically scale the learning rate by the number of GPUs (e.g., `lr = base_lr × num_gpus`).
-However, LeRobot keeps the learning rate exactly as you specify it.
+- Multiply the optimizer learning rate by the number of processes (linear LR scaling)
+- Divide the total number of training steps by the number of processes (rounded up)

-### When and How to Scale
+This keeps the total number of samples and overall training dynamics roughly consistent across different GPU counts.

-If you want to scale your hyperparameters when using multiple GPUs, you should do it manually:
-
-**Learning Rate Scaling:**
+Example with 2 GPUs:

 ```bash
-# Example: 2 GPUs with linear LR scaling
-# Base LR: 1e-4, with 2 GPUs -> 2e-4
 accelerate launch --num_processes=2 $(which lerobot-train) \
-  --optimizer.lr=2e-4 \
   --dataset.repo_id=lerobot/pusht \
-  --policy=act
+  --policy.type=act \
+  --batch_size=8 \
+  --steps=100000 \
+  --auto_scale=true
 ```

-**Training Steps Scaling:**
+The effective behavior will be approximately:

-Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:
+- Optimizer LR: `lr × 2`
+- Steps: `100000 → 50000`

-```bash
-# Example: 2 GPUs with effective batch size 2x larger
-# Original: batch_size=8, steps=100000
-# With 2 GPUs: batch_size=8 (16 in total), steps=50000
-accelerate launch --num_processes=2 $(which lerobot-train) \
-  --batch_size=8 \
-  --steps=50000 \
-  --dataset.repo_id=lerobot/pusht \
-  --policy=act
-```
+Notes:
+
+- Auto-scaling is skipped when resuming from a checkpoint to preserve training state.
+- If you prefer manual control, omit `--auto_scale` and set `--optimizer.lr` and `--steps` yourself.
+- Checkpoint and eval cadence: `auto_scale` currently does not adjust `save_freq` or `eval_freq`. If you want the same number of checkpoints/evaluations as a single-GPU run, consider scaling them as `ceil(freq / world_size)`. Whether to scale these by default is under discussion with maintainers.

 ## Notes

diff --git a/src/lerobot/configs/train.py b/src/lerobot/configs/train.py
index d17915c36e..c9799b2c2e 100644
--- a/src/lerobot/configs/train.py
+++ b/src/lerobot/configs/train.py
@@ -59,6 +59,11 @@ class TrainPipelineConfig(HubMixin):
     save_checkpoint: bool = True
     # Checkpoint is saved every `save_freq` training iterations and after the last training step.
     save_freq: int = 20_000
+    # When enabled, and if running under Accelerate with multiple processes, we:
+    # - multiply the learning rate by the number of processes (linear LR scaling)
+    # - divide the number of training steps by the number of processes
+    # This keeps the total number of samples and (approximately) the effective learning dynamics consistent.
+    auto_scale: bool = False
     use_policy_training_preset: bool = True
     optimizer: OptimizerConfig | None = None
     scheduler: LRSchedulerConfig | None = None
diff --git a/src/lerobot/scripts/lerobot_train.py b/src/lerobot/scripts/lerobot_train.py
index 0cc6e037fd..71216f212b 100644
--- a/src/lerobot/scripts/lerobot_train.py
+++ b/src/lerobot/scripts/lerobot_train.py
@@ -158,6 +158,62 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
     # When using accelerate, only the main process should log to avoid duplicate outputs
     is_main_process = accelerator.is_main_process

+    # Optionally auto-scale LR and steps when using multiple processes, BEFORE logging the config
+    def _scale_optimizer_lr(opt_cfg, factor: int):
+        """Scale learning rates within the optimizer config, including grouped configs."""
+        try:
+            if hasattr(opt_cfg, "lr") and isinstance(getattr(opt_cfg, "lr"), (int, float)):
+                opt_cfg.lr *= factor
+            # Handle multi-optimizer groups if present
+            if hasattr(opt_cfg, "optimizer_groups") and isinstance(opt_cfg.optimizer_groups, dict):
+                for _name, group in opt_cfg.optimizer_groups.items():
+                    if isinstance(group, dict) and "lr" in group and isinstance(group["lr"], (int, float)):
+                        group["lr"] *= factor
+        except (AttributeError, TypeError) as e:
+            # Best-effort scaling; if the structure is unexpected, log a warning instead of failing
+            logging.warning(f"Failed to scale optimizer lr: {e}")
+
+    if cfg.auto_scale and not cfg.resume:
+        world_size = max(1, accelerator.num_processes)
+        if world_size > 1:
+            old_steps = cfg.steps
+            # Linear LR scaling
+            _scale_optimizer_lr(cfg.optimizer, world_size)
+            # Also scale policy-level LR fields when using presets (e.g., optimizer_lr, optimizer_lr_backbone)
+            if cfg.use_policy_training_preset and cfg.policy is not None:
+                scaled_policy_lr_fields: list[tuple[str, float, float]] = []
+                for attr in dir(cfg.policy):
+                    if not attr.startswith("optimizer_lr"):
+                        continue
+                    try:
+                        val = getattr(cfg.policy, attr)
+                    except AttributeError:
+                        continue
+                    if isinstance(val, (int, float)):
+                        new_val = val * world_size
+                        setattr(cfg.policy, attr, new_val)
+                        scaled_policy_lr_fields.append((attr, val, new_val))
+            # Scale steps down so the total number of samples processed remains comparable.
+            # Use ceiling division so that at least the original total number of samples is processed across all GPUs.
+            # Implements ceil(steps / world_size)
+            cfg.steps = max(1, (cfg.steps + world_size - 1) // world_size)
+            if is_main_process:
+                logging.info(
+                    colored(
+                        f"Auto-scale enabled with world_size={world_size}: lr x{world_size}, steps {old_steps} -> {cfg.steps}",
+                        "cyan",
+                        attrs=["bold"],
+                    )
+                )
+                if cfg.use_policy_training_preset and cfg.policy is not None:
+                    for name, old_v, new_v in scaled_policy_lr_fields:
+                        logging.info(colored(f"Auto-scale policy {name}: {old_v} -> {new_v}", "cyan"))
+        else:
+            if is_main_process:
+                logging.info("Auto-scale enabled but single process detected; skipping scaling.")
+    elif cfg.auto_scale and cfg.resume and is_main_process:
+        logging.info("Auto-scale requested but resume=True; skipping scaling to preserve checkpoint state.")
+
     # Only log on main process
     if is_main_process:
         logging.info(pformat(cfg.to_dict()))
diff --git a/tests/training/test_auto_scale.py b/tests/training/test_auto_scale.py
new file mode 100644
index 0000000000..7e10b2346e
--- /dev/null
+++ b/tests/training/test_auto_scale.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+import pytest
+import torch
+
+
+def get_num_available_gpus():
+    if not torch.cuda.is_available():
+        return 0
+    return torch.cuda.device_count()
+
+
+def run_training_with_auto_scale(config_args, num_processes=2, temp_dir=None):
+    config_path = Path(temp_dir) / "accelerate_config.yaml"
+
+    with open(config_path, "w") as f:
+        f.write("compute_environment: LOCAL_MACHINE\n")
+        f.write("distributed_type: MULTI_GPU\n")
+        f.write("mixed_precision: 'no'\n")
+        f.write(f"num_processes: {num_processes}\n")
+        f.write("use_cpu: false\n")
+        f.write("gpu_ids: all\n")
+        f.write("downcast_bf16: 'no'\n")
+        f.write("machine_rank: 0\n")
+        f.write("main_training_function: main\n")
+        f.write("num_machines: 1\n")
+        f.write("rdzv_backend: static\n")
+        f.write("same_network: true\n")
+
+    cmd = [
+        "accelerate",
+        "launch",
+        "--config_file",
+        str(config_path),
+        "-m",
+        "lerobot.scripts.lerobot_train",
+    ] + config_args
+
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        env={**os.environ, "CUDA_VISIBLE_DEVICES": ",".join(map(str, range(num_processes)))},
+    )
+
+    return result
+
+
+@pytest.mark.skipif(
+    get_num_available_gpus() < 2,
+    reason="Auto-scale test requires at least 2 GPUs",
+)
+def test_auto_scale_steps_and_lr():
+    with tempfile.TemporaryDirectory() as temp_dir:
+        output_dir = Path(temp_dir) / "outputs"
+
+        base_steps = 20
+        args = [
+            "--dataset.repo_id=lerobot/pusht",
+            "--dataset.episodes=[0]",
+            "--policy.type=act",
+            "--policy.device=cuda",
+            "--policy.push_to_hub=false",
+            f"--output_dir={output_dir}",
+            "--batch_size=4",
+            f"--steps={base_steps}",
+            "--eval_freq=-1",
+            "--log_freq=5",
+            "--save_freq=10",
+            "--seed=42",
+            "--num_workers=0",
+            "--auto_scale=true",
+        ]
+
+        result = run_training_with_auto_scale(args, num_processes=2, temp_dir=temp_dir)
+
+        assert result.returncode == 0, (
+            f"Training failed with auto-scale enabled.\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+        )
+
+        # Check that the auto-scale log message reports the scaled step count (ceil(20 / 2) = 10)
+        combined = result.stdout + "\n" + result.stderr
+        assert "Auto-scale enabled with world_size=2" in combined
+        assert "steps 20 -> 10" in combined
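For reference, the scaling rule the patch applies can be summarized in a few lines of standalone Python. This is only an illustrative sketch of the arithmetic, not code from the patch; the variable names `world_size`, `base_lr`, `base_steps`, and `save_freq` are placeholders rather than config fields read this way by the trainer, and the authoritative behavior is the `auto_scale` branch in `lerobot_train.py` above.

```python
import math

# Example values matching the 2-GPU scenario from the docs.
world_size = 2        # accelerator.num_processes
base_lr = 1e-4        # learning rate as configured
base_steps = 100_000  # --steps as configured
save_freq = 20_000    # --save_freq (left untouched by auto_scale)

scaled_lr = base_lr * world_size                           # linear LR scaling -> 2e-4
scaled_steps = max(1, math.ceil(base_steps / world_size))  # 50_000; ceiling keeps total samples >= original
# Optional manual adjustment (see the docs note) to keep the same number of checkpoints as a 1-GPU run:
scaled_save_freq = math.ceil(save_freq / world_size)       # 10_000

print(scaled_lr, scaled_steps, scaled_save_freq)
```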