44 changes: 19 additions & 25 deletions docs/source/multi_gpu_training.mdx
@@ -74,44 +74,38 @@ When you launch training with accelerate:
3. **Gradient synchronization**: Gradients are synchronized across GPUs during backpropagation
4. **Single process logging**: Only the main process logs to wandb and saves checkpoints

## Learning Rate and Training Steps Scaling
## Learning Rate and Steps Auto-Scaling

**Important:** LeRobot does **NOT** automatically scale learning rates or training steps based on the number of GPUs. This gives you full control over your training hyperparameters.
By default, LeRobot keeps your hyperparameters exactly as you set them. If you want automatic scaling when using multiple GPUs, enable it with `--auto_scale=true`.

### Why No Automatic Scaling?
When `--auto_scale=true` is enabled and training runs with multiple processes (GPUs), LeRobot will:

Many distributed training frameworks automatically scale the learning rate by the number of GPUs (e.g., `lr = base_lr × num_gpus`).
However, LeRobot keeps the learning rate exactly as you specify it.
- Multiply the optimizer learning rate by the number of processes (linear LR scaling)
- Divide the total number of training steps by the number of processes

### When and How to Scale
This keeps the total number of samples and overall training dynamics roughly consistent across different GPU counts.

If you want to scale your hyperparameters when using multiple GPUs, you should do it manually:

**Learning Rate Scaling:**
Example with 2 GPUs:

```bash
# Example: 2 GPUs with linear LR scaling
# Base LR: 1e-4, with 2 GPUs -> 2e-4
accelerate launch --num_processes=2 $(which lerobot-train) \
--optimizer.lr=2e-4 \
--dataset.repo_id=lerobot/pusht \
--policy=act
--policy.type=act \
--batch_size=8 \
--steps=100000 \
--auto_scale=true
```

**Training Steps Scaling:**
The effective behavior will be approximately:

Since the effective batch size `bs` increases with multiple GPUs (batch_size × num_gpus), you may want to reduce the number of training steps proportionally:
- Optimizer LR: `lr × 2`
- Steps: `100000 → 50000`
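
For reference, the arithmetic can be written out explicitly. The sketch below is illustrative only; `scale_hparams` is not a LeRobot API, it just spells out the linear LR scaling and ceil-division of steps described above:

```python
# Illustrative helper (not part of LeRobot): the scaling applied by --auto_scale.
def scale_hparams(base_lr: float, base_steps: int, world_size: int) -> tuple[float, int]:
    scaled_lr = base_lr * world_size  # linear LR scaling
    scaled_steps = max(1, (base_steps + world_size - 1) // world_size)  # ceil division keeps total samples roughly constant
    return scaled_lr, scaled_steps

print(scale_hparams(1e-4, 100_000, 2))  # (0.0002, 50000)
```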

```bash
# Example: 2 GPUs with effective batch size 2x larger
# Original: batch_size=8, steps=100000
# With 2 GPUs: batch_size=8 (16 in total), steps=50000
accelerate launch --num_processes=2 $(which lerobot-train) \
--batch_size=8 \
--steps=50000 \
--dataset.repo_id=lerobot/pusht \
--policy=act
```
Notes:

- Auto-scaling is skipped when resuming from a checkpoint to preserve training state.
- If you prefer manual control, omit `--auto_scale` and set `--optimizer.lr` and `--steps` yourself.
- Checkpoint and eval cadence: `auto_scale` currently does not adjust `save_freq` or `eval_freq`. If you want the same number of checkpoints/evaluations as a single-GPU run, consider scaling them as `ceil(freq / world_size)`. Whether to scale these by default is under discussion with maintainers.
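
If you want to keep the cadence comparable by hand, the adjustment is the same ceil division. The values below are only an example; `--auto_scale` does not apply this for you:

```python
# Example only: manually rescaling checkpoint/eval cadence for a 2-GPU run.
world_size = 2
save_freq, eval_freq = 20_000, 5_000  # hypothetical single-GPU cadence

scaled_save_freq = (save_freq + world_size - 1) // world_size  # ceil(20000 / 2) = 10000
scaled_eval_freq = (eval_freq + world_size - 1) // world_size  # ceil(5000 / 2) = 2500
```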

## Notes

5 changes: 5 additions & 0 deletions src/lerobot/configs/train.py
@@ -58,6 +58,11 @@ class TrainPipelineConfig(HubMixin):
    save_checkpoint: bool = True
    # Checkpoint is saved every `save_freq` training iterations and after the last training step.
    save_freq: int = 20_000
    # When enabled, and if running under Accelerate with multiple processes, we:
    # - multiply the learning rate by the number of processes (linear LR scaling)
    # - divide the number of training steps by the number of processes
    # This keeps the total number of samples and (approximately) the effective learning dynamics consistent.
    auto_scale: bool = False
    use_policy_training_preset: bool = True
    optimizer: OptimizerConfig | None = None
    scheduler: LRSchedulerConfig | None = None
54 changes: 54 additions & 0 deletions src/lerobot/scripts/lerobot_train.py
@@ -158,6 +158,60 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
    # When using accelerate, only the main process should log to avoid duplicate outputs
    is_main_process = accelerator.is_main_process

    # Optionally auto-scale LR and steps when using multiple processes BEFORE logging config
    def _scale_optimizer_lr(opt_cfg, factor: int):
        """Scale learning rates within optimizer config, including grouped configs."""
        try:
            if hasattr(opt_cfg, "lr") and isinstance(getattr(opt_cfg, "lr"), (int, float)):
                opt_cfg.lr *= factor
            # Handle multi-optimizer groups if present
            if hasattr(opt_cfg, "optimizer_groups") and isinstance(opt_cfg.optimizer_groups, dict):
                for _name, group in opt_cfg.optimizer_groups.items():
                    if isinstance(group, dict) and "lr" in group and isinstance(group["lr"], (int, float)):
                        group["lr"] *= factor
        except Exception:
            # Best-effort scaling; if structure is unexpected, skip silently
            pass

    if cfg.auto_scale and not cfg.resume:
        world_size = max(1, accelerator.num_processes)
        if world_size > 1:
            old_steps = cfg.steps
            # Linear LR scaling
            _scale_optimizer_lr(cfg.optimizer, world_size)
            # Also scale policy-level LR fields when using presets (e.g., optimizer_lr, optimizer_lr_backbone)
            if cfg.use_policy_training_preset and cfg.policy is not None:
                scaled_policy_lr_fields: list[tuple[str, float, float]] = []
                for attr in dir(cfg.policy):
                    if not attr.startswith("optimizer_lr"):
                        continue
                    try:
                        val = getattr(cfg.policy, attr)
                    except Exception:
                        continue
                    if isinstance(val, (int, float)):
                        new_val = val * world_size
                        setattr(cfg.policy, attr, new_val)
                        scaled_policy_lr_fields.append((attr, val, new_val))
            # Scale steps down so total samples processed remains comparable
            cfg.steps = max(1, (cfg.steps + world_size - 1) // world_size)
            if is_main_process:
                logging.info(
                    colored(
                        f"Auto-scale enabled with world_size={world_size}: lr x{world_size}, steps {old_steps} -> {cfg.steps}",
                        "cyan",
                        attrs=["bold"],
                    )
                )
                if cfg.use_policy_training_preset and cfg.policy is not None:
                    for (name, old_v, new_v) in scaled_policy_lr_fields:
                        logging.info(colored(f"Auto-scale policy {name}: {old_v} -> {new_v}", "cyan"))
        else:
            if is_main_process:
                logging.info("Auto-scale enabled but single process detected; skipping scaling.")
    elif cfg.auto_scale and cfg.resume and is_main_process:
        logging.info("Auto-scale requested but resume=True; skipping scaling to preserve checkpoint state.")

    # Only log on main process
    if is_main_process:
        logging.info(pformat(cfg.to_dict()))
89 changes: 89 additions & 0 deletions tests/training/test_auto_scale.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python

import os
import subprocess
import tempfile
from pathlib import Path

import pytest
import torch


def get_num_available_gpus():
    if not torch.cuda.is_available():
        return 0
    return torch.cuda.device_count()


def run_training_with_auto_scale(config_args, num_processes=2, temp_dir=None):
    """Write a multi-GPU accelerate config, launch lerobot_train with it, and return the completed process."""
    config_path = Path(temp_dir) / "accelerate_config.yaml"

    with open(config_path, "w") as f:
        f.write("compute_environment: LOCAL_MACHINE\n")
        f.write("distributed_type: MULTI_GPU\n")
        f.write("mixed_precision: 'no'\n")
        f.write(f"num_processes: {num_processes}\n")
        f.write("use_cpu: false\n")
        f.write("gpu_ids: all\n")
        f.write("downcast_bf16: 'no'\n")
        f.write("machine_rank: 0\n")
        f.write("main_training_function: main\n")
        f.write("num_machines: 1\n")
        f.write("rdzv_backend: static\n")
        f.write("same_network: true\n")

    cmd = [
        "accelerate",
        "launch",
        "--config_file",
        str(config_path),
        "-m",
        "lerobot.scripts.lerobot_train",
    ] + config_args

    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        env={**os.environ, "CUDA_VISIBLE_DEVICES": ",".join(map(str, range(num_processes)))},
    )

    return result


@pytest.mark.skipif(
    get_num_available_gpus() < 2,
    reason="Auto-scale test requires at least 2 GPUs",
)
def test_auto_scale_steps_and_lr():
    with tempfile.TemporaryDirectory() as temp_dir:
        output_dir = Path(temp_dir) / "outputs"

        base_steps = 20
        args = [
            "--dataset.repo_id=lerobot/pusht",
            "--dataset.episodes=[0]",
            "--policy.type=act",
            "--policy.device=cuda",
            "--policy.push_to_hub=false",
            f"--output_dir={output_dir}",
            "--batch_size=4",
            f"--steps={base_steps}",
            "--eval_freq=-1",
            "--log_freq=5",
            "--save_freq=10",
            "--seed=42",
            "--num_workers=0",
            "--auto_scale=true",
        ]

        result = run_training_with_auto_scale(args, num_processes=2, temp_dir=temp_dir)

        assert result.returncode == 0, (
            f"Training failed with auto-scale enabled.\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
        )

        # Check for the auto-scale log message indicating steps were scaled
        combined = result.stdout + "\n" + result.stderr
        assert "Auto-scale enabled with world_size=2" in combined
        assert "steps 20 -> 10" in combined