28 changes: 24 additions & 4 deletions distributed/ddp-tutorial-series/README.md
@@ -15,7 +15,27 @@ Each code file extends upon the previous one. The series starts with a non-distr
* [slurm/setup_pcluster_slurm.md](slurm/setup_pcluster_slurm.md): instructions to set up an AWS cluster
* [slurm/config.yaml.template](slurm/config.yaml.template): configuration to set up an AWS cluster
* [slurm/sbatch_run.sh](slurm/sbatch_run.sh): slurm script to launch the training job




## Installation
```
pip install -r requirements.txt
```
## Running Examples
To run the examples for 20 epochs, saving a checkpoint every 5 epochs, use the following commands:
### Single GPU
```
python single_gpu.py 20 5
```
### Multi-GPU
```
python multigpu.py 20 5
```
### Multi-GPU Torchrun
```
torchrun --nnodes=1 --nproc_per_node=4 multigpu_torchrun.py 20 5
```
### Multi-Node
```
torchrun --nnodes=2 --nproc_per_node=4 multinode.py 20 5
```
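
The command above has to be launched on every node, and torchrun also needs a shared rendezvous endpoint so the two nodes can find each other. A minimal sketch, assuming the first node is reachable at the placeholder hostname `node0.example.com` with port 29500 open:
```
# run the same command on both nodes; node 0 hosts the rendezvous
torchrun --nnodes=2 --nproc_per_node=4 --rdzv_id=456 --rdzv_backend=c10d \
    --rdzv_endpoint=node0.example.com:29500 multinode.py 20 5
```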

For more details, check the [run_example.sh](run_example.sh) script.
12 changes: 9 additions & 3 deletions distributed/ddp-tutorial-series/multigpu.py
@@ -18,8 +18,14 @@ def ddp_setup(rank, world_size):
"""
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
torch.cuda.set_device(rank)
init_process_group(backend="nccl", rank=rank, world_size=world_size)

if torch.accelerator.is_available():
device = torch.device(f"{torch.accelerator.current_accelerator()}:{rank}")
torch.accelerator.set_device_index(rank)
print(f"Running on rank {rank} on device {device}")

backend = torch.distributed.get_default_backend_for_device(device)
init_process_group(backend=backend, rank=rank, world_size=world_size)

class Trainer:
def __init__(
@@ -100,5 +106,5 @@ def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_s
    parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
    args = parser.parse_args()

-    world_size = torch.cuda.device_count()
+    world_size = torch.accelerator.device_count()
    mp.spawn(main, args=(world_size, args.save_every, args.total_epochs, args.batch_size), nprocs=world_size)
16 changes: 13 additions & 3 deletions distributed/ddp-tutorial-series/multigpu_torchrun.py
@@ -11,8 +11,18 @@


def ddp_setup():
-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
-    init_process_group(backend="nccl")
+    rank = int(os.environ["LOCAL_RANK"])
+    if torch.accelerator.is_available():
+        device = torch.device(f"{torch.accelerator.current_accelerator()}:{rank}")
+        torch.accelerator.set_device_index(rank)
+        print(f"Running on rank {rank} on device {device}")
+    else:
+        device = torch.device("cpu")
+        print("Multi-GPU environment not detected")
+
+    backend = torch.distributed.get_default_backend_for_device(device)
+    torch.distributed.init_process_group(backend=backend, device_id=device)



class Trainer:
    def __init__(
@@ -37,7 +47,7 @@ def __init__(
        self.model = DDP(self.model, device_ids=[self.gpu_id])

    def _load_snapshot(self, snapshot_path):
-        loc = f"cuda:{self.gpu_id}"
+        loc = str(torch.accelerator.current_accelerator())
        snapshot = torch.load(snapshot_path, map_location=loc)
        self.model.load_state_dict(snapshot["MODEL_STATE"])
        self.epochs_run = snapshot["EPOCHS_RUN"]
15 changes: 12 additions & 3 deletions distributed/ddp-tutorial-series/multinode.py
@@ -11,8 +11,17 @@


def ddp_setup():
-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
-    init_process_group(backend="nccl")
+    rank = int(os.environ["LOCAL_RANK"])
+    if torch.accelerator.is_available():
+        device = torch.device(f"{torch.accelerator.current_accelerator()}:{rank}")
+        torch.accelerator.set_device_index(rank)
+        print(f"Running on rank {rank} on device {device}")
+    else:
+        device = torch.device("cpu")
+        print("Multi-GPU environment not detected")
+
+    backend = torch.distributed.get_default_backend_for_device(device)
+    torch.distributed.init_process_group(backend=backend, device_id=device)


class Trainer:
    def __init__(
@@ -38,7 +47,7 @@ def __init__(
        self.model = DDP(self.model, device_ids=[self.local_rank])

    def _load_snapshot(self, snapshot_path):
-        loc = f"cuda:{self.local_rank}"
+        loc = str(torch.accelerator.current_accelerator())
        snapshot = torch.load(snapshot_path, map_location=loc)
        self.model.load_state_dict(snapshot["MODEL_STATE"])
        self.epochs_run = snapshot["EPOCHS_RUN"]
2 changes: 1 addition & 1 deletion distributed/ddp-tutorial-series/requirements.txt
@@ -1 +1 @@
-torch>=1.11.0
+torch>=2.7
11 changes: 11 additions & 0 deletions distributed/ddp-tutorial-series/run_example.sh
@@ -0,0 +1,11 @@
#!/bin/bash
# bash run_example.sh {file_to_run.py} {num_gpus}
# where file_to_run = example to run. Default = 'example.py'
# num_gpus = num local gpus to use (must be at least 2). Default = 2

# samples to run include:
# multigpu_torchrun.py
# multinode.py
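# example invocation (assumes at least 2 local GPUs are available):
#   bash run_example.sh multigpu_torchrun.py 2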

echo "Launching ${1:-example.py} with ${2:-2} gpus"
torchrun --nnodes=1 --nproc_per_node=${2:-2} --rdzv_id=101 --rdzv_endpoint="localhost:5972" ${1:-example.py} 10 1
2 changes: 1 addition & 1 deletion distributed/ddp-tutorial-series/single_gpu.py
@@ -78,5 +78,5 @@ def main(device, total_epochs, save_every, batch_size):
    parser.add_argument('--batch_size', default=32, type=int, help='Input batch size on each device (default: 32)')
    args = parser.parse_args()

-    device = 0 # shorthand for cuda:0
+    device = 0
    main(device, args.total_epochs, args.save_every, args.batch_size)
7 changes: 7 additions & 0 deletions run_distributed_examples.sh
@@ -50,6 +50,13 @@ function distributed_tensor_parallelism() {
    uv run bash run_example.sh fsdp_tp_example.py || error "2D parallel example failed"
}

+function distributed_ddp-tutorial-series() {
+    uv run multigpu.py 10 1 || error "ddp tutorial series multigpu example failed"
+    uv run bash run_example.sh multigpu_torchrun.py || error "ddp tutorial series multigpu torchrun example failed"
+    uv run bash run_example.sh multinode.py || error "ddp tutorial series multinode example failed"
+    uv run single_gpu.py 10 1 || error "ddp tutorial series single gpu example failed"
+}

function distributed_ddp() {
    uv run main.py || error "ddp example failed"
}