
Commit fdb1acd

support multi gpu per diloco worker in the run training script

1 parent a53b294

File tree

2 files changed: +28 -13 lines


README.md — 2 additions & 2 deletions

@@ -127,7 +127,7 @@ torchrun --nproc_per_node=8 \
 ## 150m on 8 DiLoCo Worker with 500 local steps
 In the `open_diloco` folder, run:
 ```bash
-./run_training.sh 8 $PEER \
+./run_training.sh 8 1 $PEER \
  --sharding-strategy NO_SHARD \
  --per-device-train-batch-size 8 \
  --precision bf16-mixed \
@@ -149,7 +149,7 @@ under the hood the `run_training.sh` script calls `train_fsdp.py` 8 times with t
 ## 150m on 8 DiLoCo Worker with 50 local steps
 In the `open_diloco` folder, run:
 ```bash
-./run_training.sh 8 $PEER \
+./run_training.sh 8 1 $PEER \
  --sharding-strategy NO_SHARD \
  --per-device-train-batch-size 8 \
  --total-batch-size 512 \
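
With this change, `run_training.sh` takes the number of GPUs per DiLoCo worker as its new second argument, so `8 1 $PEER` reproduces the previous behaviour of 8 single-GPU workers. As a minimal sketch of the multi-GPU variant on the same 8-GPU machine (reusing the flags from the README command above; `$PEER` is the initial peer multiaddress as before):

```bash
# 4 DiLoCo workers, each training DDP/FSDP-wise across 2 GPUs (8 GPUs total)
./run_training.sh 4 2 $PEER \
	--sharding-strategy NO_SHARD \
	--per-device-train-batch-size 8 \
	--precision bf16-mixed
```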

open_diloco/run_training.sh — 26 additions & 11 deletions

@@ -1,33 +1,48 @@
 #!/bin/bash

 ## example usage
-# ./run_training.sh 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
+# ./run_training.sh 4 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
 # note that everything after the initial peer will be passed to all workers
+#
+## the command above will use a total of 8 GPUs and create 4 DiLoCo workers, each training DDP/FSDP-wise on two GPUs

-# Check if at least two arguments were passed
-if [ "$#" -lt 2 ]; then
-    echo "Usage: $0 <N> <initial_peer> [additional_python_args]"
+# Function to get CUDA devices based on the number of GPUs per worker and the worker index
+function get_cuda_devices() {
+    local num_gpu=$1
+    local index=$2
+    local start_gpu=$((num_gpu * index))
+    local end_gpu=$((start_gpu + num_gpu - 1))
+
+    if [ "$num_gpu" -eq 1 ]; then
+        echo $start_gpu
+    else
+        echo $(seq -s ',' $start_gpu $end_gpu)
+    fi
+}
+
+# Check if at least three arguments were passed
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <N> <num_gpu> <initial_peer> [additional_python_args]"
     exit 1
 fi

 N=$1 # Set N from the first argument
-INITIAL_PEER=$2 # Set INITIAL_PEER from the second argument
-shift 2 # Remove the first two arguments so $@ contains only additional Python arguments
+NUM_GPU=$2 # Set NUM_GPU (GPUs per worker) from the second argument
+INITIAL_PEER=$3 # Set INITIAL_PEER from the third argument
+shift 3 # Remove the first three arguments so $@ contains only additional Python arguments

 # Ensure the logs directory exists
 mkdir -p logs

-# Execute the command for the first device (CUDA_VISIBLE_DEVICES=0) and log the output, run in background
-echo "Command: CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --initial-peers $INITIAL_PEER $@ "
-CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 &
+# Execute the command for the first worker and log the output, run in background
+CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU 0) torchrun --nproc_per_node=$NUM_GPU --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 &
 # Wait for 2 seconds before continuing with the rest
 sleep 2

 # Loop from 1 to N-1 and execute the command with different CUDA_VISIBLE_DEVICES values, logging each command's output, run each in background
 for i in $(seq 1 $(($N - 1)))
 do
-    echo "Command: CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py.py --initial-peers $INITIAL_PEER $@"
-    WANDB_MODE=disabled CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
+    CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) torchrun --nproc_per_node=$NUM_GPU --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
 done

 tail -f logs/log0
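
To make the device assignment concrete, here is a small standalone sketch (not part of the commit) that copies `get_cuda_devices` out of the script and prints the mapping produced by the example invocation above (`N=4` workers, `NUM_GPU=2`):

```bash
#!/bin/bash
# Copy of get_cuda_devices from run_training.sh: worker $index gets the
# contiguous device block [num_gpu*index, num_gpu*index + num_gpu - 1].
function get_cuda_devices() {
    local num_gpu=$1
    local index=$2
    local start_gpu=$((num_gpu * index))
    local end_gpu=$((start_gpu + num_gpu - 1))

    if [ "$num_gpu" -eq 1 ]; then
        echo $start_gpu
    else
        echo $(seq -s ',' $start_gpu $end_gpu)
    fi
}

N=4; NUM_GPU=2
for i in $(seq 0 $(($N - 1))); do
    echo "worker $i -> CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i)"
done
# worker 0 -> CUDA_VISIBLE_DEVICES=0,1
# worker 1 -> CUDA_VISIBLE_DEVICES=2,3
# worker 2 -> CUDA_VISIBLE_DEVICES=4,5
# worker 3 -> CUDA_VISIBLE_DEVICES=6,7
```

Each worker thus receives a contiguous, non-overlapping block of device indices, and `torchrun --nproc_per_node=$NUM_GPU` launches one process per GPU in that block; workers after rank 0 also get a distinct `--rdzv-endpoint localhost:123$i` so their torchrun rendezvous ports do not collide.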
