
Commit fdb1acd

support multi gpu per diloco worker in the run training script

1 parent a53b294

File tree

2 files changed: +28 -13 lines


README.md — 2 additions & 2 deletions

@@ -127,7 +127,7 @@ torchrun --nproc_per_node=8 \
 ## 150m on 8 DiLoCo Worker with 500 local steps
 In the `open_diloco` folder, run:
 ```bash
-./run_training.sh 8 $PEER \
+./run_training.sh 8 1 $PEER \
  --sharding-strategy NO_SHARD \
  --per-device-train-batch-size 8 \
  --precision bf16-mixed \
@@ -149,7 +149,7 @@ under the hood the `run_training.sh` script calls `train_fsdp.py` 8 times with t
 ## 150m on 8 DiLoCo Worker with 50 local steps
 In the `open_diloco` folder, run:
 ```bash
-./run_training.sh 8 $PEER \
+./run_training.sh 8 1 $PEER \
  --sharding-strategy NO_SHARD \
  --per-device-train-batch-size 8 \
  --total-batch-size 512 \
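
With this change, `run_training.sh` takes the number of GPUs per DiLoCo worker as its new second argument, so `8 1 $PEER` reproduces the previous behaviour of 8 single-GPU workers. As a minimal sketch of the multi-GPU variant on the same 8-GPU machine (reusing the flags from the README command above; `$PEER` is the initial peer multiaddress as before):

```bash
# 4 DiLoCo workers, each training DDP/FSDP-wise across 2 GPUs (8 GPUs total)
./run_training.sh 4 2 $PEER \
	--sharding-strategy NO_SHARD \
	--per-device-train-batch-size 8 \
	--precision bf16-mixed
```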

open_diloco/run_training.sh — 26 additions & 11 deletions

@@ -1,33 +1,48 @@
 #!/bin/bash

 ## example usage
-# ./run_training.sh 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
+# ./run_training.sh 4 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
 # note that everything after the initial peer will be passed to all workers
+#
+## the command above will use a total of 8 GPUs and create 4 DiLoCo workers, each training DDP/FSDP-wise on two GPUs

-# Check if at least two arguments were passed
-if [ "$#" -lt 2 ]; then
-    echo "Usage: $0 <N> <initial_peer> [additional_python_args]"
+# Function to get CUDA devices based on the number of GPUs per worker and the worker index
+function get_cuda_devices() {
+    local num_gpu=$1
+    local index=$2
+    local start_gpu=$((num_gpu * index))
+    local end_gpu=$((start_gpu + num_gpu - 1))
+
+    if [ "$num_gpu" -eq 1 ]; then
+        echo $start_gpu
+    else
+        echo $(seq -s ',' $start_gpu $end_gpu)
+    fi
+}
+
+# Check if at least three arguments were passed
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <N> <num_gpu> <initial_peer> [additional_python_args]"
     exit 1
 fi

 N=$1 # Set N from the first argument
-INITIAL_PEER=$2 # Set INITIAL_PEER from the second argument
-shift 2 # Remove the first two arguments so $@ contains only additional Python arguments
+NUM_GPU=$2 # Set NUM_GPU (GPUs per worker) from the second argument
+INITIAL_PEER=$3 # Set INITIAL_PEER from the third argument
+shift 3 # Remove the first three arguments so $@ contains only additional Python arguments

 # Ensure the logs directory exists
 mkdir -p logs

-# Execute the command for the first device (CUDA_VISIBLE_DEVICES=0) and log the output, run in background
-echo "Command: CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --initial-peers $INITIAL_PEER $@ "
-CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 &
+# Execute the command for the first worker and log the output, run in background
+CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU 0) torchrun --nproc_per_node=$NUM_GPU --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 &
 # Wait for 2 seconds before continuing with the rest
 sleep 2

 # Loop from 1 to N-1 and execute the command with different CUDA_VISIBLE_DEVICES values, logging each command's output, run each in background
 for i in $(seq 1 $(($N - 1)))
 do
-    echo "Command: CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py.py --initial-peers $INITIAL_PEER $@"
-    WANDB_MODE=disabled CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
+    CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) torchrun --nproc_per_node=$NUM_GPU --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
 done

 tail -f logs/log0
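
To make the device assignment concrete, here is a small standalone sketch (not part of the commit) that copies `get_cuda_devices` out of the script and prints the mapping produced by the example invocation above (`N=4` workers, `NUM_GPU=2`):

```bash
#!/bin/bash
# Copy of get_cuda_devices from run_training.sh: worker $index gets the
# contiguous device block [num_gpu*index, num_gpu*index + num_gpu - 1].
function get_cuda_devices() {
    local num_gpu=$1
    local index=$2
    local start_gpu=$((num_gpu * index))
    local end_gpu=$((start_gpu + num_gpu - 1))

    if [ "$num_gpu" -eq 1 ]; then
        echo $start_gpu
    else
        echo $(seq -s ',' $start_gpu $end_gpu)
    fi
}

N=4; NUM_GPU=2
for i in $(seq 0 $(($N - 1))); do
    echo "worker $i -> CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i)"
done
# worker 0 -> CUDA_VISIBLE_DEVICES=0,1
# worker 1 -> CUDA_VISIBLE_DEVICES=2,3
# worker 2 -> CUDA_VISIBLE_DEVICES=4,5
# worker 3 -> CUDA_VISIBLE_DEVICES=6,7
```

Each worker thus receives a contiguous, non-overlapping block of device indices, and `torchrun --nproc_per_node=$NUM_GPU` launches one process per GPU in that block; workers after rank 0 also get a distinct `--rdzv-endpoint localhost:123$i` so their torchrun rendezvous ports do not collide.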
