|
1 | 1 | #!/bin/bash
|
2 | 2 |
|
3 | 3 | ## example usage
|
4 |
# ./run_training.sh 4 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
# note that everything after the initial peer will be passed to all workers
|
#
## the command above will use a total of 8 GPUs and create 4 diloco workers, each of them training ddp/fsdp-wise with two GPUs
6 | 8 |
|
7 |
| -# Check if at least two arguments were passed |
8 |
| -if [ "$#" -lt 2 ]; then |
9 |
| - echo "Usage: $0 <N> <initial_peer> [additional_python_args]" |
| 9 | +# Function to get CUDA devices based on the number of GPUs and index |
| 10 | +function get_cuda_devices() { |
| 11 | + local num_gpu=$1 |
| 12 | + local index=$2 |
| 13 | + local start_gpu=$((num_gpu * index)) |
| 14 | + local end_gpu=$((start_gpu + num_gpu - 1)) |
| 15 | + |
| 16 | + if [ "$num_gpu" -eq 1 ]; then |
| 17 | + echo $start_gpu |
| 18 | + else |
| 19 | + echo $(seq -s ',' $start_gpu $end_gpu) |
| 20 | + fi |
| 21 | +} |
| 22 | + |
| 23 | +# Check if at least three arguments were passed |
| 24 | +if [ "$#" -lt 3 ]; then |
| 25 | + echo "Usage: $0 <N> <initial_peer> <num_gpu> [additional_python_args]" |
10 | 26 | exit 1
|
11 | 27 | fi
|
12 | 28 |
|
13 | 29 | N=$1 # Set N from the first argument
|
14 |
| -INITIAL_PEER=$2 # Set INITIAL_PEER from the second argument |
15 |
| -shift 2 # Remove the first two arguments so $@ contains only additional Python arguments |
| 30 | +NUM_GPU=$2 |
| 31 | +INITIAL_PEER=$3 # Set INITIAL_PEER from the second argument |
| 32 | +shift 3 # Remove the first three arguments so $@ contains only additional Python arguments |
16 | 33 |
|
17 | 34 | # Ensure the logs directory exists
|
18 | 35 | mkdir -p logs
|
19 | 36 |
|
20 |
| -# Execute the command for the first device (CUDA_VISIBLE_DEVICES=0) and log the output, run in background |
21 |
| -echo "Command: CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --initial-peers $INITIAL_PEER $@ " |
22 |
| -CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 & |
| 37 | +# Execute the command for the first device and log the output, run in background |
| 38 | +CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU 0) torchrun --nproc_per_node=$NUM_GPU --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 & |
23 | 39 | # Wait for 1 second before continuing with the rest
|
24 | 40 | sleep 2
|
25 | 41 |
|
26 | 42 | # Loop from 1 to N-1 and execute the command with different CUDA_VISIBLE_DEVICES and seed values, logging each command's output, run each in background
|
27 | 43 | for i in $(seq 1 $(($N - 1)))
|
28 | 44 | do
|
29 |
| - echo "Command: CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py.py --initial-peers $INITIAL_PEER $@" |
30 |
| - WANDB_MODE=disabled CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 & |
| 45 | + CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) torchrun --nproc_per_node=$NUM_GPU --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 & |
31 | 46 | done
|
32 | 47 |
|
33 | 48 | tail -f logs/log0
|