Skip to content

Commit 1b3ea8a

Browse files
committed
fix training script
1 parent eaae47a commit 1b3ea8a

File tree

1 file changed

+1
-1
lines changed

1 file changed

+1
-1
lines changed

open_diloco/run_training.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ sleep 2
66 66
# Loop from 1 to N-1 and execute the command with different CUDA_VISIBLE_DEVICES and seed values, logging each command's output, run each in background
67 67
for i in $(seq 1 $(($N - 1)))
68 68
do
69-
CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) torchrun --nproc_per_node=$NUM_GPU --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
69+
WANDB_MODE=disabled CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) torchrun --nproc_per_node=$NUM_GPU --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
70 70
done
71 71

72 72
tail -f logs/log0

0 commit comments

Comments (0)