Skip to content

Commit 3e1fa45

Browse files
committed
allow training script to automatically start the dht server
1 parent f2be2f7 commit 3e1fa45

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

open_diloco/run_training.sh

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
#
77
## The command above uses a total of 8 GPUs and creates 4 DiLoCo workers, each of them training on two GPUs with DDP/FSDP.
88

9+
10+
# You can either pass a fixed initial peer, or set it to "auto" and the script will start a DHT server for you:
11+
## ./run_training.sh 2 1 auto --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
12+
13+
914
# Function to get CUDA devices based on the number of GPUs and index
1015
function get_cuda_devices() {
1116
local num_gpu=$1
@@ -31,6 +36,25 @@ NUM_GPU=$2
3136
INITIAL_PEER=$3 # Set INITIAL_PEER from the third argument
3237
shift 3 # Remove the first three arguments so $@ contains only additional Python arguments
3338

39+
# Ensure the logs directory exists before anything is redirected into it.
mkdir -p logs
echo "Initial peer: $INITIAL_PEER"

# Print the last field of the first "Running a DHT instance" line found in the
# log file given as $1. Prints nothing when the file is not readable yet or
# when no such line has been written.
function get_dht_initial_peer() {
    local log_file=$1
    [ -r "$log_file" ] || return 0
    # 'exit' after the first match keeps the result to a single value even if
    # the line shows up more than once in the log.
    awk '/Running a DHT instance/ {print $NF; exit}' "$log_file"
}

# Check if INITIAL_PEER is set to 'auto' and adjust accordingly
if [ "$INITIAL_PEER" = "auto" ]; then
    # start the dht server in the background, logging to logs/log_dht
    echo "Starting DHT server"
    hivemind-dht --host_maddr /ip4/0.0.0.0/tcp/12345 --identity_path fixed_key.pem > logs/log_dht 2>&1 &
    dht_pid=$!

    # Upper bound, in seconds, on how long to wait for the server to report
    # its address; can be overridden from the environment.
    dht_start_timeout=${DHT_START_TIMEOUT:-120}
    dht_waited=0

    INITIAL_PEER=""
    # get the initial peer from the logs, loop until the peer is found
    while [ -z "$INITIAL_PEER" ]; do
        # Stop instead of spinning forever when the server is not running
        # (missing binary, port already in use, crash on startup, ...).
        if ! kill -0 "$dht_pid" 2>/dev/null; then
            echo "DHT server exited before reporting its address, see logs/log_dht" >&2
            exit 1
        fi
        if [ "$dht_waited" -ge "$dht_start_timeout" ]; then
            echo "Timed out after ${dht_start_timeout}s waiting for the DHT server, see logs/log_dht" >&2
            kill "$dht_pid" 2>/dev/null
            exit 1
        fi
        sleep 1
        dht_waited=$((dht_waited + 1))
        INITIAL_PEER=$(get_dht_initial_peer logs/log_dht)
    done
fi
echo "Initial peer: $INITIAL_PEER"
3660

0 commit comments

Comments
 (0)