File tree Expand file tree Collapse file tree 1 file changed +24
-0
lines changed Expand file tree Collapse file tree 1 file changed +24
-0
lines changed Original file line number Diff line number Diff line change 6
6
#
7
7
# # The command above uses a total of 8 GPUs and creates 4 DiLoCo workers, each of them training on two GPUs (DDP/FSDP-wise).
8
8
9
+
10
+ # You can either pass a fixed initial peer or set it to `auto`, in which case the script starts a DHT server for you.
11
+ # # # ./run_training.sh 2 1 auto --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
12
+
13
+
9
14
# Function to get CUDA devices based on the number of GPUs and index
10
15
function get_cuda_devices() {
11
16
local num_gpu=$1
@@ -31,6 +36,25 @@ NUM_GPU=$2
31
36
INITIAL_PEER=$3 # Set INITIAL_PEER from the third argument (a peer address, or 'auto')
32
37
shift 3 # Remove the first three arguments so $@ contains only additional Python arguments
33
38
39
#######################################
# Print the peer address announced in a hivemind-dht log file.
# Arguments: $1 - path to the DHT server log
# Outputs:   the last field of the FIRST line containing
#            "Running a DHT instance"; nothing if the file is missing,
#            unreadable, or the line has not been written yet.
# Returns:   0 in all of the cases above, so callers can simply poll.
#######################################
extract_initial_peer() {
  local log_file=$1
  # The log may not exist yet right after the server is launched.
  [ -r "$log_file" ] || return 0
  # 'exit' after the first match keeps the result a single address even if
  # the banner is logged more than once.
  awk '/Running a DHT instance/ {print $NF; exit}' "$log_file"
}

# The DHT log is written below, so the directory must exist first.
mkdir -p logs
echo " Initial peer: $INITIAL_PEER "

# Check if INITIAL_PEER is set to 'auto' and adjust accordingly
if [ "$INITIAL_PEER" = "auto" ]; then
  # start the dht server in the background; all of its output goes to the log
  echo " Starting DHT server"
  hivemind-dht --host_maddr /ip4/0.0.0.0/tcp/12345 --identity_path fixed_key.pem > logs/log_dht 2>&1 &
  dht_pid=$!

  INITIAL_PEER=""
  # get the initial peer from the logs, loop until the peer is found
  while [ -z "$INITIAL_PEER" ]; do
    sleep 1
    INITIAL_PEER=$(extract_initial_peer logs/log_dht)

    # Without this check a DHT server that failed to start (binary missing,
    # port already in use, ...) would leave the script polling forever.
    if [ -z "$INITIAL_PEER" ] && ! kill -0 "$dht_pid" 2>/dev/null; then
      echo "DHT server exited before announcing its address; see logs/log_dht" >&2
      exit 1
    fi
  done
fi
echo " Initial peer: $INITIAL_PEER "
57
+
34
58
# Ensure the logs directory exists
35
59
mkdir -p logs
36
60
You can’t perform that action at this time.
0 commit comments