-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup_multi_host_ssh.sh
More file actions
executable file
·111 lines (94 loc) · 3.12 KB
/
setup_multi_host_ssh.sh
File metadata and controls
executable file
·111 lines (94 loc) · 3.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/bash
# Setup SSH between TPU hosts for multi-host training
# Run this on worker-0
set -e

# Opening banner: a title framed by two identical rule lines.
banner_rule="======================================================================"
printf '%s\n' "$banner_rule" "Setting up SSH for multi-host TPU pod" "$banner_rule"

# Step 1: Check current host
# The local hostname is stored in CURRENT_HOST; later sections of the script
# compare candidate worker names against it and print it in the summary.
CURRENT_HOST="$(hostname)"
printf 'Current host: %s\n' "$CURRENT_HOST"
# Step 2: Generate SSH key if doesn't exist
# An existing private key at ~/.ssh/id_rsa is left untouched; otherwise a new
# RSA key pair with an empty passphrase is created quietly at that path.
if [ -f ~/.ssh/id_rsa ]; then
  echo "✓ SSH key already exists"
else
  echo "Generating SSH key..."
  ssh-keygen -q -t rsa -f ~/.ssh/id_rsa -N ""
  echo "✓ SSH key generated"
fi
# Step 3: Add to authorized_keys
# Appends this host's public key to its own ~/.ssh/authorized_keys unless the
# key is already listed there.
pubkey_file=~/.ssh/id_rsa.pub
authorized_keys_file=~/.ssh/authorized_keys

# Without a usable public key the grep below would search for an empty
# pattern, which matches every line and would falsely report the key as
# already present. Fail loudly instead.
if [ ! -s "$pubkey_file" ]; then
  echo "✗ Public key missing or empty: $pubkey_file" >&2
  exit 1
fi
pubkey="$(cat "$pubkey_file")"

# -F compares the key as a literal string rather than as a regular expression;
# stderr is silenced because authorized_keys may legitimately not exist yet.
if grep -qF -- "$pubkey" "$authorized_keys_file" 2>/dev/null; then
  echo "✓ Key already in authorized_keys"
else
  # If the existing file does not end with a newline, terminate its last line
  # first so the new key is not glued onto the previous entry.
  if [ -s "$authorized_keys_file" ] && [ -n "$(tail -c 1 "$authorized_keys_file")" ]; then
    echo "" >> "$authorized_keys_file"
  fi
  printf '%s\n' "$pubkey" >> "$authorized_keys_file"
  chmod 600 "$authorized_keys_file"
  echo "✓ Added key to authorized_keys"
fi
# Step 4: Detect other workers
# Derives the pod's base name from this host's name, then pings each candidate
# sibling "<base>-w-<i>" and collects the ones that answer in WORKERS.
# Exits 0 when no other worker responds.
echo ""
echo "Detecting other workers in the pod..."

# Use the hostname recorded in Step 1; query it again only if it is not set.
CURRENT_HOST="${CURRENT_HOST:-$(hostname)}"

# Number of worker indices to probe (0 .. MAX_WORKERS-1). Defaults to 4, and
# can be raised through the environment for larger pods.
MAX_WORKERS="${MAX_WORKERS:-4}"
if [[ "$MAX_WORKERS" =~ ^[0-9]+$ ]]; then
  # Force base 10 so a value such as "08" is not parsed as invalid octal.
  MAX_WORKERS=$((10#$MAX_WORKERS))
else
  echo "Warning: invalid MAX_WORKERS='$MAX_WORKERS'; using 4" >&2
  MAX_WORKERS=4
fi

# TPU VMs typically have workers named like:
# t1v-n-XXXXX-w-0, t1v-n-XXXXX-w-1, etc.
# Parse with a shell regex rather than a grep pipeline: under `set -e` a
# non-matching grep inside an assignment would abort the script silently
# before the "single-host" message below could ever be printed.
worker_suffix_re='^(.*)-w-([0-9]*)$'
if [[ "$CURRENT_HOST" =~ $worker_suffix_re ]]; then
  BASE_NAME="${BASH_REMATCH[1]}"
  CURRENT_INDEX="${BASH_REMATCH[2]}"
else
  BASE_NAME="$CURRENT_HOST"
  CURRENT_INDEX=""
  echo "Warning: hostname '$CURRENT_HOST' does not end in '-w-<index>'" >&2
fi
echo " Base name: $BASE_NAME"
echo " Current worker index: $CURRENT_INDEX"

# Find other workers
echo ""
echo "Other workers in this pod:"
WORKERS=()
for ((i = 0; i < MAX_WORKERS; i++)); do
  WORKER_NAME="${BASE_NAME}-w-${i}"
  # Skip ourselves.
  if [ "$WORKER_NAME" = "$CURRENT_HOST" ]; then
    continue
  fi
  # Try to ping the worker (one packet, at most two seconds).
  if timeout 2 ping -c 1 "$WORKER_NAME" &> /dev/null; then
    echo " ✓ Found: $WORKER_NAME"
    WORKERS+=("$WORKER_NAME")
  fi
done

if [ "${#WORKERS[@]}" -eq 0 ]; then
  echo ""
  echo "No other workers found. This might be a single-host TPU."
  echo "Multi-host setup not needed."
  exit 0
fi
# Step 5: Copy SSH key to other workers
# For every host in WORKERS: install the local public key with ssh-copy-id,
# then verify that a non-interactive SSH login works. Failures are reported
# with a manual follow-up hint and do not stop the loop.
echo ""
echo "Setting up SSH to other workers..."
for WORKER in "${WORKERS[@]}"; do
  echo ""
  echo "Setting up SSH to $WORKER..."

  # First, try to copy the SSH key
  # This will ask for password once
  # NOTE(review): StrictHostKeyChecking=no accepts unknown host keys without
  # confirmation; acceptable only on a trusted internal network.
  if ssh-copy-id -i ~/.ssh/id_rsa.pub -o StrictHostKeyChecking=no "$WORKER" 2>/dev/null; then
    echo " ✓ SSH key copied to $WORKER"
  else
    echo " ! Could not copy key automatically"
    echo " Please run manually:"
    echo " ssh-copy-id $WORKER"
  fi

  # Test SSH
  # BatchMode=yes forbids password prompts, so this only succeeds when key
  # authentication works; ConnectTimeout keeps an unresponsive sshd from
  # stalling the script.
  if ssh -o StrictHostKeyChecking=no -o BatchMode=yes -o ConnectTimeout=10 \
    "$WORKER" "echo 'SSH test successful'" 2>/dev/null; then
    echo " ✓ SSH working to $WORKER"
  else
    echo " ! SSH not working yet to $WORKER"
    echo " You may need to accept the host key first:"
    echo " ssh $WORKER"
  fi
done
# Final report: a framed completion banner, the list of pod members (this host
# first, then every entry of WORKERS), and suggested follow-up commands.
summary_rule="======================================================================"
printf '%s\n' "" "$summary_rule" "Setup complete!" "$summary_rule" ""

printf '%s\n' "Workers in your pod:" " - $CURRENT_HOST (current)"
for peer in "${WORKERS[@]}"; do
  printf '%s\n' " - $peer"
done

printf '%s\n' \
  "" \
  "Next steps:" \
  " 1. Test SSH: ssh ${WORKERS[0]}" \
  " 2. Run multi-host test: ./run_multi_host_local.sh python3 multi_host_test.py" \
  ""